author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 19:33:14 +0000
commit     36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree       105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/dav1d/src
parent     Initial commit. (diff)
download   firefox-esr-upstream.tar.xz
           firefox-esr-upstream.zip
Adding upstream version 115.7.0esr. (upstream/115.7.0esr, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src')
-rw-r--r--  third_party/dav1d/src/arm/32/cdef.S | 540
-rw-r--r--  third_party/dav1d/src/arm/32/cdef16.S | 233
-rw-r--r--  third_party/dav1d/src/arm/32/cdef_tmpl.S | 515
-rw-r--r--  third_party/dav1d/src/arm/32/filmgrain.S | 2039
-rw-r--r--  third_party/dav1d/src/arm/32/filmgrain16.S | 2137
-rw-r--r--  third_party/dav1d/src/arm/32/ipred.S | 2937
-rw-r--r--  third_party/dav1d/src/arm/32/ipred16.S | 3254
-rw-r--r--  third_party/dav1d/src/arm/32/itx.S | 3343
-rw-r--r--  third_party/dav1d/src/arm/32/itx16.S | 3625
-rw-r--r--  third_party/dav1d/src/arm/32/loopfilter.S | 868
-rw-r--r--  third_party/dav1d/src/arm/32/loopfilter16.S | 859
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration.S | 791
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration16.S | 801
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration_common.S | 453
-rw-r--r--  third_party/dav1d/src/arm/32/looprestoration_tmpl.S | 600
-rw-r--r--  third_party/dav1d/src/arm/32/mc.S | 3340
-rw-r--r--  third_party/dav1d/src/arm/32/mc16.S | 3658
-rw-r--r--  third_party/dav1d/src/arm/32/msac.S | 575
-rw-r--r--  third_party/dav1d/src/arm/32/refmvs.S | 97
-rw-r--r--  third_party/dav1d/src/arm/32/util.S | 184
-rw-r--r--  third_party/dav1d/src/arm/64/cdef.S | 520
-rw-r--r--  third_party/dav1d/src/arm/64/cdef16.S | 229
-rw-r--r--  third_party/dav1d/src/arm/64/cdef_tmpl.S | 511
-rw-r--r--  third_party/dav1d/src/arm/64/filmgrain.S | 2010
-rw-r--r--  third_party/dav1d/src/arm/64/filmgrain16.S | 1997
-rw-r--r--  third_party/dav1d/src/arm/64/ipred.S | 3985
-rw-r--r--  third_party/dav1d/src/arm/64/ipred16.S | 4204
-rw-r--r--  third_party/dav1d/src/arm/64/itx.S | 3270
-rw-r--r--  third_party/dav1d/src/arm/64/itx16.S | 3648
-rw-r--r--  third_party/dav1d/src/arm/64/loopfilter.S | 1129
-rw-r--r--  third_party/dav1d/src/arm/64/loopfilter16.S | 925
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration.S | 1336
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration16.S | 1419
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration_common.S | 432
-rw-r--r--  third_party/dav1d/src/arm/64/looprestoration_tmpl.S | 597
-rw-r--r--  third_party/dav1d/src/arm/64/mc.S | 3310
-rw-r--r--  third_party/dav1d/src/arm/64/mc16.S | 3611
-rw-r--r--  third_party/dav1d/src/arm/64/msac.S | 480
-rw-r--r--  third_party/dav1d/src/arm/64/refmvs.S | 91
-rw-r--r--  third_party/dav1d/src/arm/64/util.S | 229
-rw-r--r--  third_party/dav1d/src/arm/asm-offsets.h | 43
-rw-r--r--  third_party/dav1d/src/arm/asm.S | 291
-rw-r--r--  third_party/dav1d/src/arm/cdef.h | 88
-rw-r--r--  third_party/dav1d/src/arm/cpu.c | 99
-rw-r--r--  third_party/dav1d/src/arm/cpu.h | 37
-rw-r--r--  third_party/dav1d/src/arm/filmgrain.h | 204
-rw-r--r--  third_party/dav1d/src/arm/ipred.h | 222
-rw-r--r--  third_party/dav1d/src/arm/itx.h | 141
-rw-r--r--  third_party/dav1d/src/arm/loopfilter.h | 45
-rw-r--r--  third_party/dav1d/src/arm/looprestoration.h | 265
-rw-r--r--  third_party/dav1d/src/arm/mc.h | 114
-rw-r--r--  third_party/dav1d/src/arm/msac.h | 52
-rw-r--r--  third_party/dav1d/src/arm/refmvs.h | 39
-rw-r--r--  third_party/dav1d/src/cdef.h | 71
-rw-r--r--  third_party/dav1d/src/cdef_apply.h | 39
-rw-r--r--  third_party/dav1d/src/cdef_apply_tmpl.c | 309
-rw-r--r--  third_party/dav1d/src/cdef_tmpl.c | 331
-rw-r--r--  third_party/dav1d/src/cdf.c | 4123
-rw-r--r--  third_party/dav1d/src/cdf.h | 150
-rw-r--r--  third_party/dav1d/src/cpu.c | 101
-rw-r--r--  third_party/dav1d/src/cpu.h | 102
-rw-r--r--  third_party/dav1d/src/ctx.h | 91
-rw-r--r--  third_party/dav1d/src/data.c | 145
-rw-r--r--  third_party/dav1d/src/data.h | 56
-rw-r--r--  third_party/dav1d/src/dav1d.rc.in | 32
-rw-r--r--  third_party/dav1d/src/decode.c | 3910
-rw-r--r--  third_party/dav1d/src/decode.h | 35
-rw-r--r--  third_party/dav1d/src/dequant_tables.c | 229
-rw-r--r--  third_party/dav1d/src/dequant_tables.h | 37
-rw-r--r--  third_party/dav1d/src/env.h | 521
-rw-r--r--  third_party/dav1d/src/ext/x86/x86inc.asm | 1902
-rw-r--r--  third_party/dav1d/src/fg_apply.h | 58
-rw-r--r--  third_party/dav1d/src/fg_apply_tmpl.c | 239
-rw-r--r--  third_party/dav1d/src/filmgrain.h | 84
-rw-r--r--  third_party/dav1d/src/filmgrain_tmpl.c | 441
-rw-r--r--  third_party/dav1d/src/getbits.c | 178
-rw-r--r--  third_party/dav1d/src/getbits.h | 59
-rw-r--r--  third_party/dav1d/src/internal.h | 467
-rw-r--r--  third_party/dav1d/src/intra_edge.c | 165
-rw-r--r--  third_party/dav1d/src/intra_edge.h | 57
-rw-r--r--  third_party/dav1d/src/ipred.h | 94
-rw-r--r--  third_party/dav1d/src/ipred_prepare.h | 108
-rw-r--r--  third_party/dav1d/src/ipred_prepare_tmpl.c | 204
-rw-r--r--  third_party/dav1d/src/ipred_tmpl.c | 771
-rw-r--r--  third_party/dav1d/src/itx.h | 48
-rw-r--r--  third_party/dav1d/src/itx_1d.c | 1034
-rw-r--r--  third_party/dav1d/src/itx_1d.h | 59
-rw-r--r--  third_party/dav1d/src/itx_tmpl.c | 264
-rw-r--r--  third_party/dav1d/src/levels.h | 289
-rw-r--r--  third_party/dav1d/src/lf_apply.h | 48
-rw-r--r--  third_party/dav1d/src/lf_apply_tmpl.c | 466
-rw-r--r--  third_party/dav1d/src/lf_mask.c | 491
-rw-r--r--  third_party/dav1d/src/lf_mask.h | 83
-rw-r--r--  third_party/dav1d/src/lib.c | 802
-rw-r--r--  third_party/dav1d/src/log.c | 57
-rw-r--r--  third_party/dav1d/src/log.h | 47
-rw-r--r--  third_party/dav1d/src/loopfilter.h | 57
-rw-r--r--  third_party/dav1d/src/loopfilter_tmpl.c | 268
-rw-r--r--  third_party/dav1d/src/looprestoration.h | 79
-rw-r--r--  third_party/dav1d/src/looprestoration_tmpl.c | 554
-rw-r--r--  third_party/dav1d/src/lr_apply.h | 47
-rw-r--r--  third_party/dav1d/src/lr_apply_tmpl.c | 201
-rw-r--r--  third_party/dav1d/src/mc.h | 136
-rw-r--r--  third_party/dav1d/src/mc_tmpl.c | 953
-rw-r--r--  third_party/dav1d/src/mem.c | 119
-rw-r--r--  third_party/dav1d/src/mem.h | 103
-rw-r--r--  third_party/dav1d/src/meson.build | 348
-rw-r--r--  third_party/dav1d/src/msac.c | 208
-rw-r--r--  third_party/dav1d/src/msac.h | 108
-rw-r--r--  third_party/dav1d/src/obu.c | 1702
-rw-r--r--  third_party/dav1d/src/obu.h | 36
-rw-r--r--  third_party/dav1d/src/picture.c | 367
-rw-r--r--  third_party/dav1d/src/picture.h | 115
-rw-r--r--  third_party/dav1d/src/ppc/cdef.h | 61
-rw-r--r--  third_party/dav1d/src/ppc/cdef_tmpl.c | 487
-rw-r--r--  third_party/dav1d/src/ppc/cpu.c | 51
-rw-r--r--  third_party/dav1d/src/ppc/cpu.h | 37
-rw-r--r--  third_party/dav1d/src/ppc/dav1d_types.h | 54
-rw-r--r--  third_party/dav1d/src/ppc/looprestoration.h | 48
-rw-r--r--  third_party/dav1d/src/ppc/looprestoration_tmpl.c | 321
-rw-r--r--  third_party/dav1d/src/qm.c | 3148
-rw-r--r--  third_party/dav1d/src/qm.h | 37
-rw-r--r--  third_party/dav1d/src/recon.h | 85
-rw-r--r--  third_party/dav1d/src/recon_tmpl.c | 2202
-rw-r--r--  third_party/dav1d/src/ref.c | 107
-rw-r--r--  third_party/dav1d/src/ref.h | 60
-rw-r--r--  third_party/dav1d/src/refmvs.c | 940
-rw-r--r--  third_party/dav1d/src/refmvs.h | 176
-rw-r--r--  third_party/dav1d/src/scan.c | 299
-rw-r--r--  third_party/dav1d/src/scan.h | 37
-rw-r--r--  third_party/dav1d/src/tables.c | 1013
-rw-r--r--  third_party/dav1d/src/tables.h | 125
-rw-r--r--  third_party/dav1d/src/thread.h | 188
-rw-r--r--  third_party/dav1d/src/thread_data.h | 40
-rw-r--r--  third_party/dav1d/src/thread_task.c | 936
-rw-r--r--  third_party/dav1d/src/thread_task.h | 53
-rw-r--r--  third_party/dav1d/src/warpmv.c | 209
-rw-r--r--  third_party/dav1d/src/warpmv.h | 39
-rw-r--r--  third_party/dav1d/src/wedge.c | 342
-rw-r--r--  third_party/dav1d/src/wedge.h | 41
-rw-r--r--  third_party/dav1d/src/win32/thread.c | 99
-rw-r--r--  third_party/dav1d/src/x86/cdef.h | 87
-rw-r--r--  third_party/dav1d/src/x86/cdef16_avx2.asm | 877
-rw-r--r--  third_party/dav1d/src/x86/cdef16_avx512.asm | 622
-rw-r--r--  third_party/dav1d/src/x86/cdef16_sse.asm | 1033
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx2.asm | 1772
-rw-r--r--  third_party/dav1d/src/x86/cdef_avx512.asm | 860
-rw-r--r--  third_party/dav1d/src/x86/cdef_sse.asm | 1357
-rw-r--r--  third_party/dav1d/src/x86/cpu.c | 100
-rw-r--r--  third_party/dav1d/src/x86/cpu.h | 44
-rw-r--r--  third_party/dav1d/src/x86/cpuid.asm | 55
-rw-r--r--  third_party/dav1d/src/x86/filmgrain.h | 81
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx2.asm | 2248
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_avx512.asm | 932
-rw-r--r--  third_party/dav1d/src/x86/filmgrain16_sse.asm | 3421
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx2.asm | 2107
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_avx512.asm | 813
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_common.asm | 46
-rw-r--r--  third_party/dav1d/src/x86/filmgrain_sse.asm | 3233
-rw-r--r--  third_party/dav1d/src/x86/ipred.h | 151
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx2.asm | 4992
-rw-r--r--  third_party/dav1d/src/x86/ipred16_avx512.asm | 833
-rw-r--r--  third_party/dav1d/src/x86/ipred16_sse.asm | 1923
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx2.asm | 5387
-rw-r--r--  third_party/dav1d/src/x86/ipred_avx512.asm | 1432
-rw-r--r--  third_party/dav1d/src/x86/ipred_sse.asm | 5409
-rw-r--r--  third_party/dav1d/src/x86/itx.h | 363
-rw-r--r--  third_party/dav1d/src/x86/itx16_avx2.asm | 8599
-rw-r--r--  third_party/dav1d/src/x86/itx16_avx512.asm | 4133
-rw-r--r--  third_party/dav1d/src/x86/itx16_sse.asm | 8135
-rw-r--r--  third_party/dav1d/src/x86/itx_avx2.asm | 5542
-rw-r--r--  third_party/dav1d/src/x86/itx_avx512.asm | 7389
-rw-r--r--  third_party/dav1d/src/x86/itx_sse.asm | 6533
-rw-r--r--  third_party/dav1d/src/x86/loopfilter.h | 66
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_avx2.asm | 1161
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_avx512.asm | 912
-rw-r--r--  third_party/dav1d/src/x86/loopfilter16_sse.asm | 1793
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx2.asm | 1569
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_avx512.asm | 1534
-rw-r--r--  third_party/dav1d/src/x86/loopfilter_sse.asm | 2348
-rw-r--r--  third_party/dav1d/src/x86/looprestoration.h | 94
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_avx2.asm | 2540
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_avx512.asm | 2524
-rw-r--r--  third_party/dav1d/src/x86/looprestoration16_sse.asm | 3723
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_avx2.asm | 2237
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_avx512.asm | 2122
-rw-r--r--  third_party/dav1d/src/x86/looprestoration_sse.asm | 3681
-rw-r--r--  third_party/dav1d/src/x86/mc.h | 299
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx2.asm | 5879
-rw-r--r--  third_party/dav1d/src/x86/mc16_avx512.asm | 4858
-rw-r--r--  third_party/dav1d/src/x86/mc16_sse.asm | 8731
-rw-r--r--  third_party/dav1d/src/x86/mc_avx2.asm | 5669
-rw-r--r--  third_party/dav1d/src/x86/mc_avx512.asm | 4538
-rw-r--r--  third_party/dav1d/src/x86/mc_sse.asm | 9599
-rw-r--r--  third_party/dav1d/src/x86/msac.asm | 667
-rw-r--r--  third_party/dav1d/src/x86/msac.h | 75
-rw-r--r--  third_party/dav1d/src/x86/refmvs.asm | 688
-rw-r--r--  third_party/dav1d/src/x86/refmvs.h | 61
198 files changed, 249732 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/32/cdef.S b/third_party/dav1d/src/arm/32/cdef.S
new file mode 100644
index 0000000000..4a0df6eac8
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef.S
@@ -0,0 +1,540 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldrh r12, [\s1, #-2]
+ vldr \n1, [\s1]
+ vdup.16 d4, r12
+ ldrh r12, [\s1, #\w]
+ vmov.16 d4[1], r12
+ ldrh r12, [\s2, #-2]
+ vldr \n2, [\s2]
+ vmov.16 d4[2], r12
+ ldrh r12, [\s2, #\w]
+ vmovl.u8 q0, d0
+ vmov.16 d4[3], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vstr s8, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s10, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s11, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldrh r12, [\s1, #-2]
+ vldr \n1, [\s1]
+ vdup.16 d4, r12
+ ldrh r12, [\s2, #-2]
+ vldr \n2, [\s2]
+ vmovl.u8 q0, d0
+ vmov.16 d4[1], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vstr s8, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s9, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vldr \n1, [\s1]
+ ldrh r12, [\s1, #\w]
+ vldr \n2, [\s2]
+ vdup.16 d4, r12
+ ldrh r12, [\s2, #\w]
+ vmovl.u8 q0, d0
+ vmov.16 d4[1], r12
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s8, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vldr \n1, [\s1]
+ vldr \n2, [\s2]
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\w2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+ vld1.32 {\dst\()[0]}, [\src, :32], \incr
+.else
+ vld1.8 {\dst\()}, [\src, :64], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+// n1 = s0/d0
+// w1 = d0/q0
+// n2 = s4/d2
+// w2 = d2/q1
+.macro padding_func w, stride, n1, w1, n2, w2, align
+function cdef_padding\w\()_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ cmp r7, #0xf // fully edged
+ beq cdef_padding\w\()_edged_8bpc_neon
+ vmov.i16 q3, #0x8000
+ tst r7, #4 // CDEF_HAVE_TOP
+ bne 1f
+ // !CDEF_HAVE_TOP
+ sub r12, r0, #2*(2*\stride+2)
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add r8, r4, r2
+ sub r0, r0, #2*(2*\stride)
+ pad_top_bottom r4, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+
+ // Middle section
+3:
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vld1.16 {d2[]}, [r3, :16]!
+ ldrh r12, [r1, #\w]
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmov.16 d2[1], r12
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s4, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s5, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {d2[]}, [r3, :16]!
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s4, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+ b 3f
+2:
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldrh r12, [r1, #\w]
+ load_n_incr d0, r1, r2, \w
+ vdup.16 d2, r12
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s4, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr d0, r1, r2, \w
+ subs r6, r6, #1
+ vmovl.u8 q0, d0
+ vstr s12, [r0, #-4]
+ vst1.16 {\w1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+
+3:
+ tst r7, #8 // CDEF_HAVE_BOTTOM
+ bne 1f
+ // !CDEF_HAVE_BOTTOM
+ sub r12, r0, #4
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ pop {r4-r8,pc}
+1:
+ // CDEF_HAVE_BOTTOM
+ add r8, r5, r2
+ pad_top_bottom r5, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 1
+endfunc
+.endm
+
+padding_func 8, 16, d0, q0, d2, q1, 128
+padding_func 4, 8, s0, d0, s4, d2, 64
+
+// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+.macro padding_func_edged w, stride, reg, align
+function cdef_padding\w\()_edged_8bpc_neon
+ sub r0, r0, #(2*\stride)
+
+ ldrh r12, [r4, #-2]
+ vldr \reg, [r4]
+ add r8, r4, r2
+ strh r12, [r0, #-2]
+ ldrh r12, [r4, #\w]
+ vstr \reg, [r0]
+ strh r12, [r0, #\w]
+
+ ldrh r12, [r8, #-2]
+ vldr \reg, [r8]
+ strh r12, [r0, #\stride-2]
+ ldrh r12, [r8, #\w]
+ vstr \reg, [r0, #\stride]
+ strh r12, [r0, #\stride+\w]
+ add r0, r0, #2*\stride
+
+0:
+ ldrh r12, [r3], #2
+ vldr \reg, [r1]
+ str r12, [r0, #-2]
+ ldrh r12, [r1, #\w]
+ add r1, r1, r2
+ subs r6, r6, #1
+ vstr \reg, [r0]
+ str r12, [r0, #\w]
+ add r0, r0, #\stride
+ bgt 0b
+
+ ldrh r12, [r5, #-2]
+ vldr \reg, [r5]
+ add r8, r5, r2
+ strh r12, [r0, #-2]
+ ldrh r12, [r5, #\w]
+ vstr \reg, [r0]
+ strh r12, [r0, #\w]
+
+ ldrh r12, [r8, #-2]
+ vldr \reg, [r8]
+ strh r12, [r0, #\stride-2]
+ ldrh r12, [r8, #\w]
+ vstr \reg, [r0, #\stride]
+ strh r12, [r0, #\stride+\w]
+
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+padding_func_edged 8, 16, d0, 64
+padding_func_edged 4, 8, s0, 32
+
+tables
+
+filter 8, 8
+filter 4, 8
+
+find_dir 8
+
+.macro load_px_8 d11, d12, d21, d22, w
+.if \w == 8
+ add r6, r2, r9 // x + off
+ sub r9, r2, r9 // x - off
+ vld1.8 {\d11}, [r6] // p0
+ add r6, r6, #16 // += stride
+ vld1.8 {\d21}, [r9] // p1
+ add r9, r9, #16 // += stride
+ vld1.8 {\d12}, [r6] // p0
+ vld1.8 {\d22}, [r9] // p1
+.else
+ add r6, r2, r9 // x + off
+ sub r9, r2, r9 // x - off
+ vld1.32 {\d11[0]}, [r6] // p0
+ add r6, r6, #8 // += stride
+ vld1.32 {\d21[0]}, [r9] // p1
+ add r9, r9, #8 // += stride
+ vld1.32 {\d11[1]}, [r6] // p0
+ add r6, r6, #8 // += stride
+ vld1.32 {\d21[1]}, [r9] // p1
+ add r9, r9, #8 // += stride
+ vld1.32 {\d12[0]}, [r6] // p0
+ add r6, r6, #8 // += stride
+ vld1.32 {\d22[0]}, [r9] // p1
+ add r9, r9, #8 // += stride
+ vld1.32 {\d12[1]}, [r6] // p0
+ vld1.32 {\d22[1]}, [r9] // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+ vmin.u8 q3, q3, \s1
+ vmax.u8 q4, q4, \s1
+ vmin.u8 q3, q3, \s2
+ vmax.u8 q4, q4, \s2
+.endif
+ vabd.u8 q8, q0, \s1 // abs(diff)
+ vabd.u8 q11, q0, \s2 // abs(diff)
+ vshl.u8 q9, q8, \shift // abs(diff) >> shift
+ vshl.u8 q12, q11, \shift // abs(diff) >> shift
+ vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+ vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+ vcgt.u8 q10, q0, \s1 // px > p0
+ vcgt.u8 q13, q0, \s2 // px > p1
+ vmin.u8 q9, q9, q8 // imin(abs(diff), clip)
+ vmin.u8 q12, q12, q11 // imin(abs(diff), clip)
+ vneg.s8 q8, q9 // -imin()
+ vneg.s8 q11, q12 // -imin()
+ vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip)
+ vdup.8 d18, \tap // taps[k]
+ vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip)
+ vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain()
+ vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_neon
+.if \pri
+ movrel_local r8, pri_taps
+ and r9, r3, #1
+ add r8, r8, r9, lsl #1
+.endif
+ movrel_local r9, directions\w
+ add r5, r9, r5, lsl #1
+ vmov.u8 d17, #7
+ vdup.8 d16, r6 // damping
+
+ vmov.8 d8[0], r3
+ vmov.8 d8[1], r4
+ vclz.i8 d8, d8 // clz(threshold)
+ vsub.i8 d8, d17, d8 // ulog2(threshold)
+ vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
+ vneg.s8 d8, d8 // -shift
+.if \sec
+ vdup.8 q6, d8[1]
+.endif
+.if \pri
+ vdup.8 q5, d8[0]
+.endif
+
+1:
+.if \w == 8
+ add r12, r2, #16
+ vld1.8 {d0}, [r2, :64] // px
+ vld1.8 {d1}, [r12, :64] // px
+.else
+ add r12, r2, #8
+ vld1.32 {d0[0]}, [r2, :32] // px
+ add r9, r2, #2*8
+ vld1.32 {d0[1]}, [r12, :32] // px
+ add r12, r12, #2*8
+ vld1.32 {d1[0]}, [r9, :32] // px
+ vld1.32 {d1[1]}, [r12, :32] // px
+.endif
+
+ vmov.u8 q1, #0 // sum
+ vmov.u8 q2, #0 // sum
+.if \min
+ vmov.u16 q3, q0 // min
+ vmov.u16 q4, q0 // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov lr, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrsb r9, [r5] // off1
+
+ load_px_8 d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+ add r5, r5, #4 // +2*2
+ ldrsb r9, [r5] // off2
+.endif
+
+.if \pri
+ ldrb r12, [r8] // *pri_taps
+ vdup.8 q7, r3 // threshold
+
+ handle_pixel_8 q14, q15, q7, q5, r12, \min
+.endif
+
+.if \sec
+ load_px_8 d28, d29, d30, d31, \w
+
+ add r5, r5, #8 // +2*4
+ ldrsb r9, [r5] // off3
+
+ vdup.8 q7, r4 // threshold
+
+ handle_pixel_8 q14, q15, q7, q6, lr, \min
+
+ load_px_8 d28, d29, d30, d31, \w
+
+ handle_pixel_8 q14, q15, q7, q6, lr, \min
+
+ sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
+ subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
+ add r8, r8, #1 // pri_taps++ (pointer)
+.endif
+ bne 2b
+
+ vshr.s16 q14, q1, #15 // -(sum < 0)
+ vshr.s16 q15, q2, #15 // -(sum < 0)
+ vadd.i16 q1, q1, q14 // sum - (sum < 0)
+ vadd.i16 q2, q2, q15 // sum - (sum < 0)
+ vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
+ vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4
+ vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4
+ vaddw.u8 q2, q2, d1 // px + (8 + sum ...) >> 4
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+.if \min
+ vmin.u8 q0, q0, q4
+ vmax.u8 q0, q0, q3 // iclip(px + .., min, max)
+.endif
+.if \w == 8
+ vst1.8 {d0}, [r0, :64], r1
+ add r2, r2, #2*16 // tmp += 2*tmp_stride
+ subs r7, r7, #2 // h -= 2
+ vst1.8 {d1}, [r0, :64], r1
+.else
+ vst1.32 {d0[0]}, [r0, :32], r1
+ add r2, r2, #4*8 // tmp += 4*tmp_stride
+ vst1.32 {d0[1]}, [r0, :32], r1
+ subs r7, r7, #4 // h -= 4
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub r5, r5, #2
+.if \pri
+ sub r8, r8, #2
+.endif
+
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
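
For readers following the comments rather than the NEON registers: the constrain()/clip formulas quoted in handle_pixel_8 and the final (8 + sum - (sum < 0)) >> 4 rounding at the end of filter_func_8 amount to the scalar C sketch below. The helper names (ulog2, constrain, cdef_pixel) are chosen here purely for illustration; they are not part of this patch.

    #include <stdlib.h>

    /* Scalar sketch of the per-tap math documented in the comments of
     * handle_pixel_8 and the rounding at the end of filter_func_8.
     * Helper names are illustrative only. */

    static int ulog2(unsigned v) {          /* floor(log2(v)), v > 0 */
        int n = 0;
        while (v >>= 1) n++;
        return n;
    }

    /* constrain() = imax(imin(diff, clip), -clip), with
     * clip = imax(0, threshold - (abs(diff) >> shift)) */
    static int constrain(int diff, int threshold, int shift) {
        int clip = threshold - (abs(diff) >> shift);
        if (clip < 0)     clip = 0;
        if (diff >  clip) return  clip;
        if (diff < -clip) return -clip;
        return diff;
    }

    /* px is the centre sample, p[k] the neighbouring taps, taps[k] the
     * weights; shift = imax(0, damping - ulog2(threshold)). */
    static int cdef_pixel(int px, const int *p, const int *taps, int ntaps,
                          int threshold, int damping) {
        int shift = damping - ulog2(threshold);
        if (shift < 0) shift = 0;
        int sum = 0;
        for (int k = 0; k < ntaps; k++)
            sum += taps[k] * constrain(p[k] - px, threshold, shift);
        return px + ((8 + sum - (sum < 0)) >> 4);   /* final rounding */
    }

In the _pri_sec variant the result is additionally clipped to the minimum and maximum of the sampled taps, which is what the extra vmin.u8/vmax.u8 bookkeeping above implements.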
diff --git a/third_party/dav1d/src/arm/32/cdef16.S b/third_party/dav1d/src/arm/32/cdef16.S
new file mode 100644
index 0000000000..d14525d720
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef16.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4]
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s9, [\s1, #2*\w]
+ vldr s10, [\s2, #-4]
+ vld1.16 {\r2}, [\s2, :\align]
+ vldr s11, [\s2, #2*\w]
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s10, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s11, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vldr s8, [\s1, #-4]
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s9, [\s2, #-4]
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s8, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s9, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vldr s8, [\s1, #2*\w]
+ vld1.16 {\r2}, [\s2, :\align]
+ vldr s9, [\s2, #2*\w]
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s8, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s9, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [\s1, :\align]
+ vld1.16 {\r2}, [\s2, :\align]
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ vstr s12, [r0, #-4]
+ vst1.16 {\r2}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+.if \ret
+ pop {r4-r8,pc}
+.else
+ add r0, r0, #2*\stride
+.endif
+3:
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro padding_func_16 w, stride, r1, r2, align
+function cdef_padding\w\()_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ vmov.i16 q3, #0x8000
+ tst r7, #4 // CDEF_HAVE_TOP
+ bne 1f
+ // !CDEF_HAVE_TOP
+ sub r12, r0, #2*(2*\stride+2)
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add r8, r4, r2
+ sub r0, r0, #2*(2*\stride)
+ pad_top_bot_16 r4, r8, \w, \stride, \r1, \r2, \align, 0
+
+ // Middle section
+3:
+ tst r7, #1 // CDEF_HAVE_LEFT
+ beq 2f
+ // CDEF_HAVE_LEFT
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vld1.32 {d2[]}, [r3, :32]!
+ vldr s5, [r1, #2*\w]
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s5, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.32 {d2[]}, [r3, :32]!
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s4, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+ b 3f
+2:
+ tst r7, #2 // CDEF_HAVE_RIGHT
+ beq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ vldr s4, [r1, #2*\w]
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s4, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ vld1.16 {\r1}, [r1, :\align], r2
+ subs r6, r6, #1
+ vstr s12, [r0, #-4]
+ vst1.16 {\r1}, [r0, :\align]
+ vstr s12, [r0, #2*\w]
+ add r0, r0, #2*\stride
+ bgt 1b
+
+3:
+ tst r7, #8 // CDEF_HAVE_BOTTOM
+ bne 1f
+ // !CDEF_HAVE_BOTTOM
+ sub r12, r0, #4
+ vmov.i16 q2, #0x8000
+ vst1.16 {q2,q3}, [r12]!
+.if \w == 8
+ vst1.16 {q2,q3}, [r12]!
+.endif
+ pop {r4-r8,pc}
+1:
+ // CDEF_HAVE_BOTTOM
+ add r8, r5, r2
+ pad_top_bot_16 r5, r8, \w, \stride, \r1, \r2, \align, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q0, q1, 128
+padding_func_16 4, 8, d0, d2, 64
+
+tables
+
+filter 8, 16
+filter 4, 16
+
+find_dir 16
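
Both the 8 bpc padding in cdef.S and this 16 bpc variant build the same intermediate: a uint16_t block with a two-sample border on each side, in which any sample lying beyond an unavailable edge is set to the sentinel 0x8000 that the filters treat as missing. The C below is only a rough sketch of that layout: the real functions take separate src/left/top/bottom pointers, which are folded into a single source plane here, and pad_reference is a made-up name.

    #include <stdint.h>

    /* Edge availability flags, matching the CDEF_HAVE_* bits tested in r7. */
    enum { HAVE_LEFT = 1, HAVE_RIGHT = 2, HAVE_TOP = 4, HAVE_BOTTOM = 8 };

    /* tmp points at the first sample of the w x h block inside a buffer
     * with tmp_stride (8 for w=4, 16 for w=8) and a 2-sample border;
     * src stands in for the separate src/left/top/bottom inputs. */
    static void pad_reference(uint16_t *tmp, int tmp_stride,
                              const uint16_t *src, int src_stride,
                              int w, int h, int edges) {
        for (int y = -2; y < h + 2; y++)
            for (int x = -2; x < w + 2; x++) {
                int avail = (x >= 0 || (edges & HAVE_LEFT))  &&
                            (x <  w || (edges & HAVE_RIGHT)) &&
                            (y >= 0 || (edges & HAVE_TOP))   &&
                            (y <  h || (edges & HAVE_BOTTOM));
                tmp[y * tmp_stride + x] =
                    avail ? src[y * src_stride + x] : 0x8000;
            }
    }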
diff --git a/third_party/dav1d/src/arm/32/cdef_tmpl.S b/third_party/dav1d/src/arm/32/cdef_tmpl.S
new file mode 100644
index 0000000000..33ff9e5816
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/cdef_tmpl.S
@@ -0,0 +1,515 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro dir_table w, stride
+const directions\w
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+ .byte 1 * \stride + 0, 2 * \stride + 0
+ .byte 1 * \stride + 0, 2 * \stride - 1
+// Repeated, to avoid & 7
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+endconst
+.endm
+
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+ .byte 4, 2, 3, 3
+endconst
+.endm
+
+.macro load_px d11, d12, d21, d22, w
+.if \w == 8
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11,\d12}, [r6] // p0
+ vld1.16 {\d21,\d22}, [r9] // p1
+.else
+ add r6, r2, r9, lsl #1 // x + off
+ sub r9, r2, r9, lsl #1 // x - off
+ vld1.16 {\d11}, [r6] // p0
+ add r6, r6, #2*8 // += stride
+ vld1.16 {\d21}, [r9] // p1
+ add r9, r9, #2*8 // += stride
+ vld1.16 {\d12}, [r6] // p0
+ vld1.16 {\d22}, [r9] // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+ vmin.u16 q2, q2, \s1
+ vmax.s16 q3, q3, \s1
+ vmin.u16 q2, q2, \s2
+ vmax.s16 q3, q3, \s2
+.endif
+ vabd.u16 q8, q0, \s1 // abs(diff)
+ vabd.u16 q11, q0, \s2 // abs(diff)
+ vshl.u16 q9, q8, \shift // abs(diff) >> shift
+ vshl.u16 q12, q11, \shift // abs(diff) >> shift
+ vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+ vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+ vsub.i16 q10, \s1, q0 // diff = p0 - px
+ vsub.i16 q13, \s2, q0 // diff = p1 - px
+ vneg.s16 q8, q9 // -clip
+ vneg.s16 q11, q12 // -clip
+ vmin.s16 q10, q10, q9 // imin(diff, clip)
+ vmin.s16 q13, q13, q12 // imin(diff, clip)
+ vdup.16 q9, \tap // taps[k]
+ vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
+ vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
+ vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
+ vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+ cmp r8, #0xf
+ beq cdef_filter\w\suffix\()_edged_neon
+.endif
+.if \pri
+.if \bpc == 16
+ clz r9, r9
+ sub r9, r9, #24 // -bitdepth_min_8
+ neg r9, r9 // bitdepth_min_8
+.endif
+ movrel_local r8, pri_taps
+.if \bpc == 16
+ lsr r9, r3, r9 // pri_strength >> bitdepth_min_8
+ and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1
+.else
+ and r9, r3, #1
+.endif
+ add r8, r8, r9, lsl #1
+.endif
+ movrel_local r9, directions\w
+ add r5, r9, r5, lsl #1
+ vmov.u16 d17, #15
+ vdup.16 d16, r6 // damping
+
+.if \pri
+ vdup.16 q5, r3 // threshold
+.endif
+.if \sec
+ vdup.16 q7, r4 // threshold
+.endif
+ vmov.16 d8[0], r3
+ vmov.16 d8[1], r4
+ vclz.i16 d8, d8 // clz(threshold)
+ vsub.i16 d8, d17, d8 // ulog2(threshold)
+ vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
+ vneg.s16 d8, d8 // -shift
+.if \sec
+ vdup.16 q6, d8[1]
+.endif
+.if \pri
+ vdup.16 q4, d8[0]
+.endif
+
+1:
+.if \w == 8
+ vld1.16 {q0}, [r2, :128] // px
+.else
+ add r12, r2, #2*8
+ vld1.16 {d0}, [r2, :64] // px
+ vld1.16 {d1}, [r12, :64] // px
+.endif
+
+ vmov.u16 q1, #0 // sum
+.if \min
+ vmov.u16 q2, q0 // min
+ vmov.u16 q3, q0 // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov lr, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrsb r9, [r5] // off1
+
+ load_px d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+ add r5, r5, #4 // +2*2
+ ldrsb r9, [r5] // off2
+.endif
+
+.if \pri
+ ldrb r12, [r8] // *pri_taps
+
+ handle_pixel q14, q15, q5, q4, r12, \min
+.endif
+
+.if \sec
+ load_px d28, d29, d30, d31, \w
+
+ add r5, r5, #8 // +2*4
+ ldrsb r9, [r5] // off3
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ load_px d28, d29, d30, d31, \w
+
+ handle_pixel q14, q15, q7, q6, lr, \min
+
+ sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
+ subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
+ add r8, r8, #1 // pri_taps++ (pointer)
+.endif
+ bne 2b
+
+ vshr.s16 q14, q1, #15 // -(sum < 0)
+ vadd.i16 q1, q1, q14 // sum - (sum < 0)
+ vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
+ vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
+.if \min
+ vmin.s16 q0, q0, q3
+ vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+ vmovn.u16 d0, q0
+.endif
+.if \w == 8
+ add r2, r2, #2*16 // tmp += tmp_stride
+ subs r7, r7, #1 // h--
+.if \bpc == 8
+ vst1.8 {d0}, [r0, :64], r1
+.else
+ vst1.16 {q0}, [r0, :128], r1
+.endif
+.else
+.if \bpc == 8
+ vst1.32 {d0[0]}, [r0, :32], r1
+.else
+ vst1.16 {d0}, [r0, :64], r1
+.endif
+ add r2, r2, #2*16 // tmp += 2*tmp_stride
+ subs r7, r7, #2 // h -= 2
+.if \bpc == 8
+ vst1.32 {d0[1]}, [r0, :32], r1
+.else
+ vst1.16 {d1}, [r0, :64], r1
+.endif
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub r5, r5, #2
+.if \pri
+ sub r8, r8, #2
+.endif
+
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #92]
+ ldrd r6, r7, [sp, #100]
+.if \bpc == 16
+ ldrd r8, r9, [sp, #108]
+.else
+ ldr r8, [sp, #108]
+.endif
+ cmp r3, #0 // pri_strength
+ bne 1f
+ b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+ cmp r4, #0 // sec_strength
+ bne 1f
+ b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
+const div_table, align=4
+ .short 840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact, align=4
+ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+.macro cost_alt dest, s1, s2, s3, s4, s5, s6
+ vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q2, \s2, \s2
+ vmull.s16 q3, \s3, \s3
+ vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n]
+ vmull.s16 q12, \s5, \s5
+ vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here
+ vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact
+ vmla.i32 q1, q2, q14
+ vmla.i32 q1, q3, q15
+ vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact
+ vmla.i32 q5, q12, q14
+ vmla.i32 q5, q6, q15
+ vadd.i32 d2, d2, d3
+ vadd.i32 d3, d10, d11
+ vpadd.i32 \dest, d2, d3 // *cost_ptr
+.endm
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+ vmov.32 lr, \s2
+.endif
+ cmp r12, r1 // cost[n] > best_cost
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, r12 // best_cost = cost[n]
+.ifnb \s2
+ add r3, r3, #1 // n++
+ cmp lr, r1 // cost[n] > best_cost
+ vmov.32 r12, \s3
+ itt gt
+ movgt r0, r3 // best_dir = n
+ movgt r1, lr // best_cost = cost[n]
+ add r3, r3, #1 // n++
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+// unsigned *const var)
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+ push {lr}
+ vpush {q4-q7}
+.if \bpc == 16
+ clz r3, r3 // clz(bitdepth_max)
+ sub lr, r3, #24 // -bitdepth_min_8
+.endif
+ sub sp, sp, #32 // cost
+ mov r3, #8
+ vmov.u16 q1, #0 // q0-q1 sum_diag[0]
+ vmov.u16 q3, #0 // q2-q3 sum_diag[1]
+ vmov.u16 q5, #0 // q4-q5 sum_hv[0-1]
+ vmov.u16 q8, #0 // q6,d16 sum_alt[0]
+ // q7,d17 sum_alt[1]
+ vmov.u16 q9, #0 // q9,d22 sum_alt[2]
+ vmov.u16 q11, #0
+ vmov.u16 q10, #0 // q10,d23 sum_alt[3]
+
+
+.irpc i, 01234567
+.if \bpc == 8
+ vld1.8 {d30}, [r0, :64], r1
+ vmov.u8 d31, #128
+ vsubl.u8 q15, d30, d31 // img[x] - 128
+.else
+ vld1.16 {q15}, [r0, :128], r1
+ vdup.16 q14, lr // -bitdepth_min_8
+ vshl.u16 q15, q15, q14
+ vmov.u16 q14, #128
+ vsub.i16 q15, q15, q14 // img[x] - 128
+.endif
+ vmov.u16 q14, #0
+
+.if \i == 0
+ vmov q0, q15 // sum_diag[0]
+.else
+ vext.8 q12, q14, q15, #(16-2*\i)
+ vext.8 q13, q15, q14, #(16-2*\i)
+ vadd.i16 q0, q0, q12 // sum_diag[0]
+ vadd.i16 q1, q1, q13 // sum_diag[0]
+.endif
+ vrev64.16 q13, q15
+ vswp d26, d27 // [-x]
+.if \i == 0
+ vmov q2, q13 // sum_diag[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q2, q2, q12 // sum_diag[1]
+ vadd.i16 q3, q3, q13 // sum_diag[1]
+.endif
+
+ vpadd.u16 d26, d30, d31 // [(x >> 1)]
+ vmov.u16 d27, #0
+ vpadd.u16 d24, d26, d28
+ vpadd.u16 d24, d24, d28 // [y]
+ vmov.u16 r12, d24[0]
+ vadd.i16 q5, q5, q15 // sum_hv[1]
+.if \i < 4
+ vmov.16 d8[\i], r12 // sum_hv[0]
+.else
+ vmov.16 d9[\i-4], r12 // sum_hv[0]
+.endif
+
+.if \i == 0
+ vmov.u16 q6, q13 // sum_alt[0]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q14, q13, q14, #(16-2*\i)
+ vadd.i16 q6, q6, q12 // sum_alt[0]
+ vadd.i16 d16, d16, d28 // sum_alt[0]
+.endif
+ vrev64.16 d26, d26 // [-(x >> 1)]
+ vmov.u16 q14, #0
+.if \i == 0
+ vmov q7, q13 // sum_alt[1]
+.else
+ vext.8 q12, q14, q13, #(16-2*\i)
+ vext.8 q13, q13, q14, #(16-2*\i)
+ vadd.i16 q7, q7, q12 // sum_alt[1]
+ vadd.i16 d17, d17, d26 // sum_alt[1]
+.endif
+
+.if \i < 6
+ vext.8 q12, q14, q15, #(16-2*(3-(\i/2)))
+ vext.8 q13, q15, q14, #(16-2*(3-(\i/2)))
+ vadd.i16 q9, q9, q12 // sum_alt[2]
+ vadd.i16 d22, d22, d26 // sum_alt[2]
+.else
+ vadd.i16 q9, q9, q15 // sum_alt[2]
+.endif
+.if \i == 0
+ vmov q10, q15 // sum_alt[3]
+.elseif \i == 1
+ vadd.i16 q10, q10, q15 // sum_alt[3]
+.else
+ vext.8 q12, q14, q15, #(16-2*(\i/2))
+ vext.8 q13, q15, q14, #(16-2*(\i/2))
+ vadd.i16 q10, q10, q12 // sum_alt[3]
+ vadd.i16 d23, d23, d26 // sum_alt[3]
+.endif
+.endr
+
+ vmov.u32 q15, #105
+
+ vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0]
+ vmlal.s16 q12, d9, d9
+ vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1]
+ vmlal.s16 q13, d11, d11
+ vadd.s32 d8, d24, d25
+ vadd.s32 d9, d26, d27
+ vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17)
+ vmul.i32 d8, d8, d30 // cost[2,6] *= 105
+
+ vrev64.16 q1, q1
+ vrev64.16 q3, q3
+ vext.8 q1, q1, q1, #10 // sum_diag[0][14-n]
+ vext.8 q3, q3, q3, #10 // sum_diag[1][14-n]
+
+ vstr s16, [sp, #2*4] // cost[2]
+ vstr s17, [sp, #6*4] // cost[6]
+
+ movrel_local r12, div_table
+ vld1.16 {q14}, [r12, :128]
+
+ vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0]
+ vmull.s16 q12, d1, d1
+ vmlal.s16 q5, d2, d2
+ vmlal.s16 q12, d3, d3
+ vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1]
+ vmull.s16 q1, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q1, d7, d7
+ vmovl.u16 q13, d28 // div_table
+ vmovl.u16 q14, d29
+ vmul.i32 q5, q5, q13 // cost[0]
+ vmla.i32 q5, q12, q14
+ vmul.i32 q0, q0, q13 // cost[4]
+ vmla.i32 q0, q1, q14
+ vadd.i32 d10, d10, d11
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1
+
+ movrel_local r12, alt_fact
+ vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
+
+ vstr s0, [sp, #0*4] // cost[0]
+ vstr s1, [sp, #4*4] // cost[4]
+
+ vmovl.u16 q13, d29 // div_table[2*m+1] + 105
+ vmovl.u16 q14, d30
+ vmovl.u16 q15, d31
+
+ cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
+ cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
+ vstr s28, [sp, #1*4] // cost[1]
+ vstr s29, [sp, #3*4] // cost[3]
+
+ mov r0, #0 // best_dir
+ vmov.32 r1, d0[0] // best_cost
+ mov r3, #1 // n
+
+ vstr s30, [sp, #5*4] // cost[5]
+ vstr s31, [sp, #7*4] // cost[7]
+
+ vmov.32 r12, d14[0]
+
+ find_best d14[0], d8[0], d14[1]
+ find_best d14[1], d0[1], d15[0]
+ find_best d15[0], d8[1], d15[1]
+ find_best d15[1]
+
+ eor r3, r0, #4 // best_dir ^4
+ ldr r12, [sp, r3, lsl #2]
+ sub r1, r1, r12 // best_cost - cost[best_dir ^ 4]
+ lsr r1, r1, #10
+ str r1, [r2] // *var
+
+ add sp, sp, #32
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+.endm
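
The tail of cdef_find_dir (from the "mov r0, #0  // best_dir" sequence onward) reduces to the following scalar C, assuming cost[0..7] have already been filled in as the assembly does; pick_dir is an illustrative name, not one defined in this patch.

    #include <stdint.h>

    /* Pick the direction with the largest cost and derive the variance
     * as (best_cost - cost[best_dir ^ 4]) >> 10, as in the comments. */
    static int pick_dir(const uint32_t cost[8], unsigned *var) {
        int best_dir = 0;
        uint32_t best_cost = cost[0];
        for (int n = 1; n < 8; n++)
            if (cost[n] > best_cost) {      /* cost[n] > best_cost */
                best_cost = cost[n];
                best_dir  = n;
            }
        *var = (best_cost - cost[best_dir ^ 4]) >> 10;
        return best_dir;
    }

The reported *var is thus a measure of how much stronger the best direction is than the one orthogonal to it (best_dir ^ 4 is four 22.5-degree steps away).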
diff --git a/third_party/dav1d/src/arm/32/filmgrain.S b/third_party/dav1d/src/arm/32/filmgrain.S
new file mode 100644
index 0000000000..d1f83efb98
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain.S
@@ -0,0 +1,2039 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
+function get_gaussian_neon
+ push {r5-r6,lr}
+ increment_seed 4
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0
+ increment_seed 4
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r5, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r6, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r7, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r8, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r9, q0
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r10, q0
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5, \r6, \r7}, [r0]!
+ vst1.16 {\r8, \r9}, [r0]!
+ vst1.16 {\r10[0]}, [r0]!
+.endm
+
+.macro get_grain_row_44 r0, r1, r2, r3, r4, r5
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r0, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r1, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r2, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r3, q0
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ vmovn.i16 \r4, q0
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ read_rand r12, 11, 0
+ vld1.16 {d0[2]}, [r11]
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 \r5, q0
+.endm
+
+.macro store_grain_row_44 r0, r1, r2, r3, r4, r5
+ vst1.16 {\r0, \r1, \r2, \r3}, [r0]!
+ vst1.16 {\r4, \r5}, [r0]
+ add r0, r0, #GRAIN_WIDTH-32
+.endm
+
+function get_grain_2_neon
+ push {r11,lr}
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ vmovn.i16 d0, q0
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mov lr, #-128
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1
+ sub r9, r9, #1
+ lsl r0, r0, r7
+ lsl lr, lr, r9
+ add r7, r7, #1
+ add r9, r9, #1
+.endif
+1:
+ read_shift_rand r12, 11
+ vmov.32 r11, d2[0]
+ lsl r12, r12, #1
+ vext.8 q0, q0, q0, #1
+ ldrsh r12, [r3, r12]
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.else
+ push {r1-r3}
+ sbfx r1, r4, #0, #8
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mov lr, #-128
+.endif
+ it gt
+ movgt r6, r5
+ cmp r6, lr
+ it lt
+ movlt r6, lr
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1
+ vext.8 q1, q1, q1, #4
+ vmov.8 d1[7], r6
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon
+ vmull.s8 q2, d6, d28
+ vmull.s8 q3, d7, d28
+ vmull.s8 q4, d0, d27
+ vmull.s8 q5, d1, d27
+
+ vaddl.s16 q0, d4, d8
+ vaddl.s16 q2, d5, d9
+ vaddl.s16 q4, d6, d10
+ vaddl.s16 q5, d7, d11
+
+ vmull.s8 q3, d3, d29
+ vmull.s8 q1, d2, d29
+
+ vaddw.s16 q4, q4, d6
+ vaddw.s16 q5, q5, d7
+ vaddw.s16 q3, q2, d3
+ vaddw.s16 q2, q0, d2
+ bx lr
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vpaddl.s8 q6, q6
+ vpaddl.s8 q7, q7
+ vadd.i16 q0, q0, q6
+ vadd.i16 q1, q1, q7
+ vpop {q6-q7}
+ vrshrn.s16 d0, q0, #2
+ vrshrn.s16 d1, q1, #2
+.endif
+.ifc \type, uv_422
+ vld1.8 {q0, q1}, [r11]!
+ vpaddl.s8 q0, q0
+ vpaddl.s8 q1, q1
+ vrshrn.s16 d0, q0, #1
+ vrshrn.s16 d1, q1, #1
+.endif
+.ifc \type, uv_444
+ vld1.8 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff
+.endif
+ vmull.s8 q1, d0, d13
+ vmull.s8 q0, d1, d13
+ vaddw.s16 q2, q2, d2
+ vaddw.s16 q3, q3, d3
+ vaddw.s16 q4, q4, d0
+ vaddw.s16 q5, q5, d1
+.endif
+.if \uv_layout && \elems == 16
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.ifc \edge, left
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vmovn.i16 d1, q0
+ vext.8 q2, q2, q2, #12
+.ifc \lag, lag3
+ vmov.s8 r10, d1[5]
+.endif
+.ifnc \lag, lag1
+ vmov.s8 r8, d1[6]
+.endif
+ vmov.s8 r6, d1[7]
+
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q4
+.if \elems == 9
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vmovn.i16 d2, q1
+ vext.8 q0, q0, q1, #7
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ vmov q1, q5
+
+.ifc \edge, right
+ mov r1, #3
+ bl output_\lag\()_neon
+ read_shift_rand r11, 11
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #1
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ vst1.8 {q0}, [r0]!
+.endif
+ pop {r11}
+ pop {r1, pc}
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+ vmov q3, \mid
+ vext.8 q0, \left, \mid, #15
+ vext.8 q1, \mid, \right, #1
+ bl sum_\type\()_lag1_\edge\()_neon
+ vmov \dst, q0
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+function sum_lag2_above_neon
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH - 16
+ sub lr, r0, #1*GRAIN_WIDTH - 16
+ vld1.8 {q10}, [r12] // load top right
+ vld1.8 {q13}, [lr]
+
+ vext.8 q6, q8, q9, #14 // top left, top mid
+ vdup.8 d14, d28[0]
+ vext.8 q8, q8, q9, #15
+ vdup.8 d15, d28[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q2, d0, d12
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d16
+ vaddl.s16 q5, d3, d17
+
+ vext.8 q6, q9, q10, #1 // top mid, top right
+ vdup.8 d14, d28[3]
+ vext.8 q8, q9, q10, #2
+ vdup.8 d15, d28[4]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q11, q12, #14 // top left, top mid
+ vdup.8 d14, d28[5]
+ vext.8 q8, q11, q12, #15
+ vdup.8 d15, d28[6]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vext.8 q6, q12, q13, #1 // top mid, top right
+ vdup.8 d14, d29[0]
+ vext.8 q8, q12, q13, #2
+ vdup.8 d15, d29[1]
+
+ vmull.s8 q0, d12, d14
+ vmull.s8 q1, d13, d14
+ vmull.s8 q6, d16, d15
+ vmull.s8 q8, d17, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vdup.8 d14, d28[2]
+ vdup.8 d15, d28[7]
+
+ vmull.s8 q0, d18, d14
+ vmull.s8 q1, d19, d14
+ vmull.s8 q6, d24, d15
+ vmull.s8 q8, d25, d15
+
+ vaddl.s16 q7, d0, d12
+ vaddl.s16 q0, d1, d13
+ vaddl.s16 q6, d2, d16
+ vaddl.s16 q1, d3, d17
+
+ vmov q8, q9
+ vmov q9, q10
+
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q0
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q1
+
+ vmov q11, q12
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH
+ sub lr, r0, #1*GRAIN_WIDTH
+ vld1.8 {q9}, [r12] // load the previous block right above
+ vld1.8 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_left_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // of the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #13
+ vext.8 q11, q11, q11, #13
+ b sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon
+ sub r12, r0, #3*GRAIN_WIDTH + 3
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d20, d26[0]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d26[1]
+
+ vmull.s8 q0, d22, d20
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vext.8 q8, q11, q12, #2
+ vdup.8 d20, d26[2]
+ vext.8 q9, q11, q12, #3
+ vdup.8 d21, d26[3]
+
+ vaddl.s16 q2, d0, d12
+ vaddl.s16 q3, d1, d13
+ vaddl.s16 q4, d2, d14
+ vaddl.s16 q5, d3, d15
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #4
+ vdup.8 d20, d26[4]
+ vext.8 q7, q11, q12, #5
+ vdup.8 d21, d26[5]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ sub r12, r0, #2*GRAIN_WIDTH + 3
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #6
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d20, d26[6]
+ vdup.8 d21, d26[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d22, d21
+ vmull.s8 q7, d23, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #1
+ vdup.8 d20, d27[0]
+ vext.8 q7, q11, q12, #2
+ vdup.8 d21, d27[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #3
+ vdup.8 d20, d27[2]
+ vext.8 q9, q11, q12, #4
+ vdup.8 d21, d27[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ sub r12, r0, #1*GRAIN_WIDTH + 3
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #5
+ vdup.8 d20, d27[4]
+ vext.8 q7, q11, q12, #6
+ vdup.8 d21, d27[5]
+
+ vld1.8 {q11, q12}, [r12]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vdup.8 d20, d27[6]
+ vext.8 q9, q11, q12, #1
+ vdup.8 d21, d27[7]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d22, d20
+ vmull.s8 q1, d23, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #2
+ vdup.8 d20, d28[0]
+ vext.8 q7, q11, q12, #3
+ vdup.8 d21, d28[1]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+ vmull.s8 q8, d14, d21
+ vmull.s8 q9, d15, d21
+
+ vaddl.s16 q6, d0, d16
+ vaddl.s16 q7, d1, d17
+ vaddl.s16 q0, d2, d18
+ vaddl.s16 q1, d3, d19
+
+ vext.8 q8, q11, q12, #4
+ vdup.8 d20, d28[2]
+ vext.8 q9, q11, q12, #5
+ vdup.8 d21, d28[3]
+
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d16, d20
+ vmull.s8 q1, d17, d20
+ vmull.s8 q6, d18, d21
+ vmull.s8 q7, d19, d21
+
+ vaddl.s16 q8, d0, d12
+ vaddl.s16 q9, d1, d13
+ vaddl.s16 q0, d2, d14
+ vaddl.s16 q1, d3, d15
+
+ vext.8 q6, q11, q12, #6
+ vdup.8 d20, d28[4]
+
+ vadd.i32 q2, q2, q8
+ vadd.i32 q3, q3, q9
+ vadd.i32 q4, q4, q0
+ vadd.i32 q5, q5, q1
+
+ vmull.s8 q0, d12, d20
+ vmull.s8 q1, d13, d20
+
+ vaddw.s16 q2, q2, d0
+ vaddw.s16 q3, q3, d1
+ vaddw.s16 q4, q4, d2
+ vaddw.s16 q5, q5, d3
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon
+ push {r11,lr}
+1:
+ get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ subs r1, r1, #1
+ store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon
+ push {r11,lr}
+1:
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ subs r1, r1, #1
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+ vld1.8 {q3}, [r11]!
+ push {r11,lr}
+ bl get_gaussian_neon
+ vrshl.s16 q8, q0, q15
+ bl get_gaussian_neon
+ vrshl.s16 q9, q0, q15
+ vqmovn.s16 d0, q8
+ vqmovn.s16 d1, q9
+
+ vand q3, q3, q1
+ vmull.s8 q2, d6, d22
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2
+ vqmovn.s16 d5, q3
+ vst1.8 {q2}, [r0]!
+ pop {r11,pc}
+endfunc
+
+function get_grain_row_44_neon
+ push {r11,lr}
+ get_grain_row_44 d16, d17, d18, d19, d20, d21
+ pop {r11,pc}
+endfunc
+
+function add_uv_420_coeff_lag0_neon
+ vld1.16 {q2, q3}, [r11]!
+ vld1.16 {q4, q5}, [r12]!
+ vpaddl.s8 q2, q2
+ vpaddl.s8 q3, q3
+ vpaddl.s8 q4, q4
+ vpaddl.s8 q5, q5
+ vadd.i16 q2, q2, q4
+ vadd.i16 q3, q3, q5
+ vrshrn.s16 d4, q2, #2
+ vrshrn.s16 d5, q3, #2
+ b add_coeff_lag0_start
+endfunc
+
+function add_uv_422_coeff_lag0_neon
+ vld1.16 {q2, q3}, [r11]!
+ vpaddl.s8 q2, q2
+ vpaddl.s8 q3, q3
+ vrshrn.s16 d4, q2, #1
+ vrshrn.s16 d5, q3, #1
+
+add_coeff_lag0_start:
+ vand q3, q2, q1
+ vmull.s8 q2, d6, d22
+ vmull.s8 q3, d7, d22
+ vrshl.s16 q2, q2, q12
+ vrshl.s16 q3, q3, q12
+ vaddw.s8 q2, q2, d0
+ vaddw.s8 q3, q3, d1
+ vqmovn.s16 d4, q2
+ vqmovn.s16 d5, q3
+ bx lr
+endfunc
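+
+// Both entry points above end up computing, per output sample (a rough
+// sketch, with luma_avg being the 2x2 or 2x1 luma average formed above):
+//
+//   uv_grain = sat8(gauss_grain + round2(ar_coeff_uv[0] * luma_avg, ar_coeff_shift));
+//
+// q1 carries a byte mask that zeroes the luma contribution in the lanes at
+// the row edges.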
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH
+ mov r1, r2
+ mul r12, r12, lr
+.endif
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+.ifc \type, uv_444
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid IT instruction forms deprecated on armv8.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.16 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13
+ vext.8 q14, q1, q0, #1
+ vneg.s16 q12, q12
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 16
+ vmov.i8 q1, #255
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 64
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+ add r11, r11, #2
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q9, q9, q10, q11
+ sum_\type\()_lag1 q10, q10, q11, q12
+ sum_\type\()_lag1 q12, q11, q12, q13, right
+ get_grain_2 d26
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
+ vmov q11, q10
+ vmov q10, q9
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ vpush {d26}
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #2
+.endif
+ vst1.16 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.else
+ sub \reg, \reg, #3*32-GRAIN_WIDTH
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH-3
+ mov r1, r2
+ mul r12, r12, lr
+
+ movrel r3, X(gaussian_sequence)
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid IT instruction forms deprecated on armv8.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5}
+.endif
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.16 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vext.8 q13, q0, q1, #13
+ vext.8 q14, q1, q0, #7
+ vneg.s16 q12, q12
+
+1:
+ bl get_grain_row_44_neon
+.ifc \type, uv_420
+ add r12, r11, #GRAIN_WIDTH
+.endif
+ vmov q1, q13
+ vmov q0, q8
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, #255
+ vmov q0, q9
+ vmov q8, q2
+ bl add_\type\()_coeff_lag0_neon
+ vmov.i8 q1, q14
+ vmov q0, q10
+ vmov q9, q2
+ bl add_\type\()_coeff_lag0_neon
+ vmov q10, q2
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d16, d17, d18, d19, d20, d21
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ sum_\type\()_lag1 q7, q8, q8, q9, left
+ sum_\type\()_lag1 q8, q8, q9, q10
+ sum_\type\()_lag1 q10, q9, q10, q11, right
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ store_grain_row_44 d14, d15, d16, d17, d20, d21
+ vmov q9, q8
+ vmov q8, q7
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #127
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH-48
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ vmov.u8 r11, \src1[0+\off]
+ vmov.u8 r12, \src2[0+\off]
+ add r11, r11, r3
+ vmov.u8 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11]
+ vmov.u8 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u8 r12, \src1[4+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u8 lr, \src2[4+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u8 r11, \src1[6+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u8 r12, \src2[6+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
+ gather_interleaved \dst1, \dst3, \src1, \src3, 0
+ gather_interleaved \dst1, \dst3, \src1, \src3, 1
+ gather_interleaved \dst2, \dst4, \src2, \src4, 0
+ gather_interleaved \dst2, \dst4, \src2, \src4, 1
+.endm
+
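+// A rough C sketch of what the gather macros above do (not the dav1d C
+// reference itself): each lane is a byte-wide table lookup into the scaling
+// LUT whose base is held in r3,
+//
+//   for (int i = 0; i < n; i++)
+//       dst[i] = scaling[src[i]];
+//
+// with n == 32 for gather32_neon and n == 16 for gather16_neon. The lanes are
+// handled in an interleaved order so address computation and loads overlap.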
+function gather32_neon
+ push {r11-r12,lr}
+ gather d8, d9, d10, d11, d0, d1, d2, d3
+ pop {r11-r12,pc}
+endfunc
+
+function gather16_neon
+ push {r11-r12,lr}
+ gather_interleaved d8, d9, d0, d1, 0
+ gather_interleaved d8, d9, d0, d1, 1
+ pop {r11-r12,pc}
+endfunc
+
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0
+ .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0
+ .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
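+
+// These are the blend weights for the overlapped block borders; as a sketch
+// (old/cur being the two grain values blended for a given overlapped lane):
+//
+//   cur = round2(old * w_old + cur * w_cur, 5);
+//
+// overlap_coeffs_0 holds the (27,17)/(17,27) pairs used for the two
+// overlapped rows/columns at full resolution, overlap_coeffs_1 the single
+// (23,22) pair used when that dimension is subsampled; the trailing 32s
+// leave the remaining lanes unchanged (cur * 32 >> 5 == cur).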
+
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx // grain_lut += offx
+.endm
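+
+// In rough C (a sketch; the offx/offy naming follows the comments above):
+//
+//   int offx = randval >> 4, offy = randval & 0xF;
+//   if (!sx) offx *= 2;
+//   if (!sy) offy *= 2;
+//   grain = grain_lut_base + offy * grain_stride + offx;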
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
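+//
+// Per pixel, the loops below roughly compute (a sketch, not the dav1d C
+// reference itself):
+//
+//   noise  = round2(scaling[src[x]] * grain[x], scaling_shift);
+//   dst[x] = iclip(src[x] + noise, clip ? 16 : 0, clip ? 235 : 255);
+//
+// where grain[x] has first been blended across overlapped block edges with
+// the overlap coefficients above.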
+function fgy_32x32_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH // grain_lut stride
+
+ neg r4, r4
+ vdup.16 q13, r4 // -scaling_shift
+ cmp r8, #0
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i8 q14, #16
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ add r5, r5, #9 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9
+
+ add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ tst r10, #1
+ ldr r10, [r11, r10, lsl #2]
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx
+
+ add r11, r11, r10
+
+ beq 1f
+ // y overlap
+ vdup.8 d14, d24[0]
+ vdup.8 d15, d24[1]
+ mov r10, r7 // backup actual h
+ mov r7, #2
+1:
+ bx r11
+endfunc
+
+function fgy_loop_neon
+L(fgy_loop_tbl):
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q2, q3}, [r6], r9 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r8], r9 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+ vld1.8 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d4, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d4, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d15
+ vmull.s8 q5, d21, d15
+ vmull.s8 q8, d22, d15
+ vmull.s8 q9, d23, d15
+ vmlal.s8 q4, d4, d14
+ vmlal.s8 q5, d5, d14
+ vmlal.s8 q8, d6, d14
+ vmlal.s8 q9, d7, d14
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q8, #5
+ vqrshrn.s16 d23, q9, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+
+ bl gather32_neon
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q2, d8 // scaling
+ vmovl.u8 q3, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q2 // scaling * grain
+ vmul.i16 q9, q9, q3
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ subs r7, r7, #1
+.if \oy
+ vdup.8 d14, d25[0]
+ vdup.8 d15, d25[1]
+.endif
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0)
+.endif
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
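+//
+// Per pixel, this roughly computes (a sketch; csfl is the chroma-scaled-from-
+// luma mode, where the scaling LUT is indexed by the subsampled luma itself):
+//
+//   val    = csfl ? luma[x]
+//                 : clip_pixel(((luma[x] * uv_luma_mult + src[x] * uv_mult) >> 6)
+//                              + uv_offset);
+//   noise  = round2(scaling[val] * grain[x], scaling_shift);
+//   dst[x] = iclip(src[x] + noise, clip ? 16 : 0, clip ? (is_id ? 235 : 240) : 255);
+//
+// For the horizontally subsampled layouts, luma[x] is the rounded average of
+// the two covered luma pixels.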
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+ ldrd r8, r9, [sp, #116] // offsets, h
+ ldrd r10, r11, [sp, #124] // uv, is_id
+
+ // !csfl
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ add r10, r10, #FGD_UV_OFFSET
+ vld1.16 {d4[]}, [r12] // uv_luma_mult
+ vld1.16 {d4[2]}, [r10] // uv_offset
+ vld1.16 {d4[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg lr, lr // -scaling_shift
+
+ cmp r12, #0
+ vdup.16 q13, lr // -scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0
+ vmov.i8 q14, #16
+ vmov.i8 q15, #240
+ beq 2f
+ // is_id
+ vmov.i8 q15, #235
+ b 2f
+1:
+ // no clip
+ vmov.i8 q14, #0
+ vmov.i8 q15, #255
+2:
+
+ mov r10, #GRAIN_WIDTH // grain_lut stride
+
+ add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10
+
+ add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+
+ vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (though probably not when building for thumb on ELF),
+ // so clear the bit here for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1
+ ldr lr, [r12, lr, lsl #2]
+
+ add r12, r12, lr
+
+ beq 1f
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy)
+
+1:
+
+.if \sy
+ vmov.i8 d6, #23
+ vmov.i8 d7, #22
+.else
+ vmov.i8 d6, #27
+ vmov.i8 d7, #17
+.endif
+
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+
+ bx r12
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8, q9}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7
+ vmull.s8 q5, d21, d7
+ vmull.s8 q6, d22, d7
+ vmull.s8 q7, d23, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vmlal.s8 q6, d18, d6
+ vmlal.s8 q7, d19, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+ vqrshrn.s16 d22, q6, #5
+ vqrshrn.s16 d23, q7, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if !\csfl
+ vld1.8 {q8, q9}, [r1, :128] // src
+ vmovl.u8 q4, d0
+ vmovl.u8 q5, d1
+ vmovl.u8 q6, d2
+ vmovl.u8 q7, d3
+ vmovl.u8 q0, d16
+ vmovl.u8 q1, d17
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+ vmul.i16 q4, q4, d4[0]
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q6, q6, d4[0]
+ vmul.i16 q7, q7, d4[0]
+ vmul.i16 q0, q0, d4[1]
+ vmul.i16 q1, q1, d4[1]
+ vmul.i16 q8, q8, d4[1]
+ vmul.i16 q9, q9, d4[1]
+ vqadd.s16 q4, q4, q0
+ vqadd.s16 q5, q5, q1
+ vqadd.s16 q6, q6, q8
+ vqadd.s16 q7, q7, q9
+ vdup.16 q0, d4[2]
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vshr.s16 q6, q6, #6
+ vshr.s16 q7, q7, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vadd.i16 q6, q6, q0
+ vadd.i16 q7, q7, q0
+ vqmovun.s16 d0, q4
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6
+ vqmovun.s16 d3, q7
+.endif
+
+ bl gather32_neon
+
+ vld1.8 {q0, q1}, [r1, :128], r2 // src
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+ vmul.i16 q10, q10, q4
+ vmul.i16 q11, q11, q5
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+ vrshl.s16 q10, q10, q13
+ vrshl.s16 q11, q11, q13
+
+ vaddw.u8 q8, q8, d0 // *src + noise
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+
+ vmax.u8 q0, q0, q14
+ vmax.u8 q1, q1, q14
+ vmin.u8 q0, q0, q15
+ vmin.u8 q1, q1, q15
+
+ subs r9, r9, #1
+.if \oy
+ vdup.8 d6, d25[0]
+ vdup.8 d7, d25[1]
+.endif
+
+ vst1.8 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.8 {d8}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.8 {q8}, [r8], r10 // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.8 {d10}, [r11], r10 // grain_lut top old
+.endif
+ vld1.8 {q0, q1}, [r6, :128], r7 // luma
+ vld1.8 {q10}, [r5], r10 // grain_lut
+ vld1.8 {q11}, [r1, :128], r2 // src
+
+.if \ox
+ vmull.s8 q4, d8, d24
+ vmlal.s8 q4, d20, d25
+.endif
+
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+.if \oy
+.if \ox
+ vmull.s8 q5, d10, d24
+ vmlal.s8 q5, d16, d25
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d16, q5, #5
+.endif
+
+ vmull.s8 q4, d20, d7
+ vmull.s8 q5, d21, d7
+ vmlal.s8 q4, d16, d6
+ vmlal.s8 q5, d17, d6
+ vqrshrn.s16 d20, q4, #5
+ vqrshrn.s16 d21, q5, #5
+.elseif \ox
+ vqrshrn.s16 d20, q4, #5
+.endif
+.if \csfl
+ vrshrn.u16 d0, q0, #1
+ vrshrn.u16 d1, q1, #1
+.else
+ vrshr.u16 q4, q0, #1
+ vrshr.u16 q5, q1, #1
+ vmovl.u8 q0, d22
+ vmovl.u8 q1, d23
+ vmul.i16 q4, q4, d4[0]
+ vmul.i16 q5, q5, d4[0]
+ vmul.i16 q0, q0, d4[1]
+ vmul.i16 q1, q1, d4[1]
+ vqadd.s16 q4, q4, q0
+ vqadd.s16 q5, q5, q1
+ vdup.16 q0, d4[2]
+ vshr.s16 q4, q4, #6
+ vshr.s16 q5, q5, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vqmovun.s16 d0, q4
+ vqmovun.s16 d1, q5
+.endif
+
+ bl gather16_neon
+
+ vmovl.s8 q8, d20 // grain
+ vmovl.s8 q9, d21
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vmul.i16 q8, q8, q6 // scaling * grain
+ vmul.i16 q9, q9, q7
+
+ vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
+ vrshl.s16 q9, q9, q13
+
+ vaddw.u8 q8, q8, d22 // *src + noise
+ vaddw.u8 q9, q9, d23
+
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+
+ vmax.u8 q0, q0, q14
+ vmin.u8 q0, q0, q15
+
+ subs r9, r9, #1
+.if \oy
+ vswp d6, d7
+.endif
+ vst1.8 {q0}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/filmgrain16.S b/third_party/dav1d/src/arm/32/filmgrain16.S
new file mode 100644
index 0000000000..6c36caceae
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/filmgrain16.S
@@ -0,0 +1,2137 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+ lsr r11, r2, #3
+ lsr r12, r2, #12
+ lsr lr, r2, #1
+ eor r11, r2, r11 // (r >> 0) ^ (r >> 3)
+ eor r12, r12, lr // (r >> 12) ^ (r >> 1)
+ eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr r2, r2, #\steps
+.endif
+ and r11, r11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr r2, r2, r11, lsl #(16 - \steps) // *state
+.else
+ orr r2, r2, r11, lsl #16 // *state
+.endif
+.endm
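+
+// One step of this LFSR corresponds to the following C (a sketch of the
+// single-step update; the macro folds 'steps' such updates into one go):
+//
+//   unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+//   r = (r >> 1) | (bit << 15);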
+
+.macro read_rand dest, bits, age
+ ubfx \dest, r2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, r2, #17 - \bits, #\bits
+ lsr r2, r2, #1
+.endm
+
+// special calling convention:
+// r2 holds seed
+// r3 holds dav1d_gaussian_sequence
+// clobbers r11-r12
+// returns in d0-d1
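+//
+// In effect (a sketch): out[i] = dav1d_gaussian_sequence[rnd], where rnd is
+// an 11-bit index produced by the increment_seed/read_rand macros above;
+// two 4-step seed updates yield the eight indices for one vector.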
+function get_gaussian_neon
+ push {r5-r6,lr}
+ increment_seed 4
+ read_rand r5, 11, 3
+ read_rand r6, 11, 2
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[0]}, [r5]
+ read_rand r5, 11, 1
+ vld1.16 {d0[1]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 0
+ increment_seed 4
+ add r6, r3, r6, lsl #1
+ vld1.16 {d0[2]}, [r5]
+ read_rand r5, 11, 3
+ vld1.16 {d0[3]}, [r6]
+ add r5, r3, r5, lsl #1
+ read_rand r6, 11, 2
+ vld1.16 {d1[0]}, [r5]
+ add r6, r3, r6, lsl #1
+ read_rand r5, 11, 1
+ vld1.16 {d1[1]}, [r6]
+ read_rand r6, 11, 0
+ add r5, r3, r5, lsl #1
+ add r6, r3, r6, lsl #1
+ vld1.16 {d1[2]}, [r5]
+ vld1.16 {d1[3]}, [r6]
+ pop {r5-r6,pc}
+endfunc
+
+function get_grain_2_neon
+ push {r11,lr}
+ increment_seed 2
+ read_rand r11, 11, 1
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ vld1.16 {d0[1]}, [r12]
+ vrshl.s16 d0, d0, d30
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+function get_grain_4_neon
+ push {r11,lr}
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[0]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d0[1]}, [r12]
+ read_rand r12, 11, 0
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d0[2]}, [r11]
+ vld1.16 {d0[3]}, [r12]
+ vrshl.s16 d0, d0, d30
+ pop {r11,pc}
+endfunc
+
+.macro get_grain_4 dst
+ bl get_grain_4_neon
+.ifnc \dst, d0
+ vmov \dst, d0
+.endif
+.endm
+
+// r1 holds the number of entries to produce
+// r6, r8 and r10 hold the previous output entries
+// q0 holds the vector of produced entries
+// q1 holds the input vector of sums from above
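+//
+// Per element, the recursion is roughly (lag-1 shown as a sketch; lag-2/3
+// add the extra previous-output terms in the same way):
+//
+//   grain = ((sum_above + coeff * prev + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift)
+//         + ((gauss + (1 << (shift - 1))) >> shift); // shift = 4 - bitdepth_min_8 + grain_scale_shift
+//   grain = iclip(grain, grain_min, grain_max);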
+.macro output_lag n
+function output_lag\n\()_neon
+ push {r0, lr}
+.if \n == 1
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ mov r0, #1
+ mov lr, #1
+ sub r7, r7, #1
+ sub r9, r9, #1
+ lsl r0, r0, r7
+ lsl lr, lr, r9
+ add r7, r7, #1
+ add r9, r9, #1
+.endif
+1:
+ read_shift_rand r12, 11
+ vmov.32 r11, d2[0]
+ lsl r12, r12, #1
+ vext.8 q0, q0, q0, #2
+ ldrsh r12, [r3, r12]
+.if \n == 1
+ mla r11, r6, r4, r11 // sum (above) + *coeff * prev output
+ add r6, r11, r8 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, r10
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ cmp r6, r5
+.elseif \n == 2
+ mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r6, r10, r11 // += *coeff * prev output 2
+ mov r8, r6
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.else
+ push {r1-r3}
+ sbfx r1, r4, #0, #8
+ sbfx r2, r4, #8, #8
+ sbfx r3, r4, #16, #8
+ mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1
+ mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2
+ mla r11, r6, r3, r11 // += *coeff * prev output 3
+ pop {r1-r3}
+ mov r10, r8
+ mov r8, r6
+
+ add r6, r11, r0 // 1 << (ar_coeff_shift - 1)
+ add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr r6, r6, r7 // >> ar_coeff_shift
+ asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add r6, r6, r12
+ push {lr}
+ cmp r6, r5
+ mvn lr, r5 // grain_min = ~grain_max
+.endif
+ it gt
+ movgt r6, r5
+ cmp r6, lr
+ it lt
+ movlt r6, lr
+.if \n >= 2
+ pop {lr}
+.endif
+ subs r1, r1, #1
+ vext.8 q1, q1, q1, #4
+ vmov.16 d1[3], r6
+ bgt 1b
+ pop {r0, pc}
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon
+ sub r12, r0, #1*GRAIN_WIDTH*2 - 16
+ vld1.16 {q10}, [r12] // load top right
+
+ vext.8 q0, q8, q9, #14 // top left, top mid
+ vext.8 q1, q9, q10, #2 // top mid, top right
+
+ vmull.s16 q2, d18, d28
+ vmlal.s16 q2, d0, d27
+ vmlal.s16 q2, d2, d29
+ vmull.s16 q3, d19, d28
+ vmlal.s16 q3, d1, d27
+ vmlal.s16 q3, d3, d29
+
+ vmov q8, q9
+ vmov q9, q10
+
+ bx lr
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+.ifc \lag\()_\edge, lag3_left
+ bl sum_lag3_left_above_neon
+.else
+ bl sum_\lag\()_above_neon
+.endif
+.ifc \type, uv_420
+ vpush {q6-q7}
+ add r12, r11, #GRAIN_WIDTH*2
+ vld1.16 {q0, q1}, [r11]!
+ vld1.16 {q6, q7}, [r12]!
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d12, d12, d13
+ vpadd.i16 d13, d14, d15
+ vadd.i16 q0, q0, q6
+ vpop {q6-q7}
+ vrshr.s16 q0, q0, #2
+.endif
+.ifc \type, uv_422
+ vld1.16 {q0, q1}, [r11]!
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vrshr.s16 q0, q0, #1
+.endif
+.ifc \type, uv_444
+ vld1.16 {q0}, [r11]!
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ vdup.8 d13, \uv_coeff
+ vmovl.s8 q6, d13
+.endif
+ vmlal.s16 q2, d0, d13
+ vmlal.s16 q3, d1, d13
+.endif
+.if \uv_layout && \elems == 8
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+ push {r11}
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4
+ read_rand r11, 11, 3
+ read_rand r12, 11, 2
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d1[1]}, [r11]
+ read_rand r11, 11, 1
+ vld1.16 {d1[2]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d1[3]}, [r11]
+ lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ vrshl.s16 d1, d1, d30
+ vext.8 q2, q2, q2, #12
+.ifc \lag, lag3
+ vmov.s16 r10, d1[1]
+.endif
+.ifnc \lag, lag1
+ vmov.s16 r8, d1[2]
+.endif
+ vmov.s16 r6, d1[3]
+
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ vmov q1, q3
+.ifc \edge, right
+ mov r1, #3
+ bl output_\lag\()_neon
+ read_shift_rand r12, 11
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r12]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #2
+.else
+ mov r1, #4
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ vmov q1, q2
+ mov r1, #1
+ bl output_\lag\()_neon
+ lsr r2, r2, #3
+
+ read_rand r11, 11, 2
+ read_rand r12, 11, 1
+ add r11, r3, r11, lsl #1
+ add r12, r3, r12, lsl #1
+ vld1.16 {d2[0]}, [r11]
+ read_rand r11, 11, 0
+ vld1.16 {d2[1]}, [r12]
+ add r11, r3, r11, lsl #1
+ vld1.16 {d2[2]}, [r11]
+ vrshl.s16 d2, d2, d30
+ vext.8 q0, q0, q1, #14
+.endif
+ vst1.16 {q0}, [r0]!
+ pop {r11}
+ pop {r1, pc}
+.endif
+.endm
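+
+// For the chroma layouts, the luma contribution folded in above is roughly
+// (a sketch; l[] is the co-located luma grain row, W its row stride):
+//
+//   uv_444: luma = l[x];
+//   uv_422: luma = round2(l[2*x] + l[2*x + 1], 1);
+//   uv_420: luma = round2(l[2*x] + l[2*x + 1] + l[2*x + W] + l[2*x + 1 + W], 2);
+//
+// multiplied by the final ar_coeffs_uv entry (the luma tap) and added to the
+// running sums in q2/q3 before the per-element recursion runs.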
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #1*GRAIN_WIDTH*2
+ vld1.8 {q9}, [r12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon
+ push {lr}
+ sub r12, r0, #2*GRAIN_WIDTH*2 - 16
+ sub lr, r0, #1*GRAIN_WIDTH*2 - 16
+ vld1.16 {q10}, [r12] // load top right
+ vld1.16 {q13}, [lr]
+
+ vdup.8 d10, d28[0]
+ vext.8 q0, q8, q9, #12 // top left, top mid
+ vdup.8 d12, d28[1]
+ vext.8 q1, q8, q9, #14
+ vdup.8 d14, d28[3]
+ vext.8 q4, q9, q10, #2 // top mid, top right
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmull.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d8, d14
+ vmull.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d28[4]
+ vext.8 q0, q9, q10, #4 // top mid, top right
+ vdup.8 d12, d28[5]
+ vext.8 q1, q11, q12, #12 // top left, top mid
+ vdup.8 d14, d28[6]
+ vext.8 q4, q11, q12, #14
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+
+ vmlal.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d8, d14
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d9, d14
+
+ vdup.8 d10, d29[0]
+ vext.8 q0, q12, q13, #2 // top mid, top right
+ vdup.8 d12, d29[1]
+ vext.8 q1, q12, q13, #4
+
+ vdup.8 d14, d28[2]
+ vdup.8 d8, d28[7]
+
+ vmovl.s8 q5, d10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q4, d8
+
+ vmlal.s16 q2, d0, d10
+ vmlal.s16 q2, d2, d12
+ vmlal.s16 q2, d18, d14
+ vmlal.s16 q2, d24, d8
+ vmlal.s16 q3, d1, d10
+ vmlal.s16 q3, d3, d12
+ vmlal.s16 q3, d19, d14
+ vmlal.s16 q3, d25, d8
+
+ vmov q8, q9
+ vmov q9, q10
+
+ vmov q11, q12
+ vmov q12, q13
+
+ pop {pc}
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+ push {r1, lr}
+.ifc \edge, left
+ sub r12, r0, #2*GRAIN_WIDTH*2
+ sub lr, r0, #1*GRAIN_WIDTH*2
+ vld1.16 {q9}, [r12] // load the previous block right above
+ vld1.16 {q12}, [lr]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_left_above_neon
+ // A separate codepath for the left edge, to avoid reading outside
+ // of the edge of the buffer.
+ sub r12, r0, #3*GRAIN_WIDTH*2
+ vld1.8 {q11, q12}, [r12]
+ vext.8 q12, q11, q12, #10
+ vext.8 q11, q11, q11, #10
+ b sum_lag3_above_start
+endfunc
+
+function sum_lag3_above_neon
+ movw r12, #(3*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12
+ vld1.8 {q11, q12}, [r12]
+
+sum_lag3_above_start:
+ vdup.8 d12, d26[0]
+ vext.8 q1, q11, q12, #2
+ vdup.8 d14, d26[1]
+ vext.8 q4, q11, q12, #4
+ vdup.8 d16, d26[2]
+ vext.8 q5, q11, q12, #6
+ vdup.8 d18, d26[3]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ movw r12, #(2*GRAIN_WIDTH + 3)*2
+ sub r12, r0, r12
+
+ vmull.s16 q2, d22, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmull.s16 q3, d23, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d26[4]
+ vext.8 q0, q11, q12, #8
+ vdup.8 d14, d26[5]
+ vext.8 q1, q11, q12, #10
+ vdup.8 d16, d26[6]
+ vext.8 q4, q11, q12, #12
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d18, d26[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d22, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d23, d18
+
+ vdup.8 d12, d27[0]
+ vext.8 q0, q11, q12, #2
+ vdup.8 d14, d27[1]
+ vext.8 q1, q11, q12, #4
+ vdup.8 d16, d27[2]
+ vext.8 q4, q11, q12, #6
+ vdup.8 d18, d27[3]
+ vext.8 q5, q11, q12, #8
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ sub r12, r0, #(1*GRAIN_WIDTH + 3)*2
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d27[4]
+ vext.8 q0, q11, q12, #10
+ vdup.8 d14, d27[5]
+ vext.8 q1, q11, q12, #12
+ vld1.8 {q11, q12}, [r12]
+ vdup.8 d16, d27[6]
+ vdup.8 d18, d27[7]
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vext.8 q5, q11, q12, #2
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d22, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d23, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[0]
+ vext.8 q0, q11, q12, #4
+ vdup.8 d14, d28[1]
+ vext.8 q1, q11, q12, #6
+ vdup.8 d16, d28[2]
+ vext.8 q4, q11, q12, #8
+ vdup.8 d18, d28[3]
+ vext.8 q5, q11, q12, #10
+ vmovl.s8 q6, d12
+ vmovl.s8 q7, d14
+ vmovl.s8 q8, d16
+ vmovl.s8 q9, d18
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q2, d2, d14
+ vmlal.s16 q2, d8, d16
+ vmlal.s16 q2, d10, d18
+ vmlal.s16 q3, d1, d12
+ vmlal.s16 q3, d3, d14
+ vmlal.s16 q3, d9, d16
+ vmlal.s16 q3, d11, d18
+
+ vdup.8 d12, d28[4]
+ vext.8 q0, q11, q12, #12
+ vmovl.s8 q6, d12
+
+ vmlal.s16 q2, d0, d12
+ vmlal.s16 q3, d1, d12
+
+ bx lr
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon
+ push {r1, lr}
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon
+ push {r10-r11,lr}
+1:
+ mov r10, #80
+2:
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]!
+ bgt 2b
+ get_grain_2 d0
+ subs r1, r1, #1
+ vst1.32 {d0[0]}, [r0]!
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function generate_grain_rows_44_neon
+ push {r10-r11,lr}
+1:
+ mov r10, #40
+2:
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+ subs r10, r10, #8
+ vst1.16 {q0}, [r0]!
+ bgt 2b
+ get_grain_4 d0
+ subs r1, r1, #1
+ vst1.16 {d0}, [r0]
+ add r0, r0, #GRAIN_WIDTH*2-80
+ bgt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+ vld1.16 {q3}, [r11]!
+gen_grain_uv_lag0_8_start:
+ push {r11,lr}
+ bl get_gaussian_neon
+ vrshl.s16 q0, q0, q15
+gen_grain_uv_lag0_8_add:
+ vand q3, q3, q1
+ vmull.s16 q2, d6, d22
+ vmull.s16 q3, d7, d22
+ vrshl.s32 q2, q2, q12
+ vrshl.s32 q3, q3, q12
+ vqmovn.s32 d4, q2
+ vqmovn.s32 d5, q3
+ vqadd.s16 q2, q2, q0
+ vmin.s16 q2, q2, q9
+ vmax.s16 q2, q2, q10
+ vst1.16 {q2}, [r0]!
+ pop {r11,pc}
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon
+ add r12, r11, #GRAIN_WIDTH*2
+ vld1.16 {q2,q3}, [r11]!
+ vld1.16 {q4,q5}, [r12]
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d8, d8, d9
+ vpadd.i16 d9, d10, d11
+ vadd.i16 q2, q2, q4
+ vrshr.s16 q3, q2, #2
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon
+ vld1.16 {q2,q3}, [r11]!
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vrshr.s16 q3, q2, #1
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon
+ add r12, r11, #GRAIN_WIDTH*2
+ vld1.16 {q2}, [r11]
+ vld1.16 {q0}, [r12]
+ add r11, r11, #32
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d0
+ vrshr.s16 d6, d4, #2
+ push {r11,lr}
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon
+ vld1.16 {q2}, [r11]
+ add r11, r11, #32
+ vpadd.i16 d4, d4, d5
+ vrshr.s16 d6, d4, #1
+ push {r11,lr}
+ get_grain_4 d0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+
+.ifc \type, uv_444
+ ldr r4, [sp, #36]
+ mov r12, r3
+ mov lr, #28
+ add r11, r1, #3*GRAIN_WIDTH*2
+ mov r1, r2
+ mul r12, r12, lr
+ clz lr, r4
+.else
+ clz lr, r2
+.endif
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add r4, r1, #FGD_AR_COEFFS_Y
+.else
+ add r4, r1, #FGD_AR_COEFFS_UV
+.endif
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+.ifc \type, uv_444
+ push {lr}
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid IT instruction forms deprecated on armv8.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+ pop {lr}
+.endif
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr // bitdepth_min_8
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, y
+ mov r1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+ mov r1, #GRAIN_HEIGHT-3
+
+ vdup.32 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5
+ vdup.16 q10, r6
+ vext.8 q13, q0, q1, #10
+ vext.8 q14, q1, q0, #2
+ vneg.s32 q12, q12
+ vmovl.s8 q11, d22
+
+1:
+ vmov q1, q13
+ bl gen_grain_uv_444_lag0_neon // 8
+ vmov.i8 q1, #255
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ vmov q1, q14
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+ add r11, r11, #4
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb r4, [r4, #1] // ar_coeffs_y[3]
+.else
+ add r4, r4, #2
+.endif
+
+ mov r1, #3
+.ifc \type, uv_444
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+ vmovl.s8 q13, d27
+ vmovl.s8 q12, d29
+ vmovl.s8 q14, d28
+ vmov d29, d24
+.ifc \type, uv_444
+ vmovl.s8 q6, d13
+.endif
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_neon
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ vpush {d26}
+ bl generate_grain_rows_neon
+ vpop {d26}
+
+ mov r1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 d16
+ subs r1, r1, #1
+.ifc \type, uv_444
+ add r11, r11, #4
+.endif
+ vst1.32 {d16[0]}, [r0]!
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+
+ ldr r4, [sp, #36]
+ mov r12, r3
+ movw r11, #(3*GRAIN_WIDTH-3)*2
+ mov lr, #28
+ add r11, r1, r11
+ mov r1, r2
+ mul r12, r12, lr
+ clz lr, r4
+
+ movrel r3, X(gaussian_sequence)
+ sub lr, lr, #24 // -bitdepth_min_8
+ ldr r2, [r1, #FGD_SEED]
+ ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT]
+ add r4, r1, #FGD_AR_COEFFS_UV
+ add r9, r9, lr // grain_scale_shift - bitdepth_min_8
+ adr r5, L(gen_grain_\type\()_tbl)
+ ldr r6, [r1, #FGD_AR_COEFF_LAG]
+ add r9, r9, #4
+ ldr r6, [r5, r6, lsl #2]
+ vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ add r5, r5, r6
+ vneg.s16 q15, q15
+
+ push {lr}
+ cmp r12, #0
+ movw r10, #0x49d8
+ movw lr, #0xb524
+ // Intentionally using a separate register instead of moveq with an
+ // immediate constant, to avoid the ARMv8-deprecated IT instruction forms.
+ it eq
+ moveq r10, lr
+ add r4, r4, r12 // Add offset to ar_coeffs_uv[1]
+ eor r2, r2, r10
+ pop {lr}
+
+ ldr r7, [r1, #FGD_AR_COEFF_SHIFT]
+ neg lr, lr
+ mov r8, #1
+ mov r10, #1
+ lsl r8, r8, r7 // 1 << ar_coeff_shift
+ lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift)
+ lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ bx r5
+
+ .align 2
+L(gen_grain_\type\()_tbl):
+ .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+ .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
+
+L(generate_grain_\type\()_lag0):
+.ifc \type, uv_420
+ vpush {q4-q5}
+.endif
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ mvn r6, r5 // grain_min = ~grain_max
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+ set_height r1, \type
+
+ vdup.32 q12, r7
+ vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0]
+ vmov.i8 q0, #0
+ vmov.i8 q1, #255
+ vdup.16 q9, r5
+ vdup.16 q10, r6
+ vext.8 q13, q0, q1, #10
+ vext.8 q14, q1, q0, #14
+ vneg.s32 q12, q12
+ vmovl.s8 q11, d22
+
+1:
+ vmov q1, q13
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ vmov.i8 q1, #255
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ vmov q1, q14
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+.ifc \type, uv_420
+ vpop {q4-q5}
+.endif
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag1):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0]
+ vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1]
+ vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2]
+ add r4, r4, #2
+
+ mov r1, #3
+ vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4]
+ ldrsb r4, [r4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+ vmovl.s8 q13, d27
+ vmovl.s8 q12, d29
+ vmovl.s8 q14, d28
+ vmov d29, d24
+ vmovl.s8 q6, d13
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag2):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12]
+
+ vmov.s8 r4, d29[2]
+ vmov.s8 r10, d29[3]
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(generate_grain_\type\()_lag3):
+ vpush {q4-q7}
+ mov r5, #128
+ lsl r5, r5, lr // 128 << bitdepth_min_8
+ sub r5, r5, #1 // (128 << bitdepth_min_8) - 1
+ vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+
+ vmov.u8 r4, d28[5]
+ vmov.u8 r10, d28[6]
+ vmov.u8 r12, d28[7]
+
+ orr r4, r4, r10, lsl #8
+ orr r4, r4, r12, lsl #16
+
+ mov r1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height r1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs r1, r1, #1
+ increment_y_ptr r11, \type
+ add r0, r0, #GRAIN_WIDTH*2-6*16
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
+ vmov.u16 r11, \src1[0+\off]
+ vmov.u16 r12, \src3[0+\off]
+ add r11, r11, r3
+ vmov.u16 lr, \src1[2+\off]
+ add r12, r12, r3
+ vld1.8 {\dst1[0+\off]}, [r11]
+ vmov.u16 r11, \src3[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst2[0+\off]}, [r12]
+ vmov.u16 r12, \src2[0+\off]
+ add r11, r11, r3
+ vld1.8 {\dst1[2+\off]}, [lr]
+ vmov.u16 lr, \src4[0+\off]
+ add r12, r12, r3
+ vld1.8 {\dst2[2+\off]}, [r11]
+ vmov.u16 r11, \src2[2+\off]
+ add lr, lr, r3
+ vld1.8 {\dst1[4+\off]}, [r12]
+ vmov.u16 r12, \src4[2+\off]
+ add r11, r11, r3
+ vld1.8 {\dst2[4+\off]}, [lr]
+ add r12, r12, r3
+ vld1.8 {\dst1[6+\off]}, [r11]
+ vld1.8 {\dst2[6+\off]}, [r12]
+.endm
+
+.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
+ gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
+ gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
+.endm
+
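+// gather32_neon/gather16_neon: look up scaling[] bytes (table base in r3) at
+// the indices given by the 16-bit lanes of d0-d7 (d0-d3 for gather16_neon)
+// and return them packed in d8-d11 (d8-d9 for gather16_neon).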
+function gather32_neon
+ push {r11-r12,lr}
+ gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7
+ pop {r11-r12,pc}
+endfunc
+
+function gather16_neon
+ push {r11-r12,lr}
+ gather_interleaved d8, d9, d0, d1, d2, d3, 0
+ gather_interleaved d8, d9, d0, d1, d2, d3, 1
+ pop {r11-r12,pc}
+endfunc
+
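+// Blend coefficients for the grain overlap regions; the blend below computes
+// (old*c0 + new*c1 + 16) >> 5 (vmull/vmlal + vqrshrn #5) and clamps the result
+// to the grain range. The trailing 0/32 pairs let non-overlapped lanes pass
+// through the same path unchanged.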
+const overlap_coeffs_0, align=4
+ .short 27, 17, 0, 0
+ .short 17, 27, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .short 23, 0, 0, 0
+ .short 22, 32, 32, 32
+endconst
+
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, lsl #1 // grain_lut += offx
+.endm
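+
+// Rough C equivalent of calc_offset + add_offset:
+// offy = randval & 0xF; offx = randval >> 4;
+// if (!sy) offy *= 2; if (!sx) offx *= 2;
+// dst = src + offy*stride + 2*offx; // 2 bytes per 16 bpc grain entry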
+
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
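+// Per pixel: noise = round2(scaling[src] * grain, scaling_shift), implemented
+// as vqrdmulh on scaling << (15 - scaling_shift); dst = clip(src + noise),
+// either to the full bitdepth range or, when clip is set, to 16..235 shifted
+// up by bitdepth_min_8.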
+function fgy_32x32_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
+ ldrd r6, r7, [sp, #108] // offsets, h
+ ldr r8, [sp, #116] // clip
+ mov r9, #GRAIN_WIDTH*2 // grain_lut stride
+ ldr r10, [sp, #124] // bitdepth_max
+
+ eor r4, r4, #15 // 15 - scaling_shift
+ vdup.16 q6, r10 // bitdepth_max
+ clz r10, r10
+ vdup.16 q13, r4 // 15 - scaling_shift
+ rsb r10, r10, #24 // bitdepth_min_8
+ cmp r8, #0
+ vdup.16 q12, r10 // bitdepth_min_8
+
+ movrel_local r12, overlap_coeffs_0
+
+ beq 1f
+ // clip
+ vmov.i16 q14, #16
+ vmov.i16 q15, #235
+ vshl.s16 q14, q14, q12
+ vshl.s16 q15, q15, q12
+ b 2f
+1:
+ // no clip
+ vmov.i16 q14, #0
+ vmov q15, q6
+2:
+ vshr.u16 q6, q6, #1 // grain_max
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ add r5, r5, #18 // grain_lut += 9
+ add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r9 // grain_lut += grain_stride
+
+ ldr r10, [r6, #8] // offsets[1][0]
+ calc_offset r10, r4, r10, 0, 0
+ add_offset r4, r10, r4, r5, r9
+ ldr r10, [r6, #4] // offsets[0][1]
+ calc_offset r10, r11, r10, 0, 0
+ add_offset r11, r10, r11, r5, r9
+ ldr r10, [r6, #12] // offsets[1][1]
+ calc_offset r10, r8, r10, 0, 0
+ add_offset r8, r10, r8, r5, r9
+ ldr r6, [r6] // offsets[0][0]
+ calc_offset r6, lr, r6, 0, 0
+ add_offset r5, r6, lr, r5, r9
+
+ add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx
+ add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ ldr r10, [sp, #120] // type
+ adr r11, L(fgy_loop_tbl)
+
+ tst r10, #1
+ ldr r10, [r11, r10, lsl #2]
+
+ add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx
+
+ add r11, r11, r10
+
+ beq 1f
+ // y overlap
+ vdup.16 d14, d24[0]
+ vdup.16 d15, d24[1]
+ mov r10, r7 // backup actual h
+ mov r7, #2
+1:
+ sub r2, r2, #32 // src_stride -= 32
+ sub r9, r9, #32 // grain_stride -= 32
+ bx r11
+endfunc
+
+function fgy_loop_neon
+L(fgy_loop_tbl):
+ .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
+ .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
+
+.macro fgy ox, oy
+L(loop_\ox\oy):
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r9 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r6]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r8], r9 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q4, q5}, [r6], r9 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r1, :128]! // src
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+.if !\oy
+ vmvn.i16 q5, #0xf000 // 0x0fff
+.endif
+ vld1.16 {q10, q11}, [r5], r9 // grain_lut
+
+.if \ox
+ add r4, r4, #32
+ vmull.s16 q0, d0, d24
+ vmlal.s16 q0, d16, d25
+.endif
+
+.if \oy
+.if \ox
+ add r8, r8, #32
+ vmull.s16 q1, d2, d24
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5
+ vmvn d0, d12 // grain_min
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d16, d16, d12
+ vmin.s16 d4, d4, d12
+ vmax.s16 d16, d16, d0
+ vmax.s16 d4, d4, d0
+.endif
+
+ vmull.s16 q0, d4, d14
+ vmull.s16 q1, d5, d14
+ vmull.s16 q2, d6, d14
+ vmull.s16 q3, d7, d14
+ vmlal.s16 q0, d16, d15
+ vmlal.s16 q1, d17, d15
+ vmlal.s16 q2, d18, d15
+ vmlal.s16 q3, d19, d15
+ vmull.s16 q8, d20, d15
+ vmull.s16 q9, d21, d15
+ vmull.s16 q10, d22, d15
+ vmull.s16 q11, d23, d15
+ vmlal.s16 q8, d8, d14
+ vmlal.s16 q9, d9, d14
+ vmlal.s16 q10, d10, d14
+ vmlal.s16 q11, d11, d14
+ vmvn q4, q6 // grain_min
+ vqrshrn.s32 d0, q0, #5
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q6
+ vmin.s16 q9, q1, q6
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 q10, q2, q6
+ vmin.s16 q11, q3, q6
+ vmax.s16 q8, q8, q4
+ vmax.s16 q9, q9, q4
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vmvn.i16 q5, #0xf000 // 0x0fff
+ vmax.s16 q10, q10, q4
+ vmax.s16 q11, q11, q4
+.elseif \ox
+ vmvn d4, d12 // grain_min
+ vqrshrn.s32 d16, q0, #5
+ vld1.16 {q0, q1}, [r1, :128]! // src
+ vmin.s16 d16, d16, d12
+ vmax.s16 d16, d16, d4
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+.endif
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q5
+ vand q1, q1, q5
+ vand q2, q2, q5
+ vand q3, q3, q5
+
+ bl gather32_neon
+
+.if \ox || \oy
+ vpush {q6-q7}
+.endif
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+.if \ox || \oy
+ vpop {q6-q7}
+.endif
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+ subs r7, r7, #1
+.if \oy
+ vdup.16 d14, d25[0]
+ vdup.16 d15, d25[1]
+.endif
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r10, #2
+ sub r7, r10, #2 // restore actual remaining h
+ bgt L(loop_\ox\()0)
+.endif
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+endfunc
+
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
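+// Chroma variant of the above. With csfl the (pair-averaged, for subsampled
+// layouts) luma value indexes the scaling LUT directly; otherwise the index is
+// clip(((luma*uv_luma_mult + src*uv_mult) >> 6) + uv_offset, 0, bitdepth_max).
+// The noise is then scaled and added just like in fgy above.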
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100] // data, grain_lut
+ ldrd r10, r11, [sp, #124] // uv, is_id
+ ldr r6, [sp, #136] // bitdepth_max
+
+ clz r7, r6
+ rsb r7, r7, #24 // bitdepth_min_8
+
+ // !csfl
+ add r10, r4, r10, lsl #2 // + 4*uv
+ add r12, r10, #FGD_UV_LUMA_MULT
+ add lr, r10, #FGD_UV_MULT
+ ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset
+ vld1.16 {d30[]}, [r12] // uv_luma_mult
+ lsl r10, r10, r7 // uv_offset << bitdepth_min_8
+ vld1.16 {d30[1]}, [lr] // uv_mult
+
+ ldr lr, [r4, #FGD_SCALING_SHIFT]
+ ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ eor lr, lr, #15 // 15 - scaling_shift
+
+ vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8
+
+ cmp r12, #0
+ vdup.16 q13, lr // 15 - scaling_shift
+
+ beq 1f
+ // clip
+ cmp r11, #0
+ mov r8, #16
+ mov r9, #240
+ lsl r8, r8, r7
+ lsl r9, r9, r7
+ beq 2f
+ // is_id
+ mov r9, #235
+ lsl r9, r9, r7
+ b 2f
+1:
+ // no clip
+ mov r8, #0
+ mov r9, r6 // bitdepth_max
+2:
+ vmov.16 d30[3], r6 // bitdepth_max
+ vdup.16 d31, r8 // clip_min
+
+ mov r10, #GRAIN_WIDTH*2 // grain_lut stride
+
+.if \sy
+ mov r6, #23
+ mov r7, #22
+.else
+ mov r6, #27
+ mov r7, #17
+.endif
+ vmov.16 d31[1], r9 // clip_max
+
+ ldrd r8, r9, [sp, #116] // offsets, h
+
+ add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
+ add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
+ add r5, r5, r10 // grain_lut += grain_stride
+.endif
+ vmov.16 d31[2], r6 // overlap y [0]
+
+ ldr r12, [r8, #8] // offsets[1][0]
+ calc_offset r12, r4, r12, \sx, \sy
+ add_offset r4, r12, r4, r5, r10
+
+ ldr r12, [r8, #4] // offsets[0][1]
+ calc_offset r12, lr, r12, \sx, \sy
+ add_offset lr, r12, lr, r5, r10
+
+ ldr r12, [r8, #12] // offsets[1][1]
+ calc_offset r12, r11, r12, \sx, \sy
+ add_offset r11, r12, r11, r5, r10
+
+ ldr r8, [r8] // offsets[0][0]
+ calc_offset r8, r12, r8, \sx, \sy
+ add_offset r5, r8, r12, r5, r10
+
+ vmov.16 d31[3], r7 // overlap y [1]
+
+ add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ movrel_local r12, overlap_coeffs_\sx
+ ldr lr, [sp, #132] // type
+ ldrd r6, r7, [sp, #108] // luma_row, luma_stride
+
+ vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
+
+ movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
+#if CONFIG_THUMB
+ // This uses movrel_local instead of adr above, because the target
+ // can be out of range for adr. But movrel_local leaves the thumb bit
+ // set on COFF (but probably wouldn't if building for thumb on ELF),
+ // thus try to clear the bit for robustness.
+ bic r12, r12, #1
+#endif
+
+ tst lr, #1
+ ldr lr, [r12, lr, lsl #2]
+
+ add r12, r12, lr
+
+ beq 1f
+ // y overlap
+ sub lr, r9, #(2 >> \sy) // backup remaining h
+ mov r9, #(2 >> \sy)
+
+1:
+.if \sy
+ add r7, r7, r7 // luma_stride *= 2
+.endif
+ sub r7, r7, #32 // luma_stride -= 32
+
+ bx r12
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+L(fguv_loop_sx0_tbl):
+ .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ sub r2, r2, #32 // src_stride -= 32
+ sub r10, r10, #32 // grain_stride -= 32
+.if \oy
+ mov r12, lr
+.endif
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8]! // grain_lut top
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5]! // grain_lut
+.if \oy
+ vld1.16 {q4, q5}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+ vld1.16 {q10, q11}, [r5], r10 // grain_lut
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ add r4, r4, #32
+ vmull.s16 q0, d0, d24
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ add r11, r11, #32
+ vmull.s16 q1, d2, d24
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vmull.s16 q8, d20, d29
+ vmull.s16 q9, d21, d29
+ vmull.s16 q10, d22, d29
+ vmull.s16 q11, d23, d29
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vmlal.s16 q8, d8, d28
+ vmlal.s16 q9, d9, d28
+ vmlal.s16 q10, d10, d28
+ vmlal.s16 q11, d11, d28
+ vqrshrn.s32 d0, q0, #5
+ vqrshrn.s32 d1, q1, #5
+ vqrshrn.s32 d2, q2, #5
+ vqrshrn.s32 d3, q3, #5
+ vqrshrn.s32 d4, q8, #5
+ vqrshrn.s32 d5, q9, #5
+ vqrshrn.s32 d6, q10, #5
+ vqrshrn.s32 d7, q11, #5
+ vmin.s16 q8, q0, q7
+ vmin.s16 q9, q1, q7
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q10, q2, q7
+ vmin.s16 q11, q3, q7
+ vmax.s16 q8, q8, q6
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 q10, q10, q6
+ vmax.s16 q11, q11, q6
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12
+.endif
+
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q4, q5}, [r1, :128]! // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d8, d29
+ vmlal.s16 q7, d9, d29
+ vmlal.s16 q0, d10, d29
+ vmlal.s16 q1, d11, d29
+ vld1.16 {q4, q5}, [r1, :128] // src
+ sub r1, r1, #32
+ vshrn.s32 d12, q6, #6
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vmull.s16 q0, d4, d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+ vmlal.s16 q0, d8, d29
+ vmlal.s16 q1, d9, d29
+ vmlal.s16 q2, d10, d29
+ vmlal.s16 q3, d11, d29
+ vdup.16 q14, d30[2] // uv_offset
+ vshrn.s32 d0, q0, #6
+ vshrn.s32 d1, q1, #6
+ vshrn.s32 d2, q2, #6
+ vshrn.s32 d3, q3, #6
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14
+ vadd.i16 q7, q7, q14
+ vadd.i16 q2, q0, q14
+ vadd.i16 q3, q1, q14
+ vmin.s16 q0, q6, q4
+ vmin.s16 q1, q7, q4
+ vmin.s16 q2, q2, q4
+ vmin.s16 q3, q3, q4
+ vmax.s16 q0, q0, q5
+ vmax.s16 q1, q1, q5
+ vmax.s16 q2, q2, q5
+ vmax.s16 q3, q3, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+ vand q2, q2, q14
+ vand q3, q3, q14
+.endif
+
+ bl gather32_neon
+
+ vld1.16 {q0, q1}, [r1, :128]! // src
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+ vmovl.u8 q4, d10
+ vmovl.u8 q5, d11
+
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+ vshl.u16 q4, q4, q13
+ vshl.u16 q5, q5, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+ vqrdmulh.s16 q10, q10, q4
+ vqrdmulh.s16 q11, q11, q5
+
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q0, q8 // *src + noise
+ vqadd.s16 q1, q1, q9
+ vqadd.s16 q2, q2, q10
+ vqadd.s16 q3, q3, q11
+
+.if \oy
+ vmov.32 lr, d25[0] // first two 16-bit coeffs from overlap x
+.endif
+
+ vmax.s16 q0, q0, q4
+ vmax.s16 q1, q1, q4
+ vmax.s16 q2, q2, q4
+ vmax.s16 q3, q3, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+ vmin.s16 q2, q2, q5
+ vmin.s16 q3, q3, q5
+
+ vst1.16 {q0, q1}, [r0, :128]! // dst
+
+ subs r9, r9, #1
+.if \oy
+ vmov.32 d31[1], lr // new coeffs for overlap y
+.endif
+
+ vst1.16 {q2, q3}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function fguv_loop_sx1_neon
+L(fguv_loop_sx1_tbl):
+ .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+ .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
+
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+.if \oy
+ mov r12, lr
+.endif
+1:
+.if \ox
+ vld1.16 {d0}, [r4], r10 // grain_lut old
+.endif
+.if \ox && \oy
+ vld1.16 {d2}, [r11], r10 // grain_lut top old
+.endif
+.if \oy
+ vld1.16 {q2, q3}, [r8], r10 // grain_lut top
+.endif
+.if !\ox && !\oy
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+.endif
+ vld1.16 {q8, q9}, [r5], r10 // grain_lut
+.if \oy
+ vdup.16 d28, d31[2] // overlap y coeff
+ vdup.16 d29, d31[3] // overlap y coeff
+.endif
+.if !\ox && !\oy
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.endif
+
+.if \ox
+ vdup.16 q7, d30[3] // bitdepth_max
+ vmull.s16 q0, d0, d24
+ vshr.u16 q7, q7, #1 // grain_max
+ vmlal.s16 q0, d16, d25
+ vmvn q6, q7 // grain_min
+.endif
+
+.if \oy
+.if \ox
+ vmull.s16 q1, d2, d24
+ vmlal.s16 q1, d4, d25
+ vqrshrn.s32 d16, q0, #5
+ vqrshrn.s32 d4, q1, #5
+ vmin.s16 d4, d4, d14
+ vmin.s16 d16, d16, d14
+ vmax.s16 d4, d4, d12
+ vmax.s16 d16, d16, d12
+.endif
+
+ vmull.s16 q0, d4, d28
+ vmull.s16 q1, d5, d28
+ vmull.s16 q2, d6, d28
+ vmull.s16 q3, d7, d28
+.if !\ox
+ vdup.16 q7, d30[3] // bitdepth_max
+.endif
+ vmlal.s16 q0, d16, d29
+ vmlal.s16 q1, d17, d29
+ vmlal.s16 q2, d18, d29
+ vmlal.s16 q3, d19, d29
+.if !\ox
+ vshr.u16 q7, q7, #1 // grain_max
+.endif
+ vqrshrn.s32 d16, q0, #5
+ vqrshrn.s32 d17, q1, #5
+ vqrshrn.s32 d18, q2, #5
+ vqrshrn.s32 d19, q3, #5
+.if !\ox
+ vmvn q6, q7 // grain_min
+.endif
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 q8, q8, q7
+ vmin.s16 q9, q9, q7
+ vmax.s16 q8, q8, q6
+ vmax.s16 q9, q9, q6
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+.elseif \ox
+ vqrshrn.s32 d16, q0, #5
+ vld1.16 {q0, q1}, [r6, :128]! // luma
+ vmin.s16 d16, d16, d14
+ vld1.16 {q2, q3}, [r6, :128], r7 // luma
+ vmax.s16 d16, d16, d12
+.endif
+
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vrshr.u16 q0, q0, #1
+ vrshr.u16 q1, q1, #1
+.if !\csfl
+ vdup.16 d28, d30[0] // uv_luma_mult
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+ vdup.16 d29, d30[1] // uv_mult
+ vmull.s16 q6, d0, d28
+ vmull.s16 q7, d1, d28
+ vmull.s16 q0, d2, d28
+ vmull.s16 q1, d3, d28
+ vmlal.s16 q6, d4, d29
+ vmlal.s16 q7, d5, d29
+ vmlal.s16 q0, d6, d29
+ vmlal.s16 q1, d7, d29
+ vshrn.s32 d12, q6, #6
+ vshrn.s32 d13, q7, #6
+ vshrn.s32 d14, q0, #6
+ vshrn.s32 d15, q1, #6
+ vdup.16 q14, d30[2] // uv_offset
+ vdup.16 q4, d30[3] // bitdepth_max
+ vmov.i16 q5, #0
+ vadd.i16 q6, q6, q14
+ vadd.i16 q7, q7, q14
+ vmin.s16 q0, q6, q4
+ vmin.s16 q1, q7, q4
+ vmax.s16 q0, q0, q5
+ vmax.s16 q1, q1, q5
+.else
+ vdup.16 q14, d30[3] // bitdepth_max
+ vld1.16 {q2, q3}, [r1, :128], r2 // src
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ vand q0, q0, q14
+ vand q1, q1, q14
+.endif
+
+ bl gather16_neon
+
+ vmovl.u8 q6, d8 // scaling
+ vmovl.u8 q7, d9
+
+ vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
+ vshl.u16 q7, q7, q13
+
+ vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ vqrdmulh.s16 q9, q9, q7
+
+
+ vdup.16 q4, d31[0] // clip_min
+ vdup.16 q5, d31[1] // clip_max
+
+ vqadd.s16 q0, q2, q8 // *src + noise
+ vqadd.s16 q1, q3, q9
+
+.if \oy
+ // Swap the last two coefficients of d31 and place them first in d28
+ vrev64.16 d28, d31
+.endif
+
+ vmax.s16 q0, q0, q4
+ vmax.s16 q1, q1, q4
+ vmin.s16 q0, q0, q5
+ vmin.s16 q1, q1, q5
+
+ subs r9, r9, #1
+.if \oy
+ // Take the first two 16-bit coefficients of d28 and place them at the
+ // end of d31
+ vtrn.32 d31, d28
+.endif
+
+ vst1.16 {q0, q1}, [r0, :128], r2 // dst
+ bgt 1b
+
+.if \oy
+ cmp r12, #0
+ mov r9, r12 // restore actual remaining h
+ bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/ipred.S b/third_party/dav1d/src/arm/32/ipred.S
new file mode 100644
index 0000000000..ff55d95d4a
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/ipred.S
@@ -0,0 +1,2937 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * Copyright © 2019, B Krishnan Iyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
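+// The functions below dispatch on block width through a jump table indexed by
+// clz(width) - 25, so width 64 hits the first entry and width 4 the last. The
+// dc_left/dc variants share one table between the height (sum) and width
+// (store) paths, hence the extra offset of 5 for the width lookup.
+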
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1
+ push {r4, lr}
+ ldr r4, [sp, #8]
+ clz r3, r3
+ adr r2, L(ipred_dc_128_tbl)
+ sub r3, r3, #25
+ ldr r3, [r2, r3, lsl #2]
+ vmov.i8 q0, #128
+ add r2, r2, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r2
+
+ .align 2
+L(ipred_dc_128_tbl):
+ .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vmov.i8 q1, #128
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vmov.i8 q1, #128
+ sub r1, r1, #32
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1
+ push {r4, lr}
+ ldr lr, [sp, #8]
+ clz r3, r3
+ adr r4, L(ipred_v_tbl)
+ sub r3, r3, #25
+ ldr r3, [r4, r3, lsl #2]
+ add r2, r2, #1
+ add r4, r4, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r4
+
+ .align 2
+L(ipred_v_tbl):
+ .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_v_tbl) + CONFIG_THUMB
+40:
+ vld1.32 {d0[]}, [r2]
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs lr, lr, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4, pc}
+80:
+ vld1.8 {d0}, [r2]
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs lr, lr, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4, pc}
+160:
+ vld1.8 {q0}, [r2]
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vld1.8 {q0, q1}, [r2]
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vld1.8 {q0, q1}, [r2]!
+ sub r1, r1, #32
+ vld1.8 {q2, q3}, [r2]
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
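+// The left pixels are read with a negative stride and each iteration below
+// broadcasts four of them across four output rows at once.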
+function ipred_h_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_h_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ sub r2, r2, #4
+ mov lr, #-4
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_h_tbl):
+ .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_h_tbl) + CONFIG_THUMB
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
+ vst1.32 {d3[0]}, [r0, :32], r1
+ vst1.32 {d2[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+8:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
+ vst1.8 {d3}, [r0, :64], r1
+ vst1.8 {d2}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d1}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ add r2, r2, #3
+ mov lr, #-1
+16:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128], r1
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ add r2, r2, #3
+ mov lr, #-1
+ sub r1, r1, #16
+32:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ add r2, r2, #3
+ mov lr, #-1
+ sub r1, r1, #48
+64:
+ vld1.8 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.8 {d2[], d3[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vld1.8 {d4[], d5[]}, [r2], lr
+ vst1.8 {q1}, [r12, :128]!
+ vld1.8 {d6[], d7[]}, [r2], lr
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128]!
+ vst1.8 {q1}, [r12, :128]!
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r12, :128], r1
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128]!
+ vst1.8 {q3}, [r12, :128]!
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_dc_top_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ add r2, r2, #1
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_top_tbl):
+ .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+40:
+ vld1.32 {d0[]}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 d0, d0[0]
+4:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ vld1.8 {d0}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #3
+ vdup.8 d0, d0[0]
+8:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ vld1.8 {d0, d1}, [r2]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+16:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ vld1.8 {d0, d1, d2, d3}, [r2]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d4, q0, #5
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+32:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ vld1.8 {d0, d1, d2, d3}, [r2]!
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2]
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d18, q0, #6
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+ sub r1, r1, #32
+64:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ sub r2, r2, r4
+ clz r3, r3
+ clz lr, r4
+ sub lr, lr, #25
+ adr r5, L(ipred_dc_left_tbl)
+ sub r3, r3, #20
+ ldr r3, [r5, r3, lsl #2]
+ ldr lr, [r5, lr, lsl #2]
+ add r3, r5, r3
+ add r5, r5, lr
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_left_tbl):
+ .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4):
+ vld1.32 {d0[]}, [r2, :32]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w4):
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt L(ipred_dc_left_w4)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h8):
+ vld1.8 {d0}, [r2, :64]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #3
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w8):
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt L(ipred_dc_left_w8)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h16):
+ vld1.8 {d0, d1}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w16):
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt L(ipred_dc_left_w16)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h32):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #5
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w32):
+ vmov.8 q1, q0
+1:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h64):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshrn.u16 d0, q0, #6
+ vdup.8 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w64):
+ vmov.8 q1, q0
+ sub r1, r1, #32
+1:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
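+// For non-square blocks the sum is shifted right by ctz(width + height) and
+// the remaining factor (1/3 or 1/5) is applied as a fixed-point multiply:
+// 0x5556/65536 ~ 1/3 and 0x3334/65536 ~ 1/5, via vqdmulh with halved constants.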
+function ipred_dc_8bpc_neon, export=1
+ push {r4-r6, lr}
+ ldr r4, [sp, #16]
+ sub r2, r2, r4
+ add lr, r3, r4 // width + height
+ clz r3, r3
+ clz r12, r4
+ vdup.16 q15, lr // width + height
+ adr r5, L(ipred_dc_tbl)
+ rbit lr, lr // rbit(width + height)
+ sub r3, r3, #20 // 25 leading bits, minus table offset 5
+ sub r12, r12, #25
+ clz lr, lr // ctz(width + height)
+ ldr r3, [r5, r3, lsl #2]
+ ldr r12, [r5, r12, lsl #2]
+ neg lr, lr // -ctz(width + height)
+ add r3, r5, r3
+ add r5, r5, r12
+ vshr.u16 q15, q15, #1 // (width + height) >> 1
+ vdup.16 q14, lr // -ctz(width + height)
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_tbl):
+ .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4):
+ vld1.32 {d0[]}, [r2, :32]!
+ vpaddl.u8 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w4):
+ vld1.32 {d1[]}, [r2]
+ vadd.s16 d0, d0, d30
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1
+ cmp r4, #4
+ vadd.s16 d0, d0, d1
+ vshl.u16 d0, d0, d28
+ beq 1f
+ // h = 8/16
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ cmp r4, #16
+ it ne
+ movne lr, r5
+ vdup.16 d30, lr
+ vqdmulh.s16 d0, d0, d30
+1:
+ vdup.8 d0, d0[0]
+2:
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[0]}, [r12, :32], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h8):
+ vld1.8 {d0}, [r2, :64]!
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w8):
+ vld1.8 {d2}, [r2]
+ vadd.s16 d0, d0, d30
+ vpaddl.u8 d2, d2
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #8
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 d24, lr
+ vqdmulh.s16 d0, d0, d24
+1:
+ vdup.8 d0, d0[0]
+2:
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d0}, [r12, :64], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h16):
+ vld1.8 {d0, d1}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w16):
+ vld1.8 {d2, d3}, [r2]
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #16
+ vadd.s16 d0, d0, d2
+ vshl.u16 d0, d0, d28
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 d24, lr
+ vqdmulh.s16 d0, d0, d24
+1:
+ vdup.8 q0, d0[0]
+2:
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1}, [r0, :128], r1
+ vst1.8 {d0, d1}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h32):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w32):
+ vld1.8 {d2, d3, d4, d5}, [r2]
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vadd.u16 q1, q1, q2
+ vadd.u16 d2, d2, d3
+ vpadd.u16 d2, d2
+ vpadd.u16 d2, d2
+ cmp r4, #32
+ vadd.s16 d0, d0, d2
+ vshl.u16 d4, d0, d28
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #(0x3334/2)
+ movw r5, #(0x5556/2)
+ it ne
+ movne lr, r5
+ vdup.16 d24, lr
+ vqdmulh.s16 d4, d4, d24
+1:
+ vdup.8 q0, d4[0]
+ vdup.8 q1, d4[0]
+2:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h64):
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]!
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q1, q2, q3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ add r2, r2, #1
+ vpadd.u16 d0, d0
+ bx r3
+L(ipred_dc_w64):
+ vld1.8 {d2, d3, d4, d5}, [r2]!
+ vadd.s16 d0, d0, d30
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q1, d2, d3
+ vadd.u16 d4, d4, d5
+ vadd.u16 d2, d2, d3
+ vld1.8 {d16, d17, d18, d19}, [r2]
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vpadd.u16 d4, d4
+ vpadd.u16 d2, d2
+ vaddl.u8 q8, d16, d17
+ vaddl.u8 q9, d18, d19
+ vadd.u16 d16, d16, d17
+ vadd.u16 d18, d18, d19
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vpadd.u16 d16, d16
+ vpadd.u16 d18, d18
+ vadd.u16 d2, d2, d4
+ vadd.u16 d3, d16, d18
+ cmp r4, #64
+ vadd.s16 d0, d0, d2
+ vadd.s16 d0, d0, d3
+ vshl.u16 d18, d0, d28
+ beq 1f
+ // h = 16/32
+ movw lr, #(0x5556/2)
+ movt lr, #(0x3334/2)
+ and r5, r4, #31
+ lsr lr, lr, r5
+ vdup.16 d30, lr
+ vqdmulh.s16 d18, d18, d30
+1:
+ sub r1, r1, #32
+ vdup.8 q0, d18[0]
+ vdup.8 q1, d18[0]
+2:
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.8 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
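+// Paeth: base = left + top - topleft; predict whichever of left, top and
+// topleft is closest to base (left wins ties, then top over topleft).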
+function ipred_paeth_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ clz lr, r3
+ adr r5, L(ipred_paeth_tbl)
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[], d5[]}, [r2]
+ add r8, r2, #1
+ sub r2, r2, #4
+ add r5, r5, lr
+ mov r7, #-4
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_paeth_tbl):
+ .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[], d7[]}, [r8]
+ vsubl.u8 q8, d6, d4 // top - topleft
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+ vzip.32 d0, d1
+ vzip.32 d2, d3
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d2
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vmov d1, d2
+ vabd.u8 q10, q3, q9 // tdiff
+ vabd.u8 q11, q2, q9 // tldiff
+ vabd.u8 q9, q0, q9 // ldiff
+ vmin.u8 q12, q10, q11 // min(tdiff, tldiff)
+ vcge.u8 q10, q11, q10 // tldiff >= tdiff
+ vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff
+ vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbit q10, q0, q9 // ldiff <= min ? left : ...
+ vst1.32 {d21[1]}, [r0, :32], r1
+ vst1.32 {d21[0]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d20[1]}, [r0, :32], r1
+ vst1.32 {d20[0]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8]
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6
+8:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ vst1.8 {d25}, [r0, :64], r1
+ vst1.8 {d24}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d23}, [r0, :64], r1
+ vst1.8 {d22}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vld1.8 {d6}, [r8]!
+ mov r12, r3
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+1:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7
+2:
+ vsubl.u8 q8, d6, d4 // top - topleft
+ vmov d7, d6
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vqmovun.s16 d18, q9 // base
+ vqmovun.s16 d19, q10
+ vqmovun.s16 d20, q11
+ vqmovun.s16 d21, q12
+ vabd.u8 q11, q3, q9 // tdiff
+ vabd.u8 q12, q3, q10
+ vabd.u8 q13, q2, q9 // tldiff
+ vabd.u8 q14, q2, q10
+ vabd.u8 q10, q1, q10 // ldiff
+ vabd.u8 q9, q0, q9
+ vmin.u8 q15, q12, q14 // min(tdiff, tldiff)
+ vcge.u8 q12, q14, q12 // tldiff >= tdiff
+ vmin.u8 q14, q11, q13 // min(tdiff, tldiff)
+ vcge.u8 q11, q13, q11 // tldiff >= tdiff
+ vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff
+ vcge.u8 q9, q14, q9
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ subs r3, r3, #8
+ vst1.8 {d25}, [r0, :64]!
+ vst1.8 {d24}, [r6, :64]!
+ vst1.8 {d23}, [r5, :64]!
+ vst1.8 {d22}, [lr, :64]!
+ ble 8f
+ vld1.8 {d6}, [r8]!
+ b 2b
+8:
+ subs r4, r4, #4
+ ble 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ vld1.8 {d6}, [r8]!
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
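+// Smooth: blends top against the bottom-left pixel and left against the
+// top-right pixel, roughly
+// ((right*256 + (left-right)*w_hor) + (bottom*256 + (top-bottom)*w_ver) + 256) >> 9
+// with w_hor/w_ver taken from sm_weights for the block width/height.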
+function ipred_smooth_8bpc_neon, export=1
+ push {r4-r10, lr}
+ ldr r4, [sp, #32]
+ movrel r10, X(sm_weights)
+ add r12, r10, r4
+ add r10, r10, r3
+ clz r9, r3
+ adr r5, L(ipred_smooth_tbl)
+ sub lr, r2, r4
+ sub r9, r9, #25
+ ldr r9, [r5, r9, lsl #2]
+ vld1.8 {d4[]}, [lr] // bottom
+ add r8, r2, #1
+ add r5, r5, r9
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_tbl):
+ .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d16[]}, [r8] // top
+ vld1.32 {d18[]}, [r10, :32] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vdup.8 q3, d16[3] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vzip.32 d1, d0 // left, flipped
+ vzip.32 d3, d2
+ vzip.32 d20, d21 // weights_ver
+ vzip.32 d22, d23
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q0, d1, d6 // left-right
+ vsubl.u8 q1, d3, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q0, q9 // (left flipped)
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vst1.32 {d24[0]}, [r0, :32], r1
+ vst1.32 {d24[1]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d25[0]}, [r0, :32], r1
+ vst1.32 {d25[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r10, pc}
+80:
+ vld1.8 {d16}, [r8] // top
+ vld1.8 {d18}, [r10, :64] // weights_hor
+ sub r2, r2, #2
+ mov r7, #-2
+ vdup.8 q3, d16[7] // right
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+8:
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ subs r4, r4, #2
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r10, pc}
+160:
+320:
+640:
+ add lr, r2, r3
+ sub r2, r2, #2
+ mov r7, #-2
+ vld1.8 {d6[], d7[]}, [lr] // right
+ sub r1, r1, r3
+ mov r9, r3
+
+1:
+ vld2.8 {d0[], d1[]}, [r2, :16], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vsubl.u8 q1, d0, d6 // left-right (left flipped)
+ vsubl.u8 q0, d1, d6
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+2:
+ vld1.8 {d16}, [r8]! // top
+ vld1.8 {d18}, [r10, :64]! // weights_hor
+ vshll.i8 q12, d6, #8 // right*256
+ vshll.i8 q13, d6, #8
+ vmovl.u8 q9, d18 // weights_hor
+ vshll.i8 q14, d4, #8 // bottom*256
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q8, d16, d4 // top-bottom
+ vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q1, q9
+ vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q15, q8, q11
+ vhadd.u16 q12, q12, q14
+ vhadd.u16 q13, q13, q15
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ subs r3, r3, #8
+ vst1.8 {d24}, [r0, :64]!
+ vst1.8 {d25}, [r6, :64]!
+ bgt 2b
+ subs r4, r4, #2
+ ble 9f
+ sub r8, r8, r9
+ sub r10, r10, r9
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, r9
+ b 1b
+9:
+ pop {r4-r10, pc}
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
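+// Vertical-only smooth: pred = (bottom*256 + (top - bottom)*w_ver + 128) >> 8,
+// where bottom is the pixel below the left edge and w_ver comes from
+// sm_weights for the block height.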
+function ipred_smooth_v_8bpc_neon, export=1
+ push {r4-r7, lr}
+ ldr r4, [sp, #20]
+ movrel r7, X(sm_weights)
+ add r7, r7, r4
+ clz lr, r3
+ adr r5, L(ipred_smooth_v_tbl)
+ sub r12, r2, r4
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[]}, [r12] // bottom
+ add r2, r2, #1
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_v_tbl):
+ .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+4:
+ vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+ vshll.i8 q10, d4, #8 // bottom*256
+ vshll.i8 q11, d4, #8
+ vzip.32 d16, d17 // weights_ver
+ vzip.32 d18, d19
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ subs r4, r4, #4
+ vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q11, q3, q9
+ vrshrn.i16 d20, q10, #8
+ vrshrn.i16 d21, q11, #8
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r6, :32], r1
+ vst1.32 {d21[0]}, [r0, :32], r1
+ vst1.32 {d21[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r7, pc}
+80:
+ vld1.8 {d6}, [r2] // top
+ vsubl.u8 q3, d6, d4 // top-bottom
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+ vshll.i8 q12, d4, #8 // bottom*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmovl.u8 q8, d16 // weights_ver
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q3, q9
+ vmla.i16 q14, q3, q10
+ vmla.i16 q15, q3, q11
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r7, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+ vmovl.u8 q4, d8 // weights_ver
+ vmovl.u8 q5, d10
+ vmovl.u8 q6, d12
+ vmovl.u8 q7, d14
+2:
+ vld1.8 {q3}, [r2]! // top
+ vshll.i8 q8, d4, #8 // bottom*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vsubl.u8 q0, d6, d4 // top-bottom
+ vsubl.u8 q1, d7, d4
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q9, q1, q4
+ vmla.i16 q10, q0, q5
+ vmla.i16 q11, q1, q5
+ vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver
+ vmla.i16 q13, q1, q6
+ vmla.i16 q14, q0, q7
+ vmla.i16 q15, q1, q7
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r2, r2, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+endfunc
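+// Per pixel this reduces to
+//   pred = (bottom*256 + (top-bottom)*weights_ver + 128) >> 8
+// using only the vertical weights (sm_weights indexed by height).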
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ movrel r8, X(sm_weights)
+ add r8, r8, r3
+ clz lr, r3
+ adr r5, L(ipred_smooth_h_tbl)
+ add r12, r2, r3
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.8 {d4[]}, [r12] // right
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_h_tbl):
+ .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r8, :32] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vmovl.u8 q3, d6 // weights_hor
+4:
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vzip.32 d3, d2 // left, flipped
+ vzip.32 d1, d0
+ vsubl.u8 q1, d3, d4 // left-right
+ vsubl.u8 q0, d1, d4
+ subs r4, r4, #4
+ vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q0, q3
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8, :64] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vmovl.u8 q3, d6 // weights_hor
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left
+ vshll.i8 q12, d4, #8 // right*256
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vsubl.u8 q11, d22, d4 // left-right
+ vsubl.u8 q10, d20, d4
+ vsubl.u8 q9, d18, d4
+ vsubl.u8 q8, d16, d4
+ vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor
+ vmla.i16 q13, q10, q3 // (left flipped)
+ vmla.i16 q14, q9, q3
+ vmla.i16 q15, q8, q3
+ vrshrn.i16 d24, q12, #8
+ vrshrn.i16 d25, q13, #8
+ vrshrn.i16 d26, q14, #8
+ vrshrn.i16 d27, q15, #8
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d26}, [r0, :64], r1
+ vst1.8 {d27}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ sub r2, r2, #4
+ mov r7, #-4
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left
+ vsubl.u8 q4, d8, d4 // left-right
+ vsubl.u8 q5, d10, d4
+ vsubl.u8 q6, d12, d4
+ vsubl.u8 q7, d14, d4
+2:
+ vld1.8 {q1}, [r8, :128]! // weights_hor
+ vshll.i8 q8, d4, #8 // right*256
+ vshll.i8 q9, d4, #8
+ vshll.i8 q10, d4, #8
+ vshll.i8 q11, d4, #8
+ vmovl.u8 q0, d2 // weights_hor
+ vmovl.u8 q1, d3
+ vshll.i8 q12, d4, #8
+ vshll.i8 q13, d4, #8
+ vshll.i8 q14, d4, #8
+ vshll.i8 q15, d4, #8
+ vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor
+ vmla.i16 q9, q7, q1 // (left flipped)
+ vmla.i16 q10, q6, q0
+ vmla.i16 q11, q6, q1
+ vmla.i16 q12, q5, q0
+ vmla.i16 q13, q5, q1
+ vmla.i16 q14, q4, q0
+ vmla.i16 q15, q4, q1
+ vrshrn.i16 d16, q8, #8
+ vrshrn.i16 d17, q9, #8
+ vrshrn.i16 d18, q10, #8
+ vrshrn.i16 d19, q11, #8
+ vrshrn.i16 d20, q12, #8
+ vrshrn.i16 d21, q13, #8
+ vrshrn.i16 d22, q14, #8
+ vrshrn.i16 d23, q15, #8
+ subs r3, r3, #16
+ vst1.8 {q8}, [r0, :128]!
+ vst1.8 {q9}, [r6, :128]!
+ vst1.8 {q10}, [r5, :128]!
+ vst1.8 {q11}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
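+// The horizontal counterpart:
+//   pred = (right*256 + (left-right)*weights_hor + 128) >> 8
+// using only the horizontal weights (sm_weights indexed by width).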
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height);
+function ipred_filter_8bpc_neon, export=1
+ push {r4-r8, lr}
+ movw r12, #511
+ ldrd r4, r5, [sp, #24]
+ and r5, r5, r12 // 511
+ movrel r6, X(filter_intra_taps)
+ lsl r5, r5, #6
+ add r6, r6, r5
+ vld1.8 {d20, d21, d22, d23}, [r6, :128]!
+ clz lr, r3
+ adr r5, L(ipred_filter_tbl)
+ vld1.8 {d27, d28, d29}, [r6, :64]
+ sub lr, lr, #26
+ ldr lr, [r5, lr, lsl #2]
+ vmovl.s8 q8, d20
+ vmovl.s8 q9, d21
+ add r5, r5, lr
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmovl.s8 q12, d27
+ vmovl.s8 q13, d28
+ vmovl.s8 q14, d29
+ add r8, r2, #1
+ sub r2, r2, #2
+ mov r7, #-2
+ bx r5
+
+ .align 2
+L(ipred_filter_tbl):
+ .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d0[]}, [r8] // top (0-3)
+ vmovl.u8 q0, d0 // top (0-3)
+4:
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d4, q2, #4
+ subs r4, r4, #2
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vmovl.u8 q0, d4
+ vst1.32 {d4[1]}, [r6, :32], r1
+ vmov d0, d1 // move top from [4-7] to [0-3]
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d0}, [r8] // top (0-7)
+ vmovl.u8 q0, d0 // top (0-7)
+8:
+ vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmovl.u8 q1, d2 // left (0-1) + topleft (2)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d4, q2, #4
+ vmovl.u8 q1, d4 // first block, in 16 bit
+ vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+ vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6)
+ vqrshrun.s16 d5, q3, #4
+ vzip.32 d4, d5
+ subs r4, r4, #2
+ vst1.8 {d4}, [r0, :64], r1
+ vmovl.u8 q0, d5
+ vst1.8 {d5}, [r6, :64], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+ vpush {q4-q5}
+ sub r1, r1, r3
+ mov lr, r3
+
+1:
+ vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2)
+ vmovl.u8 q0, d0 // left (0-1) + topleft (2)
+2:
+ vld1.8 {q2}, [r8]! // top(0-15)
+ vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+ vmovl.u8 q1, d4 // top(0-7)
+ vmovl.u8 q2, d5 // top(8-15)
+ vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+ vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+ vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+ vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+ vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+ vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d6, q3, #4
+ vmovl.u8 q0, d6 // first block, in 16 bit
+ vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+ vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+ vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+ vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+ vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d7, q4, #4
+ vmovl.u8 q0, d7 // second block, in 16 bit
+ vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+ vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+ vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1)
+ vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2)
+ vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3)
+ vqrshrun.s16 d8, q5, #4
+ vmovl.u8 q0, d8 // third block, in 16 bit
+ vmov.u8 r12, d5[6]
+ vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4)
+ vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0)
+ vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6)
+ vmov.8 d0[4], r12
+
+ subs r3, r3, #16
+ vqrshrun.s16 d9, q15, #4
+
+ vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]!
+ vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]!
+ ble 8f
+ vmov.u8 r12, d9[7]
+ vmov.8 d0[0], r12
+ vmov.u8 r12, d9[3]
+ vmov.8 d0[2], r12
+ b 2b
+8:
+ subs r4, r4, #2
+
+ ble 9f
+ sub r8, r6, lr
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ b 1b
+9:
+ vpop {q4-q5}
+ pop {r4-r8, pc}
+endfunc
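+// Each 4x2 output block above follows the filter intra formula
+//   out = iclip_pixel((filter(0)*topleft + filter(1..4)*top[0..3]
+//                      + filter(5)*left[0] + filter(6)*left[1] + 8) >> 4)
+// where vqrshrun.s16 #4 performs the rounded shift and the clip to [0, 255].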
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint16_t *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_8bpc_neon, export=1
+ push {r4-r5, lr}
+ ldrd r4, r5, [sp, #12]
+ vld1.16 {q0}, [r2, :128]
+ clz lr, r4
+ adr r12, L(pal_pred_tbl)
+ sub lr, lr, #25
+ ldr lr, [r12, lr, lsl #2]
+ vmovn.i16 d0, q0
+ add r12, r12, lr
+ add r2, r0, r1
+ bx r12
+
+ .align 2
+L(pal_pred_tbl):
+ .word 640f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 320f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 160f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 80f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 40f - L(pal_pred_tbl) + CONFIG_THUMB
+
+40:
+ lsl r1, r1, #1
+4:
+ vld1.8 {q1}, [r3, :128]!
+ subs r5, r5, #4
+ vtbl.8 d2, {d0}, d2
+ vtbl.8 d3, {d0}, d3
+ vst1.32 {d2[0]}, [r0, :32], r1
+ vst1.32 {d2[1]}, [r2, :32], r1
+ vst1.32 {d3[0]}, [r0, :32], r1
+ vst1.32 {d3[1]}, [r2, :32], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ lsl r1, r1, #1
+8:
+ vld1.8 {q1, q2}, [r3, :128]!
+ subs r5, r5, #4
+ vtbl.8 d2, {d0}, d2
+ vtbl.8 d3, {d0}, d3
+ vst1.8 {d2}, [r0, :64], r1
+ vtbl.8 d4, {d0}, d4
+ vst1.8 {d3}, [r2, :64], r1
+ vtbl.8 d5, {d0}, d5
+ vst1.8 {d4}, [r0, :64], r1
+ vst1.8 {d5}, [r2, :64], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ lsl r1, r1, #1
+16:
+ vld1.8 {q8, q9}, [r3, :128]!
+ subs r5, r5, #4
+ vld1.8 {q10, q11}, [r3, :128]!
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vst1.8 {q9}, [r2, :128], r1
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10}, [r0, :128], r1
+ vst1.8 {q11}, [r2, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ lsl r1, r1, #1
+32:
+ vld1.8 {q8, q9}, [r3, :128]!
+ subs r5, r5, #2
+ vld1.8 {q10, q11}, [r3, :128]!
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128], r1
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r2, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #32
+64:
+ vld1.8 {q8, q9}, [r3, :128]!
+ subs r5, r5, #1
+ vld1.8 {q10, q11}, [r3, :128]!
+ vtbl.8 d16, {d0}, d16
+ vtbl.8 d17, {d0}, d17
+ vtbl.8 d18, {d0}, d18
+ vtbl.8 d19, {d0}, d19
+ vtbl.8 d20, {d0}, d20
+ vtbl.8 d21, {d0}, d21
+ vst1.8 {q8, q9}, [r0, :128]!
+ vtbl.8 d22, {d0}, d22
+ vtbl.8 d23, {d0}, d23
+ vst1.8 {q10, q11}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_cfl_128_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vmov.i16 q0, #128 // dc
+ vdup.i16 q1, r6 // alpha
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_splat_w4):
+ vld1.16 {q2, q3}, [r5, :128]!
+ vmul.i16 q2, q2, q1 // diff = ac * alpha
+ vmul.i16 q3, q3, q1
+ vshr.s16 q8, q2, #15 // sign = diff >> 15
+ vshr.s16 q9, q3, #15
+ vadd.i16 q2, q2, q8 // diff + sign
+ vadd.i16 q3, q3, q9
+ vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q3, q3, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d5, q3
+ vst1.32 {d4[0]}, [r0, :32], r1
+ vst1.32 {d4[1]}, [r6, :32], r1
+ subs r4, r4, #4
+ vst1.32 {d5[0]}, [r0, :32], r1
+ vst1.32 {d5[1]}, [r6, :32], r1
+ bgt L(ipred_cfl_splat_w4)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w8):
+ vld1.16 {q8, q9}, [r5, :128]!
+ vld1.16 {q10, q11}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vst1.8 {d16}, [r0, :64], r1
+ vst1.8 {d17}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.8 {d18}, [r0, :64], r1
+ vst1.8 {d19}, [r6, :64], r1
+ bgt L(ipred_cfl_splat_w8)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w16):
+ add r12, r5, r3, lsl #1
+ sub r1, r1, r3
+ mov lr, r3
+1:
+ vld1.16 {q8, q9}, [r5, :128]!
+ vmul.i16 q8, q8, q1 // diff = ac * alpha
+ vld1.16 {q10, q11}, [r12, :128]!
+ vmul.i16 q9, q9, q1
+ vmul.i16 q10, q10, q1
+ vmul.i16 q11, q11, q1
+ vshr.s16 q12, q8, #15 // sign = diff >> 15
+ vshr.s16 q13, q9, #15
+ vshr.s16 q14, q10, #15
+ vshr.s16 q15, q11, #15
+ vadd.i16 q8, q8, q12 // diff + sign
+ vadd.i16 q9, q9, q13
+ vadd.i16 q10, q10, q14
+ vadd.i16 q11, q11, q15
+ vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshr.s16 q9, q9, #6
+ vrshr.s16 q10, q10, #6
+ vrshr.s16 q11, q11, #6
+ vadd.i16 q8, q8, q0 // dc + apply_sign()
+ vadd.i16 q9, q9, q0
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q0
+ vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign())
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ subs r3, r3, #16
+ vst1.16 {q8}, [r0, :128]!
+ vst1.16 {q9}, [r6, :128]!
+ bgt 1b
+ subs r4, r4, #2
+ add r5, r5, lr, lsl #1
+ add r12, r12, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ bgt 1b
+ pop {r4-r8, pc}
+endfunc
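+// The cfl splat loops effectively compute, per pixel,
+//   dst = iclip_pixel(dc + apply_sign((abs(ac*alpha) + 32) >> 6, ac*alpha))
+// Adding the sign bit (diff >> 15) before the signed rounding shift yields
+// the same result as taking abs() first and reapplying the sign afterwards.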
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_cfl_top_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r2, r2, #1
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_cfl_top_tbl):
+ .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4:
+ vld1.32 {d0[]}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ vld1.8 {d0}, [r2]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ vld1.8 {q0}, [r2]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ vld1.8 {q2, q3}, [r2]
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #5
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ sub r2, r2, r4
+ clz lr, r3
+ clz r8, r4
+ adr r12, L(ipred_cfl_splat_tbl)
+ adr r7, L(ipred_cfl_left_tbl)
+ sub lr, lr, #26
+ sub r8, r8, #26
+ ldr lr, [r12, lr, lsl #2]
+ ldr r8, [r7, r8, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r7, r7, r8
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r7
+
+ .align 2
+L(ipred_cfl_left_tbl):
+ .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+ vld1.32 {d0[]}, [r2, :32]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h8):
+ vld1.8 {d0}, [r2, :64]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h16):
+ vld1.8 {q0}, [r2, :128]
+ vaddl.u8 q0, d0, d1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h32):
+ vld1.8 {q2, q3}, [r2, :128]
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q2, q3
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0
+ vpadd.u16 d0, d0
+ vrshr.u16 d0, d0, #5
+ vdup.16 q0, d0[0]
+ bx r12
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ sub r2, r2, r4
+ add r8, r3, r4 // width + height
+ vdup.16 q1, r6 // alpha
+ clz lr, r3
+ clz r6, r4
+ vdup.16 d16, r8 // width + height
+ adr r7, L(ipred_cfl_tbl)
+ rbit r8, r8 // rbit(width + height)
+ sub lr, lr, #22 // 26 leading bits, minus table offset 4
+ sub r6, r6, #26
+ clz r8, r8 // ctz(width + height)
+ ldr lr, [r7, lr, lsl #2]
+ ldr r6, [r7, r6, lsl #2]
+ neg r8, r8 // -ctz(width + height)
+ add r12, r7, lr
+ add r7, r7, r6
+ vshr.u16 d16, d16, #1 // (width + height) >> 1
+ vdup.16 d17, r8 // -ctz(width + height)
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r7
+
+ .align 2
+L(ipred_cfl_tbl):
+ .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_h4):
+ vld1.32 {d0[]}, [r2, :32]!
+ vpaddl.u8 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w4):
+ vld1.32 {d1[]}, [r2]
+ vadd.i16 d0, d0, d16
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1
+ cmp r4, #4
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 8/16
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ cmp r4, #16
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ vld1.8 {d0}, [r2, :64]!
+ vpaddl.u8 d0, d0
+ vpadd.i16 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w8):
+ vld1.8 {d1}, [r2]
+ vadd.i16 d0, d0, d16
+ vpaddl.u8 d1, d1
+ vpadd.i16 d1, d1
+ vpadd.i16 d1, d1
+ cmp r4, #8
+ vadd.i16 d0, d0, d1
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ vld1.8 {q0}, [r2, :128]!
+ vaddl.u8 q0, d0, d1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w16):
+ vld1.8 {q2}, [r2]
+ vadd.i16 d0, d0, d16
+ vaddl.u8 q2, d4, d5
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4
+ cmp r4, #16
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ vld1.8 {q2, q3}, [r2, :128]!
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0
+ add r2, r2, #1
+ vpadd.i16 d0, d0
+ bx r12
+L(ipred_cfl_w32):
+ vld1.8 {q2, q3}, [r2]
+ vadd.i16 d0, d0, d16
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.i16 q2, q2, q3
+ vadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4
+ vpadd.i16 d4, d4
+ cmp r4, #32
+ vadd.i16 d0, d0, d4
+ vshl.u16 d0, d0, d17
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #(0x3334/2)
+ movw r8, #(0x5556/2)
+ it ne
+ movne lr, r8
+ vdup.16 d18, lr
+ vqdmulh.s16 d0, d0, d18
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
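+// dc above is approximately
+//   (sum_top + sum_left + ((width + height) >> 1)) / (width + height)
+// The power-of-two part of the divisor is handled by the vshl with
+// -ctz(width+height); when width != height, the remaining factor of 3 or 5
+// is approximated by vqdmulh with 0x5556/2 (~1/3) or 0x3334/2 (~1/5).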
+
+// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_420_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_tbl):
+ .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d2}, [r12, :64], r2
+ vld1.8 {d1}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #2
+ vst1.16 {q0}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d1
+ vmov d2, d1
+ vmov d3, d1
+L(ipred_cfl_ac_420_w4_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q8, q8, q1
+ bgt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ vadd.i16 q0, q8, q9
+ vadd.i16 q1, q10, q11
+ vpaddl.u16 q0, q0
+ vpaddl.u16 q1, q1
+ vadd.i32 q0, q1
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+L(ipred_cfl_ac_420_w4_subtract_dc):
+6: // Subtract dc from ac
+ vld1.16 {q0, q1}, [r0, :128]
+ subs r6, r6, #4
+ vsub.i16 q0, q0, q8
+ vsub.i16 q1, q1, q8
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 6b
+ pop {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q2, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d1}, [r1, :64], r2
+ vld1.16 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vadd.i16 q0, q0, q1
+ vshl.i16 q0, q0, #1
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+
+L(ipred_cfl_ac_420_w8_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ adr r7, L(ipred_cfl_ac_420_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_w16_tbl):
+ .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q12, q13}, [r1, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q12, q12
+ vpaddl.u8 q13, q13
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ vshl.i16 q2, q12, #1
+ vshl.i16 q3, q13, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16]
+ vld1.8 {q0}, [r1, :128], r2
+ vldr d6, [r12, #16]
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vldr d26, [r1, #16]
+ vpaddl.u8 q0, q0
+ vld1.8 {q12}, [r1, :128], r2
+ vpaddl.u8 d6, d6
+ vldr d30, [r12, #16]
+ vpaddl.u8 q2, q2
+ vld1.8 {q14}, [r12, :128], r2
+ vpaddl.u8 d26, d26
+ vpaddl.u8 q12, q12
+ vpaddl.u8 d30, d30
+ vpaddl.u8 q14, q14
+ vadd.i16 d2, d2, d6
+ vadd.i16 q0, q0, q2
+ vadd.i16 d26, d26, d30
+ vadd.i16 q12, q12, q14
+ vshl.i16 d2, d2, #1
+ vshl.i16 q0, q0, #1
+ vshl.i16 d6, d26, #1
+ vshl.i16 q2, q12, #1
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vshl.i16 q0, q0, #1
+ vshl.i16 q2, q2, #1
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d4}, [r1, :64], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {d5}, [r12, :64], r2
+ vpaddl.u8 q2, q2
+ vadd.i16 d0, d0, d1
+ vadd.i16 d4, d4, d5
+ vshl.i16 d0, d0, #1
+ vshl.i16 d4, d4, #1
+ vdup.16 q1, d0[3]
+ vdup.16 q3, d4[3]
+ vdup.16 d1, d0[3]
+ vdup.16 d5, d4[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
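+// For 4:2:0 each ac sample is the 2x2 luma sum << 1, i.e. the luma average
+// scaled by 8 (3 fractional bits). The block mean is then formed as
+//   (sum + (1 << (log2sz - 1))) >> log2sz
+// via vrshl with -log2sz and subtracted from every sample, so the stored
+// ac plane is zero-mean.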
+
+// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_422_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_tbl):
+ .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4):
+1: // Copy and subsample input
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q1}, [r12, :128], r2
+ vld1.8 {q2}, [r1, :128], r2
+ vpaddl.u8 q0, q0
+ vld1.8 {q3}, [r12, :128], r2
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vld1.8 {d2}, [r1, :64], r2
+ vld1.8 {d3}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vdup.16 d7, d3[3]
+ vmov d6, d3
+ vdup.16 d5, d2[3]
+ vmov d4, d2
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ adr r7, L(ipred_cfl_ac_422_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_w16_tbl):
+ .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1: // Copy and subsample input, without padding
+ vld1.8 {q0, q1}, [r1, :128], r2
+ vld1.8 {q2, q3}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q1, q1
+ vpaddl.u8 q2, q2
+ vpaddl.u8 q3, q3
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ vldr d2, [r1, #16]
+ vld1.8 {q0}, [r1, :128], r2
+ vldr d6, [r12, #16]
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 d2, d2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 d6, d6
+ vpaddl.u8 q2, q2
+ vshl.i16 d2, d2, #2
+ vshl.i16 q0, q0, #2
+ vshl.i16 d6, d6, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.8 {q0}, [r1, :128], r2
+ vld1.8 {q2}, [r12, :128], r2
+ vpaddl.u8 q0, q0
+ vpaddl.u8 q2, q2
+ vshl.i16 q0, q0, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d1}, [r12, :64], r2
+ vpaddl.u8 q0, q0
+ vshl.i16 q0, q0, #2
+ vdup.16 q3, d1[3]
+ vdup.16 q1, d0[3]
+ vdup.16 d5, d1[3]
+ vmov d4, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+endfunc
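+// For 4:2:2 the horizontal pair sum is shifted left by 2 instead, putting the
+// ac samples on the same "luma average * 8" scale as in 4:2:0, which is why
+// the 4:2:0 hpad and dc-subtraction tails can be reused.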
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_444_tbl)
+ sub r8, r8, #26
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_tbl):
+ .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ vld1.32 {d0[]}, [r1, :32], r2
+ vld1.32 {d0[1]}, [r12, :32], r2
+ vld1.32 {d2[]}, [r1, :32], r2
+ vld1.32 {d2[1]}, [r12, :32], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q1, d2, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d2}, [r12, :64], r2
+ vld1.16 {d4}, [r1, :64], r2
+ vshll.u8 q0, d0, #3
+ vld1.16 {d6}, [r12, :64], r2
+ vshll.u8 q1, d2, #3
+ vshll.u8 q2, d4, #3
+ vshll.u8 q3, d6, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ vld1.8 {q1}, [r1, :128], r2
+ vld1.8 {q3}, [r12, :128], r2
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d4}, [r12, :64], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q2, d4, #3
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr r7, L(ipred_cfl_ac_444_w32_tbl)
+ ldr r3, [r7, r3, lsl #1] // (w_pad >> 1) << 2
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+ .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+1: // Copy and expand input, without padding
+ vld1.8 {q2, q3}, [r1, :128], r2
+ vld1.8 {q13, q14}, [r12, :128], r2
+ vshll.u8 q0, d4, #3
+ vshll.u8 q1, d5, #3
+ vshll.u8 q2, d6, #3
+ vshll.u8 q3, d7, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3
+ vshll.u8 q1, d29, #3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+1: // Copy and expand input, padding 8
+ vldr d4, [r1, #16]
+ vld1.8 {q1}, [r1, :128], r2
+ vldr d28, [r12, #16]
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q2, d4, #3
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vshll.u8 q0, d28, #3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vdup.16 q1, d1[3]
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ vld1.8 {q1}, [r1, :128], r2
+ vld1.8 {q13}, [r12, :128], r2
+ vshll.u8 q0, d2, #3
+ vshll.u8 q1, d3, #3
+ vshll.u8 q12, d26, #3
+ vshll.u8 q13, d27, #3
+ vdup.16 q2, d3[3]
+ vdup.16 q3, d3[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q0, d27[3]
+ vdup.16 q1, d27[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ vld1.8 {d0}, [r1, :64], r2
+ vld1.8 {d24}, [r12, :64], r2
+ vshll.u8 q0, d0, #3
+ vshll.u8 q12, d24, #3
+ subs r8, r8, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q2, d1[3]
+ vdup.16 q3, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q8, q8, q0
+ vadd.i16 q9, q9, q1
+ vdup.16 q13, d25[3]
+ vdup.16 q0, d25[3]
+ vdup.16 q1, d25[3]
+ vst1.16 {q2, q3}, [r0, :128]!
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q3
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 1b
+ cmp r4, #0
+
+L(ipred_cfl_ac_444_w32_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #1
+ vst1.16 {q12, q13}, [r0, :128]!
+ vadd.i16 q8, q8, q12
+ vadd.i16 q9, q9, q13
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q10, q10, q0
+ vadd.i16 q11, q11, q1
+ bgt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl r6, r6, #3
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w4_calc_subtract_dc.
+ vpaddl.u16 q0, q8
+ vpaddl.u16 q1, q9
+ vpaddl.u16 q2, q10
+ vpaddl.u16 q3, q11
+ vadd.i32 q0, q0, q1
+ vadd.i32 q2, q2, q3
+ vadd.i32 q0, q0, q2
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+ b L(ipred_cfl_ac_420_w4_subtract_dc)
+endfunc
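+// For 4:4:4 each luma pixel is shifted left by 3 (again the *8 scale). The
+// w32 tail widens to 32-bit sums before combining the accumulators
+// (vpaddl.u16 first), presumably so the combined sums cannot overflow
+// 16 bits at this block size.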
diff --git a/third_party/dav1d/src/arm/32/ipred16.S b/third_party/dav1d/src/arm/32/ipred16.S
new file mode 100644
index 0000000000..993d9500aa
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/ipred16.S
@@ -0,0 +1,3254 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, B Krishnan Iyer
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+function ipred_dc_128_16bpc_neon, export=1
+ push {r4, lr}
+ ldr r4, [sp, #8]
+ ldr r12, [sp, #24]
+ clz r3, r3
+ adr r2, L(ipred_dc_128_tbl)
+ sub r3, r3, #25
+ vdup.16 q0, r12
+ ldr r3, [r2, r3, lsl #2]
+ add r12, r0, r1
+ vrshr.u16 q0, q0, #1
+ add r2, r2, r3
+ lsl r1, r1, #1
+ bx r2
+
+ .align 2
+L(ipred_dc_128_tbl):
+ .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4, pc}
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4, pc}
+160:
+ vmov q1, q0
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vmov q1, q0
+ sub r1, r1, #32
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vmov q1, q0
+ sub r1, r1, #96
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
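+// For 16 bpc the "128" dc is half of the full range, (bitdepth_max + 1) >> 1
+// (512 at 10 bit, 2048 at 12 bit), computed above with vrshr.u16 #1 from the
+// bitdepth_max argument.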
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1
+ push {r4, lr}
+ ldr lr, [sp, #8]
+ clz r3, r3
+ adr r4, L(ipred_v_tbl)
+ sub r3, r3, #25
+ ldr r3, [r4, r3, lsl #2]
+ add r2, r2, #2
+ add r4, r4, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r4
+
+ .align 2
+L(ipred_v_tbl):
+ .word 640f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_v_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d0}, [r2]
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs lr, lr, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4, pc}
+80:
+ vld1.16 {q0}, [r2]
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4, pc}
+160:
+ vld1.16 {q0, q1}, [r2]
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4, pc}
+320:
+ vld1.16 {q0, q1}, [r2]!
+ sub r1, r1, #32
+ vld1.16 {q2, q3}, [r2]
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.16 {d4, d5, d6, d7}, [r12, :128], r1
+ subs lr, lr, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128], r1
+ vst1.16 {d4, d5, d6, d7}, [r12, :128], r1
+ bgt 32b
+ pop {r4, pc}
+640:
+ vld1.16 {q0, q1}, [r2]!
+ sub r1, r1, #96
+ vld1.16 {q2, q3}, [r2]!
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r2]!
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r0, :128]!
+ vst1.16 {d4, d5, d6, d7}, [r12, :128]!
+ subs lr, lr, #2
+ vst1.16 {d16, d17, d18, d19}, [r0, :128]!
+ vst1.16 {d16, d17, d18, d19}, [r12, :128]!
+ vst1.16 {d20, d21, d22, d23}, [r0, :128], r1
+ vst1.16 {d20, d21, d22, d23}, [r12, :128], r1
+ bgt 64b
+ pop {r4, pc}
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_h_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ sub r2, r2, #2
+ mov lr, #-2
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_h_tbl):
+ .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_h_tbl) + CONFIG_THUMB
+40:
+ sub r2, r2, #6
+ mov lr, #-8
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vst1.16 {d3}, [r0, :64], r1
+ vst1.16 {d2}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+8:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128], r1
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128], r1
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ sub r1, r1, #16
+16:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128]!
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ sub r1, r1, #48
+32:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #4
+ vld1.16 {d2[], d3[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vst1.16 {q1}, [r12, :128]!
+ vld1.16 {d6[], d7[]}, [r2], lr
+ vst1.16 {q0}, [r0, :128]!
+ vst1.16 {q1}, [r12, :128]!
+ vst1.16 {q0}, [r0, :128]!
+ vst1.16 {q1}, [r12, :128]!
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #96
+64:
+ vld1.16 {d0[], d1[]}, [r2], lr
+ subs r4, r4, #2
+ vld1.16 {d4[], d5[]}, [r2], lr
+ vmov q1, q0
+ vmov q3, q2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ clz r3, r3
+ adr r5, L(ipred_dc_top_tbl)
+ sub r3, r3, #25
+ ldr r3, [r5, r3, lsl #2]
+ add r2, r2, #2
+ add r5, r5, r3
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_top_tbl):
+ .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d0}, [r2]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 d0, d0[0]
+4:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ vld1.16 {d0, d1}, [r2]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+8:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ vld1.16 {d0, d1, d2, d3}, [r2]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d4, d0, #4
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+16:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ vld1.16 {d0, d1, d2, d3}, [r2]!
+ vld1.16 {d4, d5, d6, d7}, [r2]
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d18, q0, #5
+ vdup.16 q0, d18[0]
+ vdup.16 q1, d18[0]
+ sub r1, r1, #32
+32:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ vld1.16 {d0, d1, d2, d3}, [r2]!
+ vld1.16 {d4, d5, d6, d7}, [r2]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpadd.i32 d0, d0, d0
+ vrshrn.i32 d18, q0, #6
+ vdup.16 q0, d18[0]
+ vdup.16 q1, d18[0]
+ sub r1, r1, #96
+64:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
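+// Left-only DC: the height-indexed table entry sums the `height` pixels of
+// the left edge and computes the rounded average, then branches via r3 to
+// the width-specific loop that splats the value over the block.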
+function ipred_dc_left_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ sub r2, r2, r4, lsl #1
+ clz r3, r3
+ clz lr, r4
+ sub lr, lr, #25
+ adr r5, L(ipred_dc_left_tbl)
+ sub r3, r3, #20
+ ldr r3, [r5, r3, lsl #2]
+ ldr lr, [r5, lr, lsl #2]
+ add r3, r5, r3
+ add r5, r5, lr
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_left_tbl):
+ .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
+
+L(ipred_dc_left_h4):
+ vld1.16 {d0}, [r2, :64]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w4):
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt L(ipred_dc_left_w4)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h8):
+ vld1.16 {d0, d1}, [r2, :128]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w8):
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt L(ipred_dc_left_w8)
+ pop {r4-r5, pc}
+L(ipred_dc_left_h16):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w16):
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h32):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d0, q0, #5
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w32):
+ sub r1, r1, #32
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+L(ipred_dc_left_h64):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ vpadd.i32 d0, d0, d0
+ vrshrn.i32 d0, q0, #6
+ vdup.16 q0, d0[0]
+ bx r3
+L(ipred_dc_left_w64):
+ sub r1, r1, #96
+ vmov q1, q0
+1:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 1b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
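+// Full DC: the height handler sums the left edge, the width handler adds the
+// top edge, then dc = (sum + ((width + height) >> 1)) >> ctz(width + height).
+// For non-square blocks the remaining division by 3 or 5 is done with a
+// fixed-point multiply (0xAAAB ~= 2^17/3, 0x6667 ~= 2^17/5) and a shift by 17.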
+function ipred_dc_16bpc_neon, export=1
+ push {r4-r6, lr}
+ ldr r4, [sp, #16]
+ sub r2, r2, r4, lsl #1
+ add lr, r3, r4 // width + height
+ clz r3, r3
+ clz r12, r4
+ vdup.32 q15, lr // width + height
+ adr r5, L(ipred_dc_tbl)
+ rbit lr, lr // rbit(width + height)
+ sub r3, r3, #20 // 25 leading bits, minus table offset 5
+ sub r12, r12, #25
+ clz lr, lr // ctz(width + height)
+ ldr r3, [r5, r3, lsl #2]
+ ldr r12, [r5, r12, lsl #2]
+ neg lr, lr // -ctz(width + height)
+ add r3, r5, r3
+ add r5, r5, r12
+ vshr.u32 q15, q15, #1 // (width + height) >> 1
+ vdup.32 q14, lr // -ctz(width + height)
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_dc_tbl):
+ .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
+ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
+
+L(ipred_dc_h4):
+ vld1.16 {d0}, [r2, :64]!
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w4):
+ vld1.16 {d2}, [r2]
+ vadd.i32 d0, d0, d30
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #4
+ vadd.i32 d0, d0, d2
+ vshl.u32 d0, d0, d28
+ beq 1f
+ // h = 8/16
+ cmp r4, #16
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d0, d0, d24
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 d0, d0[0]
+2:
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d0}, [r12, :64], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h8):
+ vld1.16 {d0, d1}, [r2, :128]!
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w8):
+ vld1.16 {d2, d3}, [r2]
+ vadd.i32 d0, d0, d30
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #8
+ vadd.i32 d0, d0, d2
+ vshl.u32 d0, d0, d28
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d0, d0, d24
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+2:
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1}, [r0, :128], r1
+ vst1.16 {d0, d1}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h16):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w16):
+ vld1.16 {d2, d3, d4, d5}, [r2]
+ vadd.i32 d0, d0, d30
+ vadd.i16 q1, q1, q2
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d1
+ vpaddl.u16 d2, d2
+ cmp r4, #16
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+
+L(ipred_dc_h32):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q0, q2
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r3
+L(ipred_dc_w32):
+ vld1.16 {d2, d3, d4, d5}, [r2]!
+ vadd.i32 d0, d0, d30
+ vld1.16 {d16, d17, d18, d19}, [r2]
+ vadd.i16 q1, q1, q2
+ vadd.i16 q8, q8, q9
+ vadd.i16 q1, q1, q8
+ vadd.i16 d2, d2, d3
+ vpadd.i16 d2, d2, d2
+ vpaddl.u16 d2, d2
+ cmp r4, #32
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ sub r1, r1, #32
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ subs r4, r4, #4
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+L(ipred_dc_h64):
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.16 {d4, d5, d6, d7}, [r2, :128]!
+ vadd.i16 q0, q0, q1
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]!
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q0, q2
+ vadd.i16 q8, q8, q10
+ vadd.i16 q0, q0, q8
+ vadd.i16 d0, d0, d1
+ vpaddl.u16 d0, d0
+ add r2, r2, #2
+ vpadd.i32 d0, d0, d0
+ bx r3
+L(ipred_dc_w64):
+ vld1.16 {d2, d3, d4, d5}, [r2]!
+ vadd.i32 d0, d0, d30
+ vld1.16 {d16, d17, d18, d19}, [r2]!
+ vadd.i16 q1, q1, q2
+ vld1.16 {d20, d21, d22, d23}, [r2]!
+ vadd.i16 q8, q8, q9
+ vld1.16 {d24, d25, d26, d27}, [r2]!
+ vadd.i16 q10, q10, q11
+ vadd.i16 q12, q12, q13
+ vadd.i16 q1, q1, q8
+ vadd.i16 q10, q10, q12
+ vadd.i16 q1, q1, q10
+ vadd.i16 d2, d2, d3
+ vpaddl.u16 d2, d2
+ vpadd.i32 d2, d2, d2
+ cmp r4, #64
+ vadd.i32 d0, d0, d2
+ vshl.u32 d4, d0, d28
+ beq 1f
+ // h = 16/32
+ cmp r4, #16
+ movw lr, #0x6667
+ movw r5, #0xAAAB
+ it ne
+ movne lr, r5
+ vdup.32 d24, lr
+ vmul.i32 d4, d4, d24
+ vshr.u32 d4, d4, #17
+1:
+ sub r1, r1, #96
+ vdup.16 q0, d4[0]
+ vdup.16 q1, d4[0]
+2:
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ subs r4, r4, #2
+ vst1.16 {d0, d1, d2, d3}, [r0, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1
+ vst1.16 {d0, d1, d2, d3}, [r12, :128], r1
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
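+// Paeth: for each pixel, base = left + top - topleft, and the output is
+// whichever of left, top or topleft is closest to base (ties prefer left,
+// then top). Widths >= 8 process the row in 8-pixel chunks, two rows at a time.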
+function ipred_paeth_16bpc_neon, export=1
+ push {r4-r6, lr}
+ vpush {q4}
+ ldr r4, [sp, #32]
+ clz lr, r3
+ adr r12, L(ipred_paeth_tbl)
+ sub lr, lr, #25
+ ldr lr, [r12, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r2]
+ add r6, r2, #2
+ sub r2, r2, #4
+ add r12, r12, lr
+ mov r5, #-4
+ add lr, r0, r1
+ lsl r1, r1, #1
+ bx r12
+
+ .align 2
+L(ipred_paeth_tbl):
+ .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
+
+40:
+ sub r2, r2, #4
+ mov r5, #-8
+ vld1.16 {d6}, [r6]
+ vsub.i16 d16, d6, d4 // top - topleft
+ vmov d7, d6
+ vmov d17, d16
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r5
+ vadd.i16 q9, q8, q0 // base
+ vadd.i16 q10, q8, q1
+ vabd.s16 q11, q3, q9 // tdiff
+ vabd.s16 q12, q3, q10
+ vabd.s16 q13, q2, q9 // tldiff
+ vabd.s16 q14, q2, q10
+ vabd.s16 q9, q0, q9 // ldiff
+ vabd.s16 q10, q1, q10
+ vmin.u16 q15, q11, q13 // min(tdiff, tldiff)
+ vmin.u16 q4, q12, q14
+ vcge.u16 q11, q13, q11 // tldiff >= tdiff
+ vcge.u16 q12, q14, q12
+ vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff
+ vcge.u16 q10, q4, q10
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d24}, [lr, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d23}, [r0, :64], r1
+ vst1.16 {d22}, [lr, :64], r1
+ bgt 4b
+ vpop {q4}
+ pop {r4-r6, pc}
+80:
+160:
+320:
+640:
+ vld1.16 {q3}, [r6]!
+ mov r12, r3
+ sub r1, r1, r3, lsl #1
+1:
+ vld2.16 {d0[], d2[]}, [r2, :32], r5
+ vmov d1, d0
+ vmov d3, d2
+2:
+ vsub.i16 q8, q3, q2 // top - topleft
+ vadd.i16 q9, q8, q0 // base
+ vadd.i16 q10, q8, q1
+ vabd.s16 q11, q3, q9 // tdiff
+ vabd.s16 q12, q3, q10
+ vabd.s16 q13, q2, q9 // tldiff
+ vabd.s16 q14, q2, q10
+ vabd.s16 q9, q0, q9 // ldiff
+ vabd.s16 q10, q1, q10
+ vmin.u16 q15, q11, q13 // min(tdiff, tldiff)
+ vmin.u16 q4, q12, q14
+ vcge.u16 q11, q13, q11 // tldiff >= tdiff
+ vcge.u16 q12, q14, q12
+ vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff
+ vcge.u16 q10, q4, q10
+ vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft
+ vbsl q11, q3, q2
+ vbit q12, q1, q10 // ldiff <= min ? left : ...
+ vbit q11, q0, q9
+ subs r3, r3, #8
+ vst1.16 {q12}, [r0, :128]!
+ vst1.16 {q11}, [lr, :128]!
+ ble 8f
+ vld1.16 {q3}, [r6]!
+ b 2b
+8:
+ subs r4, r4, #2
+ ble 9f
+ // End of horizontal loop, move pointers to next two rows
+ sub r6, r6, r12, lsl #1
+ add r0, r0, r1
+ add lr, lr, r1
+ vld1.16 {q3}, [r6]!
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4}
+ pop {r4-r6, pc}
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
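+// SMOOTH: each pixel averages a vertical blend (top[x] vs. the bottom
+// reference, the last left pixel) and a horizontal blend (left[y] vs. the
+// right reference, the last top pixel), weighted by the sm_weights table:
+// out = (w_v*top + (256-w_v)*bottom + w_h*left + (256-w_h)*right + 256) >> 9,
+// computed below as (top-bottom)*w_v + (left-right)*w_h + 256*(bottom+right).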
+function ipred_smooth_16bpc_neon, export=1
+ push {r4-r10, lr}
+ ldr r4, [sp, #32]
+ movrel r10, X(sm_weights)
+ add r12, r10, r4
+ add r10, r10, r3
+ clz r9, r3
+ adr r5, L(ipred_smooth_tbl)
+ sub lr, r2, r4, lsl #1
+ sub r9, r9, #25
+ ldr r9, [r5, r9, lsl #2]
+ vld1.16 {d4[], d5[]}, [lr] // bottom
+ add r8, r2, #2
+ add r5, r5, r9
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_tbl):
+ .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d16}, [r8] // top
+ vld1.32 {d18[]}, [r10, :32] // weights_hor
+ sub r2, r2, #8
+ mov r7, #-8
+ vdup.16 q3, d16[3] // right
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+ vadd.i16 d19, d4, d6 // bottom+right
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left
+ vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver
+ vshll.u16 q12, d19, #8 // (bottom+right)*256
+ vshll.u16 q13, d19, #8
+ vshll.u16 q14, d19, #8
+ vshll.u16 q15, d19, #8
+ vzip.32 d20, d21 // weights_ver
+ vzip.32 d22, d23
+ vsub.i16 q1, q1, q3 // left-right
+ vsub.i16 q0, q0, q3
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d2, d18 // (left flipped)
+ vmlal.s16 q14, d1, d18
+ vmlal.s16 q15, d0, d18
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d16, d21
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d16, d23
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d25}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d27}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r10, pc}
+80:
+ vld1.16 {q8}, [r8] // top
+ vld1.8 {d18}, [r10, :64] // weights_hor
+ sub r2, r2, #4
+ mov r7, #-4
+ vdup.16 q3, d17[3] // right
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmovl.u8 q9, d18 // weights_hor
+ vadd.i16 d3, d4, d6 // bottom+right
+8:
+ vld2.16 {d0[], d1[]}, [r2, :32], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vshll.u16 q12, d3, #8 // (bottom+right)*256
+ vshll.u16 q13, d3, #8
+ vshll.u16 q14, d3, #8
+ vshll.u16 q15, d3, #8
+ vsub.i16 q0, q0, q3 // left-right
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+ vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d1, d19 // (left flipped)
+ vmlal.s16 q14, d0, d18
+ vmlal.s16 q15, d0, d19
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d17, d20
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d17, d22
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ subs r4, r4, #2
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q13}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r10, pc}
+160:
+320:
+640:
+ add lr, r2, r3, lsl #1
+ sub r2, r2, #4
+ mov r7, #-4
+ vld1.16 {d6[], d7[]}, [lr] // right
+ sub r1, r1, r3, lsl #1
+ mov r9, r3
+ vadd.i16 d3, d4, d6 // bottom+right
+
+1:
+ vld2.16 {d0[], d1[]}, [r2, :32], r7 // left
+ vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver
+ vsub.i16 q0, q0, q3 // left-right
+ vmovl.u8 q10, d20 // weights_ver
+ vmovl.u8 q11, d22
+2:
+ vld1.8 {d18}, [r10, :64]! // weights_hor
+ vld1.16 {q8}, [r8]! // top
+ vshll.u16 q12, d3, #8 // (bottom+right)*256
+ vshll.u16 q13, d3, #8
+ vmovl.u8 q9, d18 // weights_hor
+ vshll.u16 q14, d3, #8
+ vshll.u16 q15, d3, #8
+ vsub.i16 q8, q8, q2 // top-bottom
+ vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor
+ vmlal.s16 q13, d1, d19 // (left flipped)
+ vmlal.s16 q14, d0, d18
+ vmlal.s16 q15, d0, d19
+ vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver
+ vmlal.s16 q13, d17, d20
+ vmlal.s16 q14, d16, d22
+ vmlal.s16 q15, d17, d22
+ vrshrn.i32 d24, q12, #9
+ vrshrn.i32 d25, q13, #9
+ vrshrn.i32 d26, q14, #9
+ vrshrn.i32 d27, q15, #9
+ subs r3, r3, #8
+ vst1.16 {q12}, [r0, :128]!
+ vst1.16 {q13}, [r6, :128]!
+ bgt 2b
+ subs r4, r4, #2
+ ble 9f
+ sub r8, r8, r9, lsl #1
+ sub r10, r10, r9
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, r9
+ b 1b
+9:
+ pop {r4-r10, pc}
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
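+// SMOOTH_V: out = bottom + (((top - bottom) * w_v + 128) >> 8), with w_v
+// taken from sm_weights[] per row and the bottom reference being the last
+// left pixel; the rounded multiply uses VQRDMULH on the weight << 7.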
+function ipred_smooth_v_16bpc_neon, export=1
+ push {r4-r7, lr}
+ ldr r4, [sp, #20]
+ movrel r7, X(sm_weights)
+ add r7, r7, r4
+ clz lr, r3
+ adr r5, L(ipred_smooth_v_tbl)
+ sub r12, r2, r4, lsl #1
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r12] // bottom
+ add r2, r2, #2
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_v_tbl):
+ .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d6}, [r2] // top
+ vsub.i16 d6, d6, d4 // top-bottom
+ vmov d7, d6
+4:
+ vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver
+ vzip.32 d16, d17 // weights_ver
+ vzip.32 d18, d19
+ vshll.u8 q8, d16, #7 // weights_ver << 7
+ vshll.u8 q9, d18, #7
+ vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q11, q3, q9
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {d20}, [r0, :64], r1
+ vst1.16 {d21}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d22}, [r0, :64], r1
+ vst1.16 {d23}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r7, pc}
+80:
+ vld1.16 {q3}, [r2] // top
+ vsub.i16 q3, q3, q2 // top-bottom
+8:
+ vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver
+ vshll.u8 q8, d16, #7 // weights_ver << 7
+ vshll.u8 q9, d18, #7
+ vshll.u8 q10, d20, #7
+ vshll.u8 q11, d22, #7
+ vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q9, q3, q9
+ vqrdmulh.s16 q10, q3, q10
+ vqrdmulh.s16 q11, q3, q11
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {q8}, [r0, :128], r1
+ vst1.16 {q9}, [r6, :128], r1
+ subs r4, r4, #4
+ vst1.16 {q10}, [r0, :128], r1
+ vst1.16 {q11}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r7, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3, lsl #1
+ mov r12, r3
+
+1:
+ vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver
+ vshll.u8 q4, d8, #7 // weights_ver << 7
+ vshll.u8 q5, d10, #7
+ vshll.u8 q6, d12, #7
+ vshll.u8 q7, d14, #7
+2:
+ vld1.16 {q0, q1}, [r2]! // top
+ vsub.i16 q0, q0, q2 // top-bottom
+ vsub.i16 q1, q1, q2
+ vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8
+ vqrdmulh.s16 q9, q1, q4
+ vqrdmulh.s16 q10, q0, q5
+ vqrdmulh.s16 q11, q1, q5
+ vqrdmulh.s16 q12, q0, q6
+ vqrdmulh.s16 q13, q1, q6
+ vqrdmulh.s16 q14, q0, q7
+ vqrdmulh.s16 q15, q1, q7
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q2
+ vadd.i16 q14, q14, q2
+ vadd.i16 q15, q15, q2
+ subs r3, r3, #16
+ vst1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r5, :128]!
+ vst1.16 {q14, q15}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r2, r2, r12, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r7, pc}
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
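+// SMOOTH_H: out = right + (((left - right) * w_h + 128) >> 8), with w_h
+// taken from sm_weights[] per column and the right reference being the
+// last top pixel.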
+function ipred_smooth_h_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldr r4, [sp, #24]
+ movrel r8, X(sm_weights)
+ add r8, r8, r3
+ clz lr, r3
+ adr r5, L(ipred_smooth_h_tbl)
+ add r12, r2, r3, lsl #1
+ sub lr, lr, #25
+ ldr lr, [r5, lr, lsl #2]
+ vld1.16 {d4[], d5[]}, [r12] // right
+ add r5, r5, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ bx r5
+
+ .align 2
+L(ipred_smooth_h_tbl):
+ .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB
+
+40:
+ vld1.32 {d6[]}, [r8, :32] // weights_hor
+ sub r2, r2, #8
+ mov r7, #-8
+ vshll.u8 q3, d6, #7 // weights_hor << 7
+4:
+ vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left
+ vsub.i16 q0, q0, q2 // left-right
+ vsub.i16 q1, q1, q2
+ subs r4, r4, #4
+ vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q0, q3 // (left flipped)
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vst1.16 {d17}, [r0, :64], r1
+ vst1.16 {d16}, [r6, :64], r1
+ vst1.16 {d19}, [r0, :64], r1
+ vst1.16 {d18}, [r6, :64], r1
+ bgt 4b
+ pop {r4-r8, pc}
+80:
+ vld1.8 {d6}, [r8, :64] // weights_hor
+ sub r2, r2, #8
+ mov r7, #-8
+ vshll.u8 q3, d6, #7 // weights_hor << 7
+8:
+ vld1.16 {d23}, [r2, :64], r7 // left
+ subs r4, r4, #4
+ vsub.i16 d23, d23, d4 // left-right
+ vdup.16 q8, d23[3] // flip left
+ vdup.16 q9, d23[2]
+ vdup.16 q10, d23[1]
+ vdup.16 q11, d23[0]
+ vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q9, q3
+ vqrdmulh.s16 q10, q10, q3
+ vqrdmulh.s16 q11, q11, q3
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vst1.16 {q8}, [r0, :128], r1
+ vst1.16 {q9}, [r6, :128], r1
+ vst1.16 {q10}, [r0, :128], r1
+ vst1.16 {q11}, [r6, :128], r1
+ bgt 8b
+ pop {r4-r8, pc}
+160:
+320:
+640:
+ vpush {q4-q7}
+ sub r2, r2, #8
+ mov r7, #-8
+ // Set up pointers for four rows in parallel; r0, r6, r5, lr
+ add r5, r0, r1
+ add lr, r6, r1
+ lsl r1, r1, #1
+ sub r1, r1, r3, lsl #1
+ mov r12, r3
+
+1:
+ vld1.16 {d15}, [r2, :64], r7 // left
+ vsub.i16 d15, d15, d4 // left-right
+ vdup.16 q4, d15[3] // flip left
+ vdup.16 q5, d15[2]
+ vdup.16 q6, d15[1]
+ vdup.16 q7, d15[0]
+2:
+ vld1.8 {q1}, [r8, :128]! // weights_hor
+ subs r3, r3, #16
+ vshll.u8 q0, d2, #7 // weights_hor << 7
+ vshll.u8 q1, d3, #7
+ vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8
+ vqrdmulh.s16 q9, q1, q4
+ vqrdmulh.s16 q10, q0, q5
+ vqrdmulh.s16 q11, q1, q5
+ vqrdmulh.s16 q12, q0, q6
+ vqrdmulh.s16 q13, q1, q6
+ vqrdmulh.s16 q14, q0, q7
+ vqrdmulh.s16 q15, q1, q7
+ vadd.i16 q8, q8, q2
+ vadd.i16 q9, q9, q2
+ vadd.i16 q10, q10, q2
+ vadd.i16 q11, q11, q2
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q2
+ vadd.i16 q14, q14, q2
+ vadd.i16 q15, q15, q2
+ vst1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r5, :128]!
+ vst1.16 {q14, q15}, [lr, :128]!
+ bgt 2b
+ subs r4, r4, #4
+ ble 9f
+ sub r8, r8, r12
+ add r0, r0, r1
+ add r6, r6, r1
+ add r5, r5, r1
+ add lr, lr, r1
+ mov r3, r12
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
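+// FILTER_INTRA: the block is built in 4x2 tiles; each output pixel is a
+// 7-tap combination of topleft (p0), the four pixels above the tile (p1-p4)
+// and the two pixels to its left (p5-p6), using the selected
+// filter_intra_taps set, rounded with >> 4 and clamped to [0, bitdepth_max].
+// Finished tiles provide the top/left inputs of the tiles that follow.
+// The 10 bpc variant accumulates in 16 bits, the 12 bpc one widens to 32 bits.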
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon, export=1
+ movw r12, #511
+ ldrd r4, r5, [sp, #88]
+ and r5, r5, r12 // 511
+ movrel r6, X(filter_intra_taps)
+ lsl r5, r5, #6
+ add r6, r6, r5
+ vld1.8 {d20, d21, d22, d23}, [r6, :128]!
+ clz lr, r3
+ adr r5, L(ipred_filter\bpc\()_tbl)
+ vld1.8 {d27, d28, d29}, [r6, :64]
+ sub lr, lr, #26
+ ldr lr, [r5, lr, lsl #2]
+ vmovl.s8 q8, d20
+ vmovl.s8 q9, d21
+ add r5, r5, lr
+ vmovl.s8 q10, d22
+ vmovl.s8 q11, d23
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmovl.s8 q12, d27
+ vmovl.s8 q13, d28
+ vmovl.s8 q14, d29
+ mov r7, #-4
+ vdup.16 q15, r8
+ add r8, r2, #2
+ sub r2, r2, #4
+.if \bpc == 10
+ vmov.i16 q7, #0
+.endif
+ bx r5
+
+ .align 2
+L(ipred_filter\bpc\()_tbl):
+ .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+ .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB
+
+40:
+ vld1.16 {d0}, [r8] // top (0-3)
+4:
+ vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vrshr.s16 q2, q2, #4
+ vmax.s16 q2, q2, q7
+.else
+ vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6)
+ vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d4, q2, #4
+ vqrshrun.s32 d5, q3, #4
+.endif
+ vmin.s16 q2, q2, q15
+ subs r4, r4, #2
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d5}, [r6, :64], r1
+ vmov d0, d5 // move top from [4-7] to [0-3]
+ bgt 4b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+80:
+ vld1.16 {q0}, [r8] // top (0-7)
+8:
+ vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1)
+ vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2)
+ vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3)
+ vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4)
+ vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0)
+ vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5)
+ vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6)
+ vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q2, q2, #4
+ vmax.s16 q2, q2, q7
+ vmin.s16 q2, q2, q15
+ vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4)
+ vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6)
+ vrshr.s16 q3, q3, #4
+ vmax.s16 q3, q3, q7
+.else
+ vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6)
+ vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d4, q2, #4
+ vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d5, q3, #4
+ vmin.s16 q2, q2, q15
+ vmlal.s16 q4, d24, d1[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0)
+ vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6)
+ vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0)
+ vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d6, q4, #4
+ vqrshrun.s32 d7, q5, #4
+.endif
+ vmin.s16 q3, q3, q15
+ vswp d5, d6
+ subs r4, r4, #2
+ vst1.16 {q2}, [r0, :128], r1
+ vmov q0, q3
+ vst1.16 {q3}, [r6, :128], r1
+ bgt 8b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+160:
+320:
+ sub r1, r1, r3, lsl #1
+ mov lr, r3
+
+1:
+ vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2)
+2:
+ vld1.16 {q1, q2}, [r8]! // top(0-15)
+.if \bpc == 10
+ vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0)
+ vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5)
+ vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6)
+ vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1)
+ vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2)
+ vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3)
+ vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4)
+
+ vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1)
+ vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2)
+ vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q3, q3, #4
+ vmax.s16 q3, q3, q7
+ vmin.s16 q3, q3, q15
+ vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4)
+ vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0)
+ vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5)
+ vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1)
+ vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2)
+ vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q4, q4, #4
+ vmax.s16 q4, q4, q7
+ vmin.s16 q4, q4, q15
+ vmov q0, q4
+ vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4)
+ vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0)
+ vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6)
+
+ vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1)
+ vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2)
+ vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3)
+ vrshr.s16 q5, q5, #4
+ vmax.s16 q5, q5, q7
+ vmin.s16 q5, q5, q15
+ vmov q0, q5
+ vmov.u16 r12, d5[3]
+ vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4)
+ vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0)
+ vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5)
+ vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6)
+ vmov.16 d0[2], r12
+ subs r3, r3, #16
+ vrshr.s16 q6, q6, #4
+.else
+ vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0)
+ vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6)
+ vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4)
+ vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0)
+ vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5)
+ vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6)
+ vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4)
+ vqrshrun.s32 d6, q3, #4
+ vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2)
+ vqrshrun.s32 d7, q4, #4
+ vmin.s16 q3, q3, q15
+ vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0)
+ vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6)
+ vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0)
+ vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d8, q5, #4
+ vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q7, d22, d4[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d9, q6, #4
+ vmin.s16 q0, q4, q15
+ vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q7, d16, d3[3] // p0(topleft) * filter(0)
+ vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6)
+ vmin.s16 q4, q4, q15
+ vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0)
+ vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6)
+ vqrshrun.s32 d10, q7, #4
+ vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3)
+ vqrshrun.s32 d11, q6, #4
+ vmin.s16 q0, q5, q15
+ vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0)
+ vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6)
+ vmin.s16 q5, q5, q15
+ vmov.u16 r12, d5[3]
+ vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1)
+ vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2)
+ vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3)
+ vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4)
+ vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0)
+ vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5)
+ vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6)
+ vmov.16 d0[2], r12
+ vqrshrun.s32 d12, q1, #4
+ subs r3, r3, #16
+ vqrshrun.s32 d13, q7, #4
+.endif
+ vswp q4, q5
+.if \bpc == 10
+ vmax.s16 q6, q6, q7
+.endif
+ vswp d7, d10
+ vmin.s16 q6, q6, q15
+
+ vswp d9, d12
+
+ vst1.16 {q3, q4}, [r0, :128]!
+ vst1.16 {q5, q6}, [r6, :128]!
+ ble 8f
+ vmov.u16 r12, d13[3]
+ vmov.16 d0[0], r12
+ vmov.u16 r12, d9[3]
+ vmov.16 d0[1], r12
+ b 2b
+8:
+ subs r4, r4, #2
+
+ ble 9f
+ sub r8, r6, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+.endm
+
+filter_fn 10
+filter_fn 12
+
+function ipred_filter_16bpc_neon, export=1
+ push {r4-r8, lr}
+ vpush {q4-q7}
+ movw r12, 0x3ff
+ ldr r8, [sp, #104]
+ cmp r8, r12
+ ble ipred_filter_10bpc_neon
+ b ipred_filter_12bpc_neon
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint16_t *const pal, const uint8_t *idx,
+// const int w, const int h);
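+// Palette prediction: q14 holds the 8-entry 16-bit palette and q15 the
+// constant 0x0100. Each 8-bit index i is expanded to the byte offsets
+// (2*i, 2*i+1), and VTBL then gathers the low/high bytes of palette[i].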
+function pal_pred_16bpc_neon, export=1
+ push {r4-r5, lr}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ vld1.16 {q14}, [r2, :128]
+ clz lr, r4
+ adr r12, L(pal_pred_tbl)
+ sub lr, lr, #25
+ ldr lr, [r12, lr, lsl #2]
+ vmov.i16 q15, #0x100
+ add r12, r12, lr
+ add r2, r0, r1
+ bx r12
+
+ .align 2
+L(pal_pred_tbl):
+ .word 640f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 320f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 160f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 80f - L(pal_pred_tbl) + CONFIG_THUMB
+ .word 40f - L(pal_pred_tbl) + CONFIG_THUMB
+
+40:
+ lsl r1, r1, #1
+4:
+ vld1.8 {q1}, [r3, :128]!
+ subs r5, r5, #4
+ // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ vadd.i8 q0, q1, q1
+ vadd.i8 q1, q1, q1
+ vzip.8 q0, q1
+ vadd.i16 q0, q0, q15
+ vadd.i16 q1, q1, q15
+ vtbl.8 d0, {q14}, d0
+ vtbl.8 d1, {q14}, d1
+ vst1.16 {d0}, [r0, :64], r1
+ vtbl.8 d2, {q14}, d2
+ vst1.16 {d1}, [r2, :64], r1
+ vtbl.8 d3, {q14}, d3
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d3}, [r2, :64], r1
+ bgt 4b
+ pop {r4-r5, pc}
+80:
+ lsl r1, r1, #1
+8:
+ vld1.8 {q1, q2}, [r3, :128]!
+ subs r5, r5, #4
+ // Prefer doing the adds twice, instead of chaining a vmov after
+ // the add.
+ vadd.i8 q0, q1, q1
+ vadd.i8 q1, q1, q1
+ vadd.i8 q3, q2, q2
+ vadd.i8 q2, q2, q2
+ vzip.8 q0, q1
+ vzip.8 q2, q3
+ vadd.i16 q0, q0, q15
+ vadd.i16 q1, q1, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q2, q2, q15
+ vtbl.8 d1, {q14}, d1
+ vadd.i16 q3, q3, q15
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vst1.16 {q0}, [r0, :128], r1
+ vtbl.8 d6, {q14}, d6
+ vst1.16 {q1}, [r2, :128], r1
+ vtbl.8 d7, {q14}, d7
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r2, :128], r1
+ bgt 8b
+ pop {r4-r5, pc}
+160:
+ lsl r1, r1, #1
+16:
+ vld1.8 {q2, q3}, [r3, :128]!
+ subs r5, r5, #4
+ vld1.8 {q10, q11}, [r3, :128]!
+ vadd.i8 q0, q2, q2
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r2, :128], r1
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r0, :128], r1
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r2, :128], r1
+ bgt 16b
+ pop {r4-r5, pc}
+320:
+ lsl r1, r1, #1
+ sub r1, r1, #32
+32:
+ vld1.8 {q2, q3}, [r3, :128]!
+ subs r5, r5, #2
+ vld1.8 {q10, q11}, [r3, :128]!
+ vadd.i8 q0, q2, q2
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128]!
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r0, :128], r1
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r2, :128]!
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r2, :128], r1
+ bgt 32b
+ pop {r4-r5, pc}
+640:
+ sub r1, r1, #96
+64:
+ vld1.8 {q2, q3}, [r3, :128]!
+ subs r5, r5, #1
+ vld1.8 {q10, q11}, [r3, :128]!
+ vadd.i8 q0, q2, q2
+ vadd.i8 q1, q2, q2
+ vadd.i8 q2, q3, q3
+ vadd.i8 q3, q3, q3
+ vadd.i8 q8, q10, q10
+ vadd.i8 q9, q10, q10
+ vadd.i8 q10, q11, q11
+ vzip.8 q0, q1
+ vadd.i8 q11, q11, q11
+ vzip.8 q2, q3
+ vzip.8 q8, q9
+ vadd.i16 q0, q0, q15
+ vzip.8 q10, q11
+ vadd.i16 q1, q1, q15
+ vadd.i16 q2, q2, q15
+ vadd.i16 q3, q3, q15
+ vadd.i16 q8, q8, q15
+ vadd.i16 q9, q9, q15
+ vadd.i16 q10, q10, q15
+ vtbl.8 d0, {q14}, d0
+ vadd.i16 q11, q11, q15
+ vtbl.8 d1, {q14}, d1
+ vtbl.8 d2, {q14}, d2
+ vtbl.8 d3, {q14}, d3
+ vtbl.8 d4, {q14}, d4
+ vtbl.8 d5, {q14}, d5
+ vtbl.8 d6, {q14}, d6
+ vtbl.8 d7, {q14}, d7
+ vtbl.8 d16, {q14}, d16
+ vtbl.8 d17, {q14}, d17
+ vtbl.8 d18, {q14}, d18
+ vst1.16 {q0, q1}, [r0, :128]!
+ vtbl.8 d19, {q14}, d19
+ vtbl.8 d20, {q14}, d20
+ vst1.16 {q2, q3}, [r0, :128]!
+ vtbl.8 d21, {q14}, d21
+ vtbl.8 d22, {q14}, d22
+ vst1.16 {q8, q9}, [r0, :128]!
+ vtbl.8 d23, {q14}, d23
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r5, pc}
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
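+// CfL with a flat DC of half the pixel range, (bitdepth_max + 1) >> 1.
+// The shared splat code below computes, per pixel,
+// dst = clip(dc + apply_sign((abs(alpha*ac) + 32) >> 6, alpha*ac), 0, bitdepth_max).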
+function ipred_cfl_128_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ clz lr, r3
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_128_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vrshr.u16 q0, q15, #1
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r12
+
+ .align 2
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_splat_w4):
+ vld1.16 {q8, q9}, [r5, :128]!
+ vmull.s16 q2, d16, d2 // diff = ac * alpha
+ vmull.s16 q3, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q8, #31
+ vshr.s32 q13, q9, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vadd.i32 q3, q3, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q8, #6
+ vrshrn.i32 d7, q9, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d5}, [r6, :64], r1
+ subs r4, r4, #4
+ vst1.16 {d6}, [r0, :64], r1
+ vst1.16 {d7}, [r6, :64], r1
+ bgt L(ipred_cfl_splat_w4)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w8):
+ vld1.16 {q8, q9}, [r5, :128]!
+ subs r4, r4, #2
+ vmull.s16 q2, d16, d2 // diff = ac * alpha
+ vmull.s16 q3, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q8, #31
+ vshr.s32 q13, q9, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vadd.i32 q3, q3, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q8, #6
+ vrshrn.i32 d7, q9, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vadd.i16 q3, q3, q0
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r6, :128], r1
+ bgt L(ipred_cfl_splat_w8)
+ pop {r4-r8, pc}
+L(ipred_cfl_splat_w16):
+ vpush {q4-q7}
+ add r12, r5, r3, lsl #1
+ sub r1, r1, r3, lsl #1
+ mov lr, r3
+1:
+ vld1.16 {q6, q7}, [r5, :128]!
+ vmull.s16 q2, d12, d2 // diff = ac * alpha
+ vld1.16 {q8, q9}, [r12, :128]!
+ vmull.s16 q3, d13, d3
+ vmull.s16 q4, d14, d2
+ vmull.s16 q5, d15, d3
+ vmull.s16 q6, d16, d2
+ vmull.s16 q7, d17, d3
+ vmull.s16 q8, d18, d2
+ vmull.s16 q9, d19, d3
+ vshr.s32 q10, q2, #31 // sign = diff >> 31
+ vshr.s32 q11, q3, #31
+ vshr.s32 q12, q4, #31
+ vshr.s32 q13, q5, #31
+ vadd.i32 q2, q2, q10 // diff + sign
+ vshr.s32 q10, q6, #31
+ vadd.i32 q3, q3, q11
+ vshr.s32 q11, q7, #31
+ vadd.i32 q4, q4, q12
+ vshr.s32 q12, q8, #31
+ vadd.i32 q5, q5, q13
+ vshr.s32 q13, q9, #31
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q4, #6
+ vrshrn.i32 d7, q5, #6
+ vadd.i16 q2, q2, q0 // dc + apply_sign()
+ vrshrn.i32 d8, q6, #6
+ vrshrn.i32 d9, q7, #6
+ vadd.i16 q3, q3, q0
+ vrshrn.i32 d10, q8, #6
+ vrshrn.i32 d11, q9, #6
+ vadd.i16 q4, q4, q0
+ vadd.i16 q5, q5, q0
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmax.s16 q4, q4, q14
+ vmax.s16 q5, q5, q14
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vmin.s16 q4, q4, q15
+ vmin.s16 q5, q5, q15
+ subs r3, r3, #16
+ vst1.16 {q2, q3}, [r0, :128]!
+ vst1.16 {q4, q5}, [r6, :128]!
+ bgt 1b
+ subs r4, r4, #2
+ add r5, r5, lr, lsl #1
+ add r12, r12, lr, lsl #1
+ add r0, r0, r1
+ add r6, r6, r1
+ mov r3, lr
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r8, pc}
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
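+// CfL with the DC taken from the top edge only (rounded average of the
+// `width` pixels above the block), then branches to the shared splat code.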
+function ipred_cfl_top_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ clz lr, r3
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_top_tbl)
+ sub lr, lr, #26
+ ldr lr, [r12, lr, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r2, r2, #2
+ add r12, r12, lr
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r12
+
+ .align 2
+L(ipred_cfl_top_tbl):
+ .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+ .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB
+
+4:
+ vld1.16 {d0}, [r2]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ vld1.16 {q0}, [r2]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ vld1.16 {q2, q3}, [r2]
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r2]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q8, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d0, q0, #5
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
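+// CfL with the DC taken from the left edge only; the height-indexed table
+// entry does the summing and then jumps to the width-selected splat code.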
+function ipred_cfl_left_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ sub r2, r2, r4, lsl #1
+ clz lr, r3
+ clz r8, r4
+ vdup.16 q15, r7 // bitdepth_max
+ adr r12, L(ipred_cfl_splat_tbl)
+ adr r7, L(ipred_cfl_left_tbl)
+ sub lr, lr, #26
+ sub r8, r8, #26
+ ldr lr, [r12, lr, lsl #2]
+ ldr r8, [r7, r8, lsl #2]
+ vdup.16 q1, r6 // alpha
+ add r12, r12, lr
+ add r7, r7, r8
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r7
+
+ .align 2
+L(ipred_cfl_left_tbl):
+ .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_left_h4):
+ vld1.16 {d0}, [r2, :64]
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #2
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h8):
+ vld1.16 {q0}, [r2, :128]
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #3
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h16):
+ vld1.16 {q2, q3}, [r2, :128]
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpadd.i16 d0, d0, d0
+ vrshr.u16 d0, d0, #4
+ vdup.16 q0, d0[0]
+ bx r12
+
+L(ipred_cfl_left_h32):
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vadd.i16 q8, q8, q9
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q8, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ vpaddl.u16 d0, d0
+ vrshrn.i32 d0, q0, #5
+ vdup.16 q0, d0[0]
+ bx r12
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
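+// CfL with the DC averaged over both edges, using the same
+// ctz(width + height) shift and 0xAAAB/0x6667 fixed-point division as
+// ipred_dc_16bpc_neon, before jumping to the shared splat code.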
+function ipred_cfl_16bpc_neon, export=1
+ push {r4-r8, lr}
+ ldrd r4, r5, [sp, #24]
+ ldrd r6, r7, [sp, #32]
+ sub r2, r2, r4, lsl #1
+ add r8, r3, r4 // width + height
+ vdup.16 q1, r6 // alpha
+ clz lr, r3
+ clz r6, r4
+ vdup.32 d16, r8 // width + height
+ vdup.16 q15, r7 // bitdepth_max
+ adr r7, L(ipred_cfl_tbl)
+ rbit r8, r8 // rbit(width + height)
+ sub lr, lr, #22 // 26 leading bits, minus table offset 4
+ sub r6, r6, #26
+ clz r8, r8 // ctz(width + height)
+ ldr lr, [r7, lr, lsl #2]
+ ldr r6, [r7, r6, lsl #2]
+ neg r8, r8 // -ctz(width + height)
+ add r12, r7, lr
+ add r7, r7, r6
+ vshr.u32 d16, d16, #1 // (width + height) >> 1
+ vdup.32 d17, r8 // -ctz(width + height)
+ add r6, r0, r1
+ lsl r1, r1, #1
+ vmov.i16 q14, #0
+ bx r7
+
+ .align 2
+L(ipred_cfl_tbl):
+ .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_h4):
+ vld1.16 {d0}, [r2, :64]!
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w4):
+ vld1.16 {d1}, [r2]
+ vadd.i32 d0, d0, d16
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #4
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 8/16
+ cmp r4, #16
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ vld1.16 {q0}, [r2, :128]!
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w8):
+ vld1.16 {q2}, [r2]
+ vadd.i32 d0, d0, d16
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #8
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 4/16/32
+ cmp r4, #32
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ vld1.16 {q2, q3}, [r2, :128]!
+ vadd.i16 q0, q2, q3
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w16):
+ vld1.16 {q2, q3}, [r2]
+ vadd.i32 d0, d0, d16
+ vadd.i16 q2, q2, q3
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #16
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 4/8/32/64
+ tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ vld1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]!
+ vadd.i16 q2, q2, q3
+ vadd.i16 q10, q10, q11
+ vadd.i16 q0, q2, q10
+ vadd.i16 d0, d0, d1
+ vpadd.i16 d0, d0, d0
+ add r2, r2, #2
+ vpaddl.u16 d0, d0
+ bx r12
+L(ipred_cfl_w32):
+ vld1.16 {q2, q3}, [r2]!
+ vadd.i32 d0, d0, d16
+ vld1.16 {q10, q11}, [r2]!
+ vadd.i16 q2, q2, q3
+ vadd.i16 q10, q10, q11
+ vadd.i16 q2, q2, q10
+ vadd.i16 d1, d4, d5
+ vpadd.i16 d1, d1, d1
+ vpaddl.u16 d1, d1
+ cmp r4, #32
+ vadd.i32 d0, d0, d1
+ vshl.u32 d0, d0, d17
+ beq 1f
+ // h = 8/16/64
+ cmp r4, #8
+ movw lr, #0x6667
+ movw r8, #0xAAAB
+ it ne
+ movne lr, r8
+ vdup.32 d18, lr
+ vmul.i32 d0, d0, d18
+ vshr.u32 d0, d0, #17
+1:
+ vdup.16 q0, d0[0]
+ b L(ipred_cfl_splat_w16)
+endfunc
+
+// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
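+// 4:2:0 AC: each output sample is the sum of a 2x2 luma block, shifted left
+// by 1. w_pad/h_pad (in units of 4 chroma pixels) are handled by replicating
+// the last computed column/row, and finally the rounded average of the whole
+// buffer is subtracted from every sample so the AC values are zero-mean
+// (up to rounding).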
+function ipred_cfl_ac_420_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_420_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i32 q8, #0
+ vmov.i32 q9, #0
+ vmov.i32 q10, #0
+ vmov.i32 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad*4
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_tbl):
+ .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w4):
+1: // Copy and subsample input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #2
+ vst1.16 {q0}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d1
+ vmov d2, d1
+ vmov d3, d1
+L(ipred_cfl_ac_420_w4_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ vadd.i32 q8, q8, q9
+ vadd.i32 q10, q10, q11
+ vadd.i32 q0, q8, q10
+ vadd.i32 d0, d0, d1
+ vpadd.i32 d0, d0, d0 // sum
+ sub r0, r0, r6, lsl #3
+ vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz
+ vdup.16 q8, d16[0]
+6: // Subtract dc from ac
+ vld1.16 {q0, q1}, [r0, :128]
+ subs r6, r6, #4
+ vsub.i16 q0, q0, q8
+ vsub.i16 q1, q1, q8
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 6b
+ pop {r4-r8, pc}
+
+L(ipred_cfl_ac_420_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vadd.i16 q12, q12, q2
+ vadd.i16 q13, q13, q3
+ vpadd.i16 d2, d24, d25
+ vpadd.i16 d3, d26, d27
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vadd.i16 q0, q0, q1
+ vadd.i16 q2, q2, q3
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vshl.i16 q0, q0, #1
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q1
+
+L(ipred_cfl_ac_420_w8_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
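+ // (the shared loop handles 16 samples while stepping r6 by 4, so r6
+ // must equal the total sample count divided by 4)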
+ lsl r6, r6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ adr r7, L(ipred_cfl_ac_420_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_420_w16_tbl):
+ .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ sub r2, r2, #32
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q12, q13}, [r12, :128]!
+ vld1.16 {q2, q3}, [r1, :128], r2
+ vadd.i16 q0, q0, q12
+ vadd.i16 q1, q1, q13
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vadd.i16 q2, q2, q12
+ vadd.i16 q3, q3, q13
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vshl.i16 q0, q0, #1
+ vshl.i16 q1, q1, #1
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ sub r2, r2, #32
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q12, q13}, [r12, :128]!
+ vld1.16 {q2}, [r1, :128], r2
+ vadd.i16 q0, q0, q12
+ vadd.i16 q1, q1, q13
+ vld1.16 {q12}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vadd.i16 q2, q2, q12
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vshl.i16 q0, q0, #1
+ vshl.i16 d2, d2, #1
+ subs r8, r8, #1
+ vdup.16 d3, d2[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vadd.i16 q0, q0, q12
+ vadd.i16 q1, q1, q13
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vshl.i16 q0, q0, #1
+ subs r8, r8, #1
+ vdup.16 q1, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q12}, [r12, :128], r2
+ vadd.i16 q0, q0, q12
+ vpadd.i16 d0, d0, d1
+ vshl.i16 d0, d0, #1
+ subs r8, r8, #1
+ vdup.16 q1, d0[3]
+ vdup.16 d1, d0[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl r6, r6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
+
+// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
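+// 4:2:2 subsamples only horizontally: each output is a horizontal pair sum
+// shifted left by 2, again yielding the luma average scaled by 8.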
+function ipred_cfl_ac_422_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_422_tbl)
+ sub r8, r8, #27
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_tbl):
+ .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w4):
+1: // Copy and subsample input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d4, d5
+ vpadd.i16 d3, d6, d7
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d24, d24, d25
+ vpadd.i16 d25, d26, d27
+ vpadd.i16 d26, d4, d5
+ vpadd.i16 d27, d6, d7
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q12, #2
+ vshl.i16 q3, q13, #2
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vld1.16 {q12}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d24, d24, d25
+ vpadd.i16 d25, d4, d5
+ vshl.i16 q0, q0, #2
+ vshl.i16 q12, q12, #2
+ vdup.16 d7, d25[3]
+ vmov d6, d25
+ vdup.16 d5, d24[3]
+ vmov d4, d24
+ vdup.16 d3, d1[3]
+ vmov d2, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ adr r7, L(ipred_cfl_ac_422_w16_tbl)
+ ldr r3, [r7, r3, lsl #2]
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_422_w16_tbl):
+ .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ sub r2, r2, #32
+1: // Copy and subsample input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r12, :128]!
+ vld1.16 {q12, q13}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d24, d25
+ vpadd.i16 d3, d26, d27
+ vld1.16 {q12, q13}, [r12, :128], r2
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d6, d24, d25
+ vpadd.i16 d7, d26, d27
+ vshl.i16 q0, q0, #2
+ vshl.i16 q1, q1, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 q3, q3, #2
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ sub r2, r2, #32
+1: // Copy and subsample input, padding 4
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r12, :128]!
+ vld1.16 {q12}, [r1, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d2, d24, d25
+ vld1.16 {q12}, [r12, :128], r2
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vpadd.i16 d6, d24, d25
+ vshl.i16 q0, q0, #2
+ vshl.i16 d2, d2, #2
+ vshl.i16 q2, q2, #2
+ vshl.i16 d6, d6, #2
+ vdup.16 d3, d2[3]
+ vdup.16 d7, d6[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d2, d3
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d5, d6, d7
+ vshl.i16 q0, q0, #2
+ vshl.i16 q2, q2, #2
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d1, d4, d5
+ vshl.i16 q0, q0, #2
+ vdup.16 q3, d1[3]
+ vdup.16 q1, d0[3]
+ vdup.16 d5, d1[3]
+ vmov d4, d1
+ vdup.16 d1, d0[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+endfunc
+
+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
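+// 4:4:4 does no subsampling: each luma sample is simply shifted left by 3.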
+function ipred_cfl_ac_444_16bpc_neon, export=1
+ push {r4-r8,lr}
+ ldrd r4, r5, [sp, #24]
+ ldr r6, [sp, #32]
+ clz r8, r5
+ lsl r4, r4, #2
+ adr r7, L(ipred_cfl_ac_444_tbl)
+ sub r8, r8, #26
+ ldr r8, [r7, r8, lsl #2]
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ vmov.i16 q10, #0
+ vmov.i16 q11, #0
+ add r7, r7, r8
+ sub r8, r6, r4 // height - h_pad
+ rbit lr, r5 // rbit(width)
+ rbit r12, r6 // rbit(height)
+ clz lr, lr // ctz(width)
+ clz r12, r12 // ctz(height)
+ add lr, lr, r12 // log2sz
+ add r12, r1, r2
+ vdup.32 d31, lr
+ lsl r2, r2, #1
+ vneg.s32 d31, d31 // -log2sz
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_tbl):
+ .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w4):
+1: // Copy and expand input
+ vld1.16 {d0}, [r1, :64], r2
+ vld1.16 {d1}, [r12, :64], r2
+ vld1.16 {d2}, [r1, :64], r2
+ vld1.16 {d3}, [r12, :64], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ bgt 1b
+ cmp r4, #0
+ vmov d0, d3
+ vmov d1, d3
+ vmov d2, d3
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+1: // Copy and expand input
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q1}, [r12, :128], r2
+ vld1.16 {q2}, [r1, :128], r2
+ vld1.16 {q3}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q3
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ cmp r3, #0
+ bne L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vld1.16 {q2, q3}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ vld1.16 {q0}, [r1, :128], r2
+ vld1.16 {q2}, [r12, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q2, q2, #3
+ vdup.16 q1, d1[3]
+ vdup.16 q3, d5[3]
+ subs r8, r8, #2
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ vmov q0, q2
+ vmov q1, q3
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ adr r7, L(ipred_cfl_ac_444_w32_tbl)
+ ldr r3, [r7, r3, lsl #1] // (w_pad >> 1) << 2
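+ // The w32 loops read one row per iteration through r1 only, so undo
+ // the stride doubling done in the prologue.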
+ asr r2, r2, #1
+ add r7, r7, r3
+ bx r7
+
+ .align 2
+L(ipred_cfl_ac_444_w32_tbl):
+ .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+ .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ sub r2, r2, #32
+1: // Copy and expand input, without padding
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2, q3}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ vshl.i16 q3, q3, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ sub r2, r2, #32
+1: // Copy and expand input, padding 8
+ vld1.16 {q0, q1}, [r1, :128]!
+ vld1.16 {q2}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ vshl.i16 q2, q2, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q3, d5[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+1: // Copy and expand input, padding 16
+ vld1.16 {q0, q1}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ vshl.i16 q1, q1, #3
+ subs r8, r8, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q2, d3[3]
+ vdup.16 q3, d3[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+1: // Copy and expand input, padding 24
+ vld1.16 {q0}, [r1, :128], r2
+ vshl.i16 q0, q0, #3
+ subs r8, r8, #1
+ vdup.16 q1, d1[3]
+ vst1.16 {q0, q1}, [r0, :128]!
+ vdup.16 q2, d1[3]
+ vdup.16 q3, d1[3]
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 1b
+ cmp r4, #0
+
+L(ipred_cfl_ac_444_w32_hpad):
+ beq 3f // This assumes that all callers already did "cmp r4, #0"
+2: // Vertical padding (h_pad > 0)
+ subs r4, r4, #1
+ vst1.16 {q0, q1}, [r0, :128]!
+ vaddw.u16 q8, q8, d0
+ vaddw.u16 q9, q9, d1
+ vaddw.u16 q10, q10, d2
+ vaddw.u16 q11, q11, d3
+ vst1.16 {q2, q3}, [r0, :128]!
+ vaddw.u16 q8, q8, d4
+ vaddw.u16 q9, q9, d5
+ vaddw.u16 q10, q10, d6
+ vaddw.u16 q11, q11, d7
+ bgt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 summing/subtracting
+ lsl r6, r6, #3
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx.S b/third_party/dav1d/src/arm/32/itx.S
new file mode 100644
index 0000000000..ceea025e45
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/itx.S
@@ -0,0 +1,3343 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3 external parameters
+// r4 function pointer to first transform
+// r5 function pointer to second transform
+// r6 output parameter for helper function
+// r7 input parameter for helper function
+// r8 input stride for helper function
+// r9 scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+// scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3 multiplication coefficients
+// d4-d7 scratch registers
+// d8-d15 unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations, that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
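+// Transform coefficients in 12-bit fixed point (sin/cos values scaled by
+// 4096); the idct64 table additionally pre-multiplies some entries by 8.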
+const idct_coeffs, align=4
+ // idct4
+ .short 2896, 2896*8, 1567, 3784
+ // idct8
+ .short 799, 4017, 3406, 2276
+ // idct16
+ .short 401, 4076, 3166, 2598
+ .short 1931, 3612, 3920, 1189
+ // idct32
+ .short 201, 4091, 3035, 2751
+ .short 1751, 3703, 3857, 1380
+ .short 995, 3973, 3513, 2106
+ .short 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .short 101*8, 4095*8, 2967*8, -2824*8
+ .short 1660*8, 3745*8, 3822*8, -1474*8
+ .short 4076, 401, 4017, 799
+
+ .short 4036*8, -700*8, 2359*8, 3349*8
+ .short 3461*8, -2191*8, 897*8, 3996*8
+ .short -3166, -2598, -799, -4017
+
+ .short 501*8, 4065*8, 3229*8, -2520*8
+ .short 2019*8, 3564*8, 3948*8, -1092*8
+ .short 3612, 1931, 2276, 3406
+
+ .short 4085*8, -301*8, 2675*8, 3102*8
+ .short 3659*8, -1842*8, 1285*8, 3889*8
+ .short -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ // .h[4-5] can be interpreted as .s[2]
+ .short 1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+ .short 4076, 401, 3612, 1931
+ .short 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4
+ .short 4091, 201, 3973, 995
+ .short 3703, 1751, 3290, 2440
+ .short 2751, 3035, 2106, 3513
+ .short 1380, 3857, 601, 4052
+endconst
+
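+// Butterfly multiply helpers: \d0 = \s0*\c0 +/- \s1*\c1 as 32-bit products;
+// the _8h variants handle an 8-element row split across two d registers,
+// and vqrshrn_8h narrows such a pair back to 16 bits with rounding.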
+.macro vmull_vmlal d0, s0, s1, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlal.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlal.s16 \d1, \s3, \c1
+.endm
+
+.macro vmull_vmlsl d0, s0, s1, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s1, \c1
+.endm
+
+.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1
+ vmull.s16 \d0, \s0, \c0
+ vmlsl.s16 \d0, \s2, \c1
+ vmull.s16 \d1, \s1, \c0
+ vmlsl.s16 \d1, \s3, \c1
+.endm
+
+.macro vqrshrn_8h d0, d1, s0, s1, shift
+ vqrshrn.s32 \d0, \s0, \shift
+ vqrshrn.s32 \d1, \s1, \shift
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7
+ vqrdmulh.s16 \r0, \r0, \c
+ vqrdmulh.s16 \r1, \r1, \c
+.ifnb \r2
+ vqrdmulh.s16 \r2, \r2, \c
+ vqrdmulh.s16 \r3, \r3, \c
+.endif
+.ifnb \r4
+ vqrdmulh.s16 \r4, \r4, \c
+ vqrdmulh.s16 \r5, \r5, \c
+ vqrdmulh.s16 \r6, \r6, \c
+ vqrdmulh.s16 \r7, \r7, \c
+.endif
+.endm
+
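+// The load_add_store macros software-pipeline the output stage: each call
+// moves one row through load, downshift, widening add of the destination
+// pixels, saturating narrow and store.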
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
+.ifnb \load
+ vld1.8 {\load}, [\src, :64], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ vst1.8 {\store}, [\dst, :64], r1
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ load_add_store d2, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store d3, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits
+ load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits
+ load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits
+ load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits
+ load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits
+ load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits
+ load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits
+ load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits
+ load_add_store , , , , , , d3, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src
+ mov \src, \dst
+ load_add_store d2, q8, , , , , , \dst, \src
+ load_add_store d3, q9, , , , , , \dst, \src
+ load_add_store d4, q10, d2, q8, , , , \dst, \src
+ load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src
+ load_add_store , , d4, q10, q9, d3, d2, \dst, \src
+ load_add_store , , d5, q11, q10, d4, d3, \dst, \src
+ load_add_store , , , , q11, d5, d4, \dst, \src
+ load_add_store , , , , , , d5, \dst, \src
+.endm
+.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
+.ifnb \load
+ vld1.32 {\load[0]}, [\src, :32], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #4
+.endif
+.ifnb \load
+ vld1.32 {\load[1]}, [\src, :32], r1
+.endif
+.ifnb \addsrc
+ vaddw.u8 \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[0]}, [\dst, :32], r1
+.endif
+.ifnb \narrowsrc
+ vqmovun.s16 \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ vst1.32 {\store[1]}, [\dst, :32], r1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src
+ load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src
+ load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src
+ load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src
+ load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src
+ load_add_store4 , , , , q15, d7, d6, \dst, \src
+ load_add_store4 , , , , , , d7, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ load_add_store4 d0, , , , , , , \dst, \src
+ load_add_store4 d1, q8, , , , , , \dst, \src
+ load_add_store4 d2, q9, d0, q8, , , , \dst, \src
+ load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src
+ load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src
+ load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src
+ load_add_store4 , , , , q11, d3, d2, \dst, \src
+ load_add_store4 , , , , , , d3, \dst, \src
+.endm
+
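+// Fast path for eob == 0, i.e. only the DC coefficient is nonzero: compute
+// the constant residual once and add it to the whole w x h block.
+// Rectangular blocks (w == 2*h or h == 2*w) need an extra 1/sqrt(2) scale,
+// hence the second vqrdmulh.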
+.macro idct_dc w, h, shift
+ cmp r3, #0
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16]
+ vdup.16 d0, r12
+ vqrdmulh.s16 d16, d16, d0[0]
+ vst1.16 {d30[0]}, [r2, :16]
+.if (\w == 2*\h) || (2*\w == \h)
+ vqrdmulh.s16 d16, d16, d0[0]
+.endif
+.if \shift > 0
+ vrshr.s16 d16, d16, #\shift
+.endif
+ vqrdmulh.s16 d20, d16, d0[0]
+ mov r3, #\h
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+1:
+ vld1.32 {d0[0]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ subs r3, r3, #4
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q10, q8, d0
+ vqmovun.s16 d0, q10
+ vaddw.u8 q11, q8, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q11
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w8_neon
+1:
+ vld1.8 {d0}, [r0, :64], r1
+ vld1.8 {d1}, [r0, :64], r1
+ vld1.8 {d2}, [r0, :64], r1
+ vaddw.u8 q10, q8, d0
+ vld1.8 {d3}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ subs r3, r3, #4
+ vaddw.u8 q11, q8, d1
+ vqmovun.s16 d0, q10
+ vaddw.u8 q12, q8, d2
+ vqmovun.s16 d1, q11
+ vaddw.u8 q13, q8, d3
+ vst1.8 {d0}, [r0, :64], r1
+ vqmovun.s16 d2, q12
+ vst1.8 {d1}, [r0, :64], r1
+ vqmovun.s16 d3, q13
+ vst1.8 {d2}, [r0, :64], r1
+ vst1.8 {d3}, [r0, :64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w16_neon
+1:
+ vld1.8 {q0}, [r0, :128], r1
+ vld1.8 {q1}, [r0, :128], r1
+ vld1.8 {q2}, [r0, :128], r1
+ subs r3, r3, #4
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vld1.8 {q3}, [r0, :128], r1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q1}, [r0, :128], r1
+ vst1.8 {q2}, [r0, :128], r1
+ vst1.8 {q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w32_neon
+1:
+ vld1.8 {q0, q1}, [r0, :128], r1
+ subs r3, r3, #2
+ vld1.8 {q2, q3}, [r0, :128], r1
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, r1, lsl #1
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128], r1
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w64_neon
+ sub r1, r1, #32
+1:
+ vld1.8 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.8 {q2, q3}, [r0, :128]
+ vaddw.u8 q10, q8, d0
+ vaddw.u8 q11, q8, d1
+ vaddw.u8 q12, q8, d2
+ vaddw.u8 q13, q8, d3
+ sub r0, r0, #32
+ vaddw.u8 q14, q8, d4
+ vaddw.u8 q15, q8, d5
+ vqmovun.s16 d0, q10
+ vqmovun.s16 d1, q11
+ vaddw.u8 q10, q8, d6
+ vaddw.u8 q11, q8, d7
+ vqmovun.s16 d2, q12
+ vqmovun.s16 d3, q13
+ vqmovun.s16 d4, q14
+ vqmovun.s16 d5, q15
+ vst1.8 {q0, q1}, [r0, :128]!
+ vqmovun.s16 d6, q10
+ vqmovun.s16 d7, q11
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
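+// Inverse Walsh-Hadamard butterfly (the 4x4 lossless transform).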
+.macro iwht4
+ vadd.i16 d16, d16, d17
+ vsub.i16 d21, d18, d19
+ vsub.i16 d20, d16, d21
+ vshr.s16 d20, d20, #1
+ vsub.i16 d18, d20, d17
+ vsub.i16 d17, d20, d19
+ vadd.i16 d19, d21, d18
+ vsub.i16 d16, d16, d17
+.endm
+
+.macro idct_4h_x4 r0, r1, r2, r3
+ vmull_vmlal q3, \r1, \r3, d0[3], d0[2]
+ vmull_vmlsl q2, \r1, \r3, d0[2], d0[3]
+ vmull_vmlal q1, \r0, \r2, d0[0], d0[0]
+ vqrshrn.s32 d6, q3, #12
+ vqrshrn.s32 d7, q2, #12
+ vmull_vmlsl q2, \r0, \r2, d0[0], d0[0]
+ vqrshrn.s32 d2, q1, #12
+ vqrshrn.s32 d3, q2, #12
+ vqadd.s16 \r0, d2, d6
+ vqsub.s16 \r3, d2, d6
+ vqadd.s16 \r1, d3, d7
+ vqsub.s16 \r2, d3, d7
+.endm
+
+.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2]
+ vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3]
+ vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vqrshrn_8h d12, d13, q6, q7, #12
+ vqrshrn_8h d14, d15, q4, q5, #12
+ vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0]
+ vqrshrn_8h d4, d5, q2, q3, #12
+ vqrshrn_8h d6, d7, q4, q5, #12
+ vqadd.s16 \q0, q2, q6
+ vqsub.s16 \q3, q2, q6
+ vqadd.s16 \q1, q3, q7
+ vqsub.s16 \q2, q3, q7
+.endm
+
+function inv_dct_4h_x4_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+ idct_4h_x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_dct_8h_x4_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+ idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128]
+
+ vsubl.s16 q1, d16, d18
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmull.s16 q10, d17, d0[3]
+ vaddw.s16 q1, q1, d19
+ vmull.s16 q3, d16, d0[2]
+ vmlsl.s16 q3, d18, d0[0]
+ vmlsl.s16 q3, d19, d0[1]
+
+ vadd.s32 q11, q2, q3
+ vmul.s32 q1, q1, d1[0]
+ vadd.s32 q2, q2, q10
+ vadd.s32 q3, q3, q10
+ vsub.s32 q11, q11, q10
+
+ vqrshrn.s32 \o0, q2, #12
+ vqrshrn.s32 \o2, q1, #12
+ vqrshrn.s32 \o1, q3, #12
+ vqrshrn.s32 \o3, q11, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1
+ iadst_4x4 d16, d17, d18, d19
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1
+ iadst_4x4 d19, d18, d17, d16
+ bx lr
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7
+ movrel_local r12, iadst4_coeffs
+ vld1.16 {d0, d1}, [r12, :128]
+
+ vsubl.s16 q2, d16, d20
+ vsubl.s16 q3, d17, d21
+ vmull.s16 q4, d16, d0[0]
+ vmlal.s16 q4, d20, d0[1]
+ vmlal.s16 q4, d22, d0[2]
+ vmull.s16 q5, d17, d0[0]
+ vmlal.s16 q5, d21, d0[1]
+ vmlal.s16 q5, d23, d0[2]
+ vaddw.s16 q2, q2, d22
+ vaddw.s16 q3, q3, d23
+ vmull.s16 q6, d16, d0[2]
+ vmlsl.s16 q6, d20, d0[0]
+ vmlsl.s16 q6, d22, d0[1]
+ vmull.s16 q7, d17, d0[2]
+ vmlsl.s16 q7, d21, d0[0]
+ vmlsl.s16 q7, d23, d0[1]
+
+ vmul.s32 q10, q2, d1[0]
+ vmul.s32 q11, q3, d1[0]
+
+ vmull.s16 q2, d18, d0[3]
+ vmull.s16 q3, d19, d0[3]
+
+ vadd.s32 q8, q4, q2 // out0
+ vadd.s32 q9, q5, q3
+
+ vadd.s32 q4, q4, q6 // out3
+ vadd.s32 q5, q5, q7
+
+ vadd.s32 q6, q6, q2 // out1
+ vadd.s32 q7, q7, q3
+
+ vsub.s32 q4, q4, q2 // out3
+ vsub.s32 q5, q5, q3
+
+ vqrshrn.s32 d20, q10, #12
+ vqrshrn.s32 d21, q11, #12
+
+ vqrshrn.s32 \o0, q8, #12
+ vqrshrn.s32 \o1, q9, #12
+
+.ifc \o4, d18
+ vmov q9, q10
+.endif
+
+ vqrshrn.s32 \o2, q6, #12
+ vqrshrn.s32 \o3, q7, #12
+
+ vqrshrn.s32 \o6, q4, #12
+ vqrshrn.s32 \o7, q5, #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1
+ iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1
+ iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr
+endfunc
+
+function inv_identity_4h_x4_neon, export=1
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q2, q8, d0[0]
+ vqrdmulh.s16 q3, q9, d0[0]
+ vqadd.s16 q8, q8, q2
+ vqadd.s16 q9, q9, q3
+ bx lr
+endfunc
+
+function inv_identity_8h_x4_neon, export=1
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ vqrdmulh.s16 q1, q8, d0[0]
+ vqrdmulh.s16 q2, q9, d0[0]
+ vqrdmulh.s16 q3, q10, d0[0]
+ vqadd.s16 q8, q8, q1
+ vqrdmulh.s16 q1, q11, d0[0]
+ vqadd.s16 q9, q9, q2
+ vqadd.s16 q10, q10, q3
+ vqadd.s16 q11, q11, q1
+ bx lr
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0, \r1, \r2, \r3
+ vqrdmulh.s16 q1, \i, \c
+ vrhadd.s16 \i, \i, q1
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
+ push {r4-r5,lr}
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]!
+
+ vshr.s16 q8, q8, #2
+ vshr.s16 q9, q9, #2
+
+ iwht4
+
+ vst1.16 {q15}, [r2, :128]!
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ iwht4
+
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+
+ b L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+ vmov.i16 q15, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q15}, [r2, :128]!
+
+ blx r4
+
+ vst1.16 {q15}, [r2, :128]!
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ blx r5
+
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d1[1]}, [r0, :32], r1
+ vrshr.s16 q8, q8, #4
+ vrshr.s16 q9, q9, #4
+
+L(itx_4x4_end):
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q8, q8, d0
+ vqmovun.s16 d0, q8
+ vaddw.u8 q9, q9, d1
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vqmovun.s16 d1, q9
+ vst1.32 {d0[1]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+
+ pop {r4-r5,pc}
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+ push {r4-r5,lr}
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cmp r3, #0
+ bne 1f
+ vmov.i16 d30, #0
+ movw r12, #2896*8
+ vld1.16 {d16[]}, [r2, :16]
+ vdup.16 d4, r12
+ vst1.16 {d30[0]}, [r2, :16]
+ vqrdmulh.s16 d16, d16, d4[0]
+ vld1.32 {d0[0]}, [r0, :32], r1
+ vqrdmulh.s16 d20, d16, d4[0]
+ vld1.32 {d0[1]}, [r0, :32], r1
+ vrshr.s16 d16, d20, #4
+ vrshr.s16 d17, d20, #4
+ vld1.32 {d1[0]}, [r0, :32], r1
+ vmov q9, q8
+ vld1.32 {d1[1]}, [r0, :32], r1
+ b L(itx_4x4_end)
+1:
+.endif
+ movrel_local r4, inv_\txfm1\()_4h_x4_neon
+ movrel_local r5, inv_\txfm2\()_4h_x4_neon
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13
+
+ vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a
+ vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a
+ vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a
+ vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a
+ vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a
+ vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a
+ vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a
+ vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a
+
+ vqadd.s16 q2, \q1, \q3 // t4
+ vqsub.s16 \q1, \q1, \q3 // t5a
+ vqadd.s16 q3, \q7, \q5 // t7
+ vqsub.s16 \q3, \q7, \q5 // t6a
+
+ vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5
+ vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6
+ vqrshrn_8h d8, d9, q4, q5, #12 // t5
+ vqrshrn_8h d10, d11, q6, q7, #12 // t6
+
+ vqsub.s16 \q7, \q0, q3 // out7
+ vqadd.s16 \q0, \q0, q3 // out0
+ vqadd.s16 \q1, \q2, q5 // out1
+ vqsub.s16 q6, \q2, q5 // out6
+ vqadd.s16 \q2, \q4, q4 // out2
+ vqsub.s16 \q5, \q4, q4 // out5
+ vqadd.s16 \q3, \q6, q2 // out3
+ vqsub.s16 \q4, \q6, q2 // out4
+ vmov \q6, q6 // out6
+.endm
+
+.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4h_x4 \r0, \r2, \r4, \r6
+
+ vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a
+ vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a
+ vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a
+ vqrshrn.s32 \r1, q1, #12 // t4a
+ vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a
+ vqrshrn.s32 \r7, q2, #12 // t7a
+ vqrshrn.s32 \r3, q3, #12 // t5a
+ vqrshrn.s32 \r5, q1, #12 // t6a
+
+ vqadd.s16 d2, \r1, \r3 // t4
+ vqsub.s16 \r1, \r1, \r3 // t5a
+ vqadd.s16 d3, \r7, \r5 // t7
+ vqsub.s16 \r3, \r7, \r5 // t6a
+
+ vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5
+ vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6
+ vqrshrn.s32 d4, q2, #12 // t5
+ vqrshrn.s32 d5, q3, #12 // t6
+
+ vqsub.s16 \r7, \r0, d3 // out7
+ vqadd.s16 \r0, \r0, d3 // out0
+ vqadd.s16 \r1, \r2, d5 // out1
+ vqsub.s16 d6, \r2, d5 // out6
+ vqadd.s16 \r2, \r4, d4 // out2
+ vqsub.s16 \r5, \r4, d4 // out5
+ vqadd.s16 \r3, \r6, d2 // out3
+ vqsub.s16 \r4, \r6, d2 // out4
+ vmov \r6, d6 // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128]
+ idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_dct_4h_x8_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0}, [r12, :128]
+ idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64]
+
+ vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1]
+ vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0]
+ vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3]
+ vqrshrn_8h d16, d17, q2, q3, #12 // t0a
+ vqrshrn_8h d30, d31, q4, q5, #12 // t1a
+ vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2]
+ vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1]
+ vqrshrn_8h d20, d21, q6, q7, #12 // t2a
+ vqrshrn_8h d26, d27, q2, q3, #12 // t3a
+ vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0]
+ vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3]
+ vqrshrn_8h d24, d25, q4, q5, #12 // t4a
+ vqrshrn_8h d22, d23, q6, q7, #12 // t5a
+ vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2]
+ vqrshrn_8h d28, d29, q2, q3, #12 // t6a
+ vqrshrn_8h d18, d19, q4, q5, #12 // t7a
+
+ vqadd.s16 q2, q8, q12 // t0
+ vqsub.s16 q3, q8, q12 // t4
+ vqadd.s16 q4, q15, q11 // t1
+ vqsub.s16 q5, q15, q11 // t5
+ vqadd.s16 q6, q10, q14 // t2
+ vqsub.s16 q7, q10, q14 // t6
+ vqadd.s16 q10, q13, q9 // t3
+ vqsub.s16 q11, q13, q9 // t7
+
+ vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2]
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3]
+ vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2]
+
+ vqrshrn_8h d6, d7, q8, q9, #12 // t4a
+ vqrshrn_8h d10, d11, q12, q13, #12 // t5a
+
+ vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3]
+
+ vqrshrn_8h d14, d15, q14, q15, #12 // t6a
+ vqrshrn_8h d22, d23, q8, q9, #12 // t7a
+
+ vqadd.s16 \q0, q2, q6 // out0
+ vqsub.s16 q2, q2, q6 // t2
+ vqadd.s16 \q7, q4, q10 // out7
+ vqsub.s16 q4, q4, q10 // t3
+ vqneg.s16 \q7, \q7 // out7
+
+ vqadd.s16 \q1, q3, q7 // out1
+ vqsub.s16 q3, q3, q7 // t6
+ vqadd.s16 \q6, q5, q11 // out6
+ vqsub.s16 q5, q5, q11 // t7
+ vqneg.s16 \q1, \q1 // out1
+
+ vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12)
+ vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11)
+ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
+ vqrshrn_8h d4, d5, q10, q11, #12 // out3
+ vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
+ vqrshrn_8h d6, d7, q12, q13, #12 // out5
+ vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+ vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11)
+
+ vqneg.s16 \q3, q2 // out3
+ vqneg.s16 \q5, q3 // out5
+.endm
+
+.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ movrel_local r12, iadst8_coeffs
+ vld1.16 {d0, d1, d2}, [r12, :64]
+
+ vmull_vmlal q2, d23, d16, d0[0], d0[1]
+ vmull_vmlsl q3, d23, d16, d0[1], d0[0]
+ vmull_vmlal q4, d21, d18, d0[2], d0[3]
+ vqrshrn.s32 d16, q2, #12 // t0a
+ vqrshrn.s32 d23, q3, #12 // t1a
+ vmull_vmlsl q5, d21, d18, d0[3], d0[2]
+ vmull_vmlal q6, d19, d20, d1[0], d1[1]
+ vqrshrn.s32 d18, q4, #12 // t2a
+ vqrshrn.s32 d21, q5, #12 // t3a
+ vmull_vmlsl q7, d19, d20, d1[1], d1[0]
+ vmull_vmlal q2, d17, d22, d1[2], d1[3]
+ vqrshrn.s32 d20, q6, #12 // t4a
+ vqrshrn.s32 d19, q7, #12 // t5a
+ vmull_vmlsl q3, d17, d22, d1[3], d1[2]
+ vqrshrn.s32 d22, q2, #12 // t6a
+ vqrshrn.s32 d17, q3, #12 // t7a
+
+ vqadd.s16 d4, d16, d20 // t0
+ vqsub.s16 d5, d16, d20 // t4
+ vqadd.s16 d6, d23, d19 // t1
+ vqsub.s16 d7, d23, d19 // t5
+ vqadd.s16 d8, d18, d22 // t2
+ vqsub.s16 d9, d18, d22 // t6
+ vqadd.s16 d18, d21, d17 // t3
+ vqsub.s16 d19, d21, d17 // t7
+
+ vmull_vmlal q8, d5, d7, d2[3], d2[2]
+ vmull_vmlsl q10, d5, d7, d2[2], d2[3]
+ vmull_vmlsl q11, d19, d9, d2[3], d2[2]
+
+ vqrshrn.s32 d5, q8, #12 // t4a
+ vqrshrn.s32 d7, q10, #12 // t5a
+
+ vmull_vmlal q8, d19, d9, d2[2], d2[3]
+
+ vqrshrn.s32 d9, q11, #12 // t6a
+ vqrshrn.s32 d19, q8, #12 // t7a
+
+ vqadd.s16 \r0, d4, d8 // out0
+ vqsub.s16 d4, d4, d8 // t2
+ vqadd.s16 \r7, d6, d18 // out7
+ vqsub.s16 d6, d6, d18 // t3
+ vqneg.s16 \r7, \r7 // out7
+
+ vqadd.s16 \r1, d5, d9 // out1
+ vqsub.s16 d5, d5, d9 // t6
+ vqadd.s16 \r6, d7, d19 // out6
+ vqsub.s16 d7, d7, d19 // t7
+ vqneg.s16 \r1, \r1 // out1
+
+ vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20)
+ vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19)
+ vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18)
+ vqrshrn.s32 d4, q9, #12 // out3
+ vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21)
+ vqrshrn.s32 d5, q10, #12 // out5
+ vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21)
+ vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19)
+
+ vqneg.s16 \r3, d4 // out3
+ vqneg.s16 \r5, d5 // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1
+ iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1
+ iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17
+ bx lr
+endfunc
+
+function inv_adst_4h_x8_neon, export=1
+ iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1
+ iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_8h_x8_neon, export=1
+ vqshl.s16 q8, q8, #1
+ vqshl.s16 q9, q9, #1
+ vqshl.s16 q10, q10, #1
+ vqshl.s16 q11, q11, #1
+ vqshl.s16 q12, q12, #1
+ vqshl.s16 q13, q13, #1
+ vqshl.s16 q14, q14, #1
+ vqshl.s16 q15, q15, #1
+ bx lr
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+ vqshl.s16 q8, q8, #1
+ vqshl.s16 q9, q9, #1
+ vqshl.s16 q10, q10, #1
+ vqshl.s16 q11, q11, #1
+ bx lr
+endfunc
+
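+// Shared 8x8 body: the coefficient buffer is cleared while being loaded,
+// the first transform runs via r4 (skipped in the identity_ variant, whose
+// shl #1 cancels the #1 downshift), then the block is transposed and the
+// second transform via r5 feeds load_add_store_8x8.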
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q12, q13}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q14, q15}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]
+
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+ blx r4
+
+ vrshr.s16 q8, q8, #1
+ vrshr.s16 q9, q9, #1
+ vrshr.s16 q10, q10, #1
+ vrshr.s16 q11, q11, #1
+ vrshr.s16 q12, q12, #1
+ vrshr.s16 q13, q13, #1
+ vrshr.s16 q14, q14, #1
+ vrshr.s16 q15, q15, #1
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ blx r5
+
+ load_add_store_8x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ push {r4-r5,r7,lr}
+ vpush {q4-q7}
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon
+.else
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon
+ b inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11
+
+ blx r4
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ vswp d17, d20
+ vswp d19, d21
+ vswp d18, d20
+ vswp d21, d22
+
+ blx r5
+
+ load_add_store_8x4 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ scale_input d0[0], q8, q9, q10, q11
+
+ blx r4
+
+ transpose_4x8h q8, q9, q10, q11
+ vswp d17, d20
+ vswp d19, d21
+ vswp d17, d18
+ vswp d19, d22
+
+ blx r5
+
+ load_add_store_4x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ push {r4-r5,r7,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+function inv_dct_4h_x16_neon, export=1
+ movrel_local r12, idct_coeffs
+ vld1.16 {q0, q1}, [r12, :128]
+
+ vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a
+ vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a
+ vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a
+ vqrshrn.s32 d17, q2, #12 // t8a
+ vqrshrn.s32 d31, q3, #12 // t15a
+ vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a
+ vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a
+ vqrshrn.s32 d23, q4, #12 // t9a
+ vqrshrn.s32 d25, q2, #12 // t14a
+ vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a
+ vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a
+ vqrshrn.s32 d21, q3, #12 // t10a
+ vqrshrn.s32 d27, q4, #12 // t13a
+ vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a
+ vqrshrn.s32 d19, q2, #12 // t11a
+ vqrshrn.s32 d29, q3, #12 // t12a
+
+ idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30
+
+ vqsub.s16 d4, d17, d23 // t9
+ vqadd.s16 d17, d17, d23 // t8
+ vqsub.s16 d5, d31, d25 // t14
+ vqadd.s16 d31, d31, d25 // t15
+ vqsub.s16 d23, d19, d21 // t10
+ vqadd.s16 d19, d19, d21 // t11
+ vqadd.s16 d25, d29, d27 // t12
+ vqsub.s16 d29, d29, d27 // t13
+
+ vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a
+ vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a
+ vqrshrn.s32 d21, q3, #12 // t9a
+ vqrshrn.s32 d27, q4, #12 // t14a
+
+ vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a
+ vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a
+ vqrshrn.s32 d29, q3, #12 // t13a
+ vneg.s32 q4, q4
+ vqrshrn.s32 d23, q4, #12 // t10a
+
+ vqsub.s16 d4, d17, d19 // t11a
+ vqadd.s16 d17, d17, d19 // t8a
+ vqsub.s16 d5, d31, d25 // t12a
+ vqadd.s16 d31, d31, d25 // t15a
+ vqadd.s16 d19, d21, d23 // t9
+ vqsub.s16 d21, d21, d23 // t10
+ vqsub.s16 d25, d27, d29 // t13
+ vqadd.s16 d27, d27, d29 // t14
+
+ vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11
+ vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12
+ vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a
+
+ vqrshrn.s32 d6, q3, #12 // t11
+ vqrshrn.s32 d7, q4, #12 // t12
+ vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a
+ vqrshrn.s32 d4, q2, #12 // t10a
+ vqrshrn.s32 d5, q4, #12 // t13a
+
+ vqadd.s16 d8, d16, d31 // out0
+ vqsub.s16 d31, d16, d31 // out15
+ vmov d16, d8
+ vqadd.s16 d23, d30, d17 // out7
+ vqsub.s16 d9, d30, d17 // out8
+ vqadd.s16 d17, d18, d27 // out1
+ vqsub.s16 d30, d18, d27 // out14
+ vqadd.s16 d18, d20, d5 // out2
+ vqsub.s16 d29, d20, d5 // out13
+ vqadd.s16 d5, d28, d19 // out6
+ vqsub.s16 d25, d28, d19 // out9
+ vqadd.s16 d19, d22, d7 // out3
+ vqsub.s16 d28, d22, d7 // out12
+ vqadd.s16 d20, d24, d6 // out4
+ vqsub.s16 d27, d24, d6 // out11
+ vqadd.s16 d21, d26, d4 // out5
+ vqsub.s16 d26, d26, d4 // out10
+ vmov d24, d9
+ vmov d22, d5
+
+ bx lr
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel_local r12, iadst16_coeffs
+ vld1.16 {q0, q1}, [r12, :128]
+ movrel_local r12, idct_coeffs
+
+ vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0
+ vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1
+ vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2
+ vqrshrn.s32 d16, q2, #12 // t0
+ vqrshrn.s32 d31, q3, #12 // t1
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3
+ vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4
+ vqrshrn.s32 d18, q4, #12 // t2
+ vqrshrn.s32 d29, q2, #12 // t3
+ vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5
+ vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6
+ vqrshrn.s32 d20, q3, #12 // t4
+ vqrshrn.s32 d27, q4, #12 // t5
+ vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7
+ vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8
+ vqrshrn.s32 d22, q2, #12 // t6
+ vqrshrn.s32 d25, q3, #12 // t7
+ vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9
+ vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10
+ vqrshrn.s32 d23, q4, #12 // t8
+ vqrshrn.s32 d24, q2, #12 // t9
+ vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11
+ vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12
+ vqrshrn.s32 d21, q3, #12 // t10
+ vqrshrn.s32 d26, q4, #12 // t11
+ vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13
+ vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14
+ vqrshrn.s32 d19, q2, #12 // t12
+ vqrshrn.s32 d28, q3, #12 // t13
+ vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15
+ vqrshrn.s32 d17, q4, #12 // t14
+ vqrshrn.s32 d30, q2, #12 // t15
+
+ vld1.16 {q0}, [r12, :128]
+
+ vqsub.s16 d2, d16, d23 // t8a
+ vqadd.s16 d16, d16, d23 // t0a
+ vqsub.s16 d3, d31, d24 // t9a
+ vqadd.s16 d31, d31, d24 // t1a
+ vqadd.s16 d23, d18, d21 // t2a
+ vqsub.s16 d18, d18, d21 // t10a
+ vqadd.s16 d24, d29, d26 // t3a
+ vqsub.s16 d29, d29, d26 // t11a
+ vqadd.s16 d21, d20, d19 // t4a
+ vqsub.s16 d20, d20, d19 // t12a
+ vqadd.s16 d26, d27, d28 // t5a
+ vqsub.s16 d27, d27, d28 // t13a
+ vqadd.s16 d19, d22, d17 // t6a
+ vqsub.s16 d22, d22, d17 // t14a
+ vqadd.s16 d28, d25, d30 // t7a
+ vqsub.s16 d25, d25, d30 // t15a
+
+ vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8
+ vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9
+ vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10
+ vqrshrn.s32 d17, q2, #12 // t8
+ vqrshrn.s32 d30, q3, #12 // t9
+ vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11
+ vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12
+ vqrshrn.s32 d18, q4, #12 // t10
+ vqrshrn.s32 d29, q2, #12 // t11
+ vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13
+ vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14
+ vqrshrn.s32 d27, q3, #12 // t12
+ vqrshrn.s32 d20, q4, #12 // t13
+ vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15
+ vqrshrn.s32 d25, q2, #12 // t14
+ vqrshrn.s32 d22, q3, #12 // t15
+
+ vqsub.s16 d2, d16, d21 // t4
+ vqadd.s16 d16, d16, d21 // t0
+ vqsub.s16 d3, d31, d26 // t5
+ vqadd.s16 d31, d31, d26 // t1
+ vqadd.s16 d21, d23, d19 // t2
+ vqsub.s16 d23, d23, d19 // t6
+ vqadd.s16 d26, d24, d28 // t3
+ vqsub.s16 d24, d24, d28 // t7
+ vqadd.s16 d19, d17, d27 // t8a
+ vqsub.s16 d17, d17, d27 // t12a
+ vqadd.s16 d28, d30, d20 // t9a
+ vqsub.s16 d30, d30, d20 // t13a
+ vqadd.s16 d27, d18, d25 // t10a
+ vqsub.s16 d18, d18, d25 // t14a
+ vqadd.s16 d20, d29, d22 // t11a
+ vqsub.s16 d29, d29, d22 // t15a
+
+ vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a
+ vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a
+ vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a
+ vqrshrn.s32 d22, q2, #12 // t4a
+ vqrshrn.s32 d25, q3, #12 // t5a
+ vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a
+ vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12
+ vqrshrn.s32 d24, q4, #12 // t6a
+ vqrshrn.s32 d23, q2, #12 // t7a
+ vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13
+ vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14
+ vqrshrn.s32 d17, q3, #12 // t12
+ vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15
+ vqrshrn.s32 d29, q4, #12 // t13
+ vqrshrn.s32 d30, q2, #12 // t14
+ vqrshrn.s32 d18, q3, #12 // t15
+
+ vqsub.s16 d2, d16, d21 // t2a
+.ifc \o0, d16
+ vqadd.s16 \o0, d16, d21 // out0
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+.else
+ vqadd.s16 d4, d16, d21 // out0
+ vqsub.s16 d21, d31, d26 // t3a
+ vqadd.s16 \o15,d31, d26 // out15
+ vmov \o0, d4
+.endif
+ vqneg.s16 \o15, \o15 // out15
+
+ vqsub.s16 d3, d29, d18 // t15a
+ vqadd.s16 \o13,d29, d18 // out13
+ vqadd.s16 \o2, d17, d30 // out2
+ vqsub.s16 d26, d17, d30 // t14a
+ vqneg.s16 \o13,\o13 // out13
+
+ vqadd.s16 \o1, d19, d27 // out1
+ vqsub.s16 d27, d19, d27 // t10
+ vqadd.s16 \o14,d28, d20 // out14
+ vqsub.s16 d20, d28, d20 // t11
+ vqneg.s16 \o1, \o1 // out1
+
+ vqadd.s16 \o3, d22, d24 // out3
+ vqsub.s16 d22, d22, d24 // t6
+ vqadd.s16 \o12,d25, d23 // out12
+ vqsub.s16 d23, d25, d23 // t7
+ vqneg.s16 \o3, \o3 // out3
+
+ vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
+ vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
+ vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
+
+ vqrshrn.s32 d24, q12, #12 // out8
+ vqrshrn.s32 d4, q2, #12 // out7
+ vqrshrn.s32 d5, q3, #12 // out5
+ vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
+ vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+ vqrshrn.s32 d26, q4, #12 // out10
+
+ vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+ vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+ vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+ vqrshrn.s32 \o4, q1, #12 // out4
+ vqrshrn.s32 d7, q3, #12 // out9
+ vqrshrn.s32 d6, q4, #12 // out11
+ vqrshrn.s32 \o6, q11, #12 // out6
+
+.ifc \o8, d23
+ vmov \o8, d24
+ vmov \o10,d26
+.endif
+
+ vqneg.s16 \o7, d4 // out7
+ vqneg.s16 \o5, d5 // out5
+ vqneg.s16 \o11,d6 // out11
+ vqneg.s16 \o9, d7 // out9
+.endm
+
+function inv_adst_4h_x16_neon, export=1
+ iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1
+ iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_4h_x16_neon, export=1
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q1, \i, d0[0]
+ vqadd.s16 \i, \i, \i
+ vqadd.s16 \i, \i, q1
+.endr
+ bx lr
+endfunc
+
+.macro identity_4x16_shift2 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vshr.s16 q2, q2, #1
+ vrhadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro identity_4x16_shift1 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vrshr.s16 q2, q2, #1
+ vqadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro identity_8x8_shift1 c
+ identity_4x16_shift1 \c
+.endm
+
+.macro identity_8x8 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s16 q2, \i, \c
+ vqadd.s16 \i, \i, \i
+ vqadd.s16 \i, \i, q2
+.endr
+.endm
+
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ push {lr}
+ vmov.i16 d7, #0
+.if \identity
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.endif
+.if \scale
+ movw r12, #2896*8
+ vdup.16 d1, r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+.if \identity
+.if \shift == -2
+ identity_4x16_shift2 d0[0]
+.else
+ identity_4x16_shift1 d0[0]
+.endif
+.else
+ blx r4
+.endif
+.if \shift > 0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #\shift
+.endr
+.endif
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31
+ vst1.16 {\i}, [r6, :64]!
+.endr
+
+ pop {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+
+function inv_txfm_add_vert_4x16_neon
+ push {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ blx r5
+ load_add_store_4x16 r6, r7
+ pop {pc}
+endfunc
+
+function inv_txfm_add_16x16_neon
+ sub_sp_align 512
+ ldrh r11, [r10], #2
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+ blx r9
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ push {r4-r11,lr}
+ vpush {q4}
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_identity_16x4_neon
+.else
+ movrel_local r9, inv_txfm_horz_16x4_neon
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon
+.endif
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16
+.else
+ movrel_local r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16_identity
+.else
+ movrel_local r10, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+
+.ifc \variant, identity_
+ vmov.i16 d4, #0
+.irp i, d16, d18, d20, d22
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d17, d19, d21, d23
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+ movw r12, #2*(5793-4096)*8
+ vdup.16 d0, r12
+.irp i, d24, d26, d28, d30
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+.irp i, d25, d27, d29, d31
+ vld1.16 {\i}, [r2, :64]
+ vst1.16 {d4}, [r2, :64]!
+.endr
+
+ identity_4x16_shift1 d0[0]
+.else
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ vld1.16 {d16, d17, d18, d19}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d20, d21, d22, d23}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r2, :128]
+ vst1.16 {q2, q3}, [r2, :128]!
+
+ blx r4
+
+ vswp d17, d20
+ vswp d19, d22
+ vswp d18, d20
+ vswp d19, d21
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5
+ mov r6, r0
+ load_add_store_8x4 r6, r7
+
+.ifc \variant, identity_
+ vmov q8, q12
+ vmov q9, q13
+ vmov q10, q14
+ vmov q11, q15
+.else
+ vswp d25, d28
+ vswp d27, d30
+ vswp d26, d28
+ vswp d27, d29
+ vrshr.s16 q8, q12, #1
+ vrshr.s16 q9, q13, #1
+ vrshr.s16 q10, q14, #1
+ vrshr.s16 q11, q15, #1
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ blx r5
+ add r6, r0, #8
+ load_add_store_8x4 r6, r7
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+ vmov.i16 q2, #0
+
+ mov r11, #32
+ cmp r3, r10
+ blt 1f
+
+ add r6, r2, #16
+.ifc \variant, identity_
+.irp i, q12, q13, q14, q15
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q12, q13, q14, q15, d0[0]
+.else
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r6, :128]
+ vst1.16 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vrshr.s16 q12, q8, #1
+ vrshr.s16 q13, q9, #1
+ vrshr.s16 q14, q10, #1
+ vrshr.s16 q15, q11, #1
+.endif
+ transpose_4x8h q12, q13, q14, q15
+ vswp d27, d29
+ vswp d26, d28
+ vswp d27, d30
+ vswp d25, d28
+
+ b 2f
+1:
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0
+.endr
+2:
+ vmov.i16 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q2}, [r2, :128], r11
+.endr
+.ifc \variant, identity_
+ movw r12, #(5793-4096)*8
+ vdup.16 d0, r12
+ identity_8x4_shift1 q8, q9, q10, q11, d0[0]
+.else
+ blx r4
+.irp i, q8, q9, q10, q11
+ vrshr.s16 \i, \i, #1
+.endr
+.endif
+ transpose_4x8h q8, q9, q10, q11
+ vswp d19, d21
+ vswp d18, d20
+ vswp d19, d22
+ vswp d17, d20
+
+ blx r5
+
+ load_add_store_4x16 r0, r6
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 4
+ movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
+ mov r10, #\eob_half
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
+ movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+ sub_sp_align 256
+
+.irp i, 0, 4
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ cmp r3, r10
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #8*2
+ blx r9
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 8
+ add r7, sp, #(\i*2)
+ mov r8, #32
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128], r8
+.endr
+ blx r5
+
+ add r6, r0, #(\i)
+ load_add_store_8x8 r6, r7
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+ sub_sp_align 256
+
+.irp i, 0, 8
+ add r6, sp, #(\i*8*2)
+.if \i > 0
+ cmp r3, r10
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+
+ vmov.i16 q2, #0
+ movw r12, #2896*8
+ vdup.16 d0, r12
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128]
+ vst1.16 {q2}, [r7, :128], r8
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.ifc \variant, identity_
+ // The identity shl #1 and downshift vrshr #1 cancel out
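+ // (the 8-point identity transform is just out = 2*x, and the rounding
+ // shift in the non-identity path computes (2*x + 1) >> 1 == x, so both
+ // steps can simply be skipped here)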
+.else
+ blx r4
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \j, \j, #1
+.endr
+.endif
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+ vst1.16 {q8, q9}, [r6, :128]!
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+.endr
+ b 2f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+2:
+
+.irp i, 0, 4
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #16
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 8
+ movrel_local r4, inv_\txfm1\()_8h_x8_neon
+ movrel_local r5, inv_\txfm2\()_4h_x16_neon
+.else
+.ifc \txfm1, identity
+ movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon
+.else
+ movrel_local r4, inv_\txfm1\()_4h_x16_neon
+ movrel_local r9, inv_txfm_horz_scale_16x4_neon
+.endif
+ movrel_local r5, inv_\txfm2\()_8h_x8_neon
+.endif
+.if \w == 8
+ mov r10, #\eob_8x8
+.else
+ mov r10, #\eob_4x4
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43, 10
+def_fn_816 \w, \h, identity, identity, 43, 10
+def_fn_816 \w, \h, dct, adst, 43, 10
+def_fn_816 \w, \h, dct, flipadst, 43, 10
+def_fn_816 \w, \h, dct, identity, 8, 4
+def_fn_816 \w, \h, adst, dct, 43, 10
+def_fn_816 \w, \h, adst, adst, 43, 10
+def_fn_816 \w, \h, adst, flipadst, 43, 10
+def_fn_816 \w, \h, flipadst, dct, 43, 10
+def_fn_816 \w, \h, flipadst, adst, 43, 10
+def_fn_816 \w, \h, flipadst, flipadst, 43, 10
+def_fn_816 \w, \h, identity, dct, 64, 4
+def_fn_816 \w, \h, adst, identity, 8, 4
+def_fn_816 \w, \h, flipadst, identity, 8, 4
+def_fn_816 \w, \h, identity, adst, 64, 4
+def_fn_816 \w, \h, identity, flipadst, 64, 4
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+function inv_dct32_odd_4h_x16_neon, export=1
+ movrel_local r12, idct_coeffs, 2*16
+ vld1.16 {q0, q1}, [r12, :128]
+ sub r12, r12, #2*16
+
+ vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a
+ vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a
+ vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a
+ vqrshrn.s32 d16, q2, #12 // t16a
+ vqrshrn.s32 d31, q3, #12 // t31a
+ vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a
+ vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a
+ vqrshrn.s32 d24, q4, #12 // t17a
+ vqrshrn.s32 d23, q2, #12 // t30a
+ vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a
+ vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a
+ vqrshrn.s32 d20, q3, #12 // t18a
+ vqrshrn.s32 d27, q4, #12 // t29a
+ vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a
+ vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a
+ vqrshrn.s32 d28, q2, #12 // t19a
+ vqrshrn.s32 d19, q3, #12 // t28a
+ vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a
+ vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a
+ vqrshrn.s32 d18, q4, #12 // t20a
+ vqrshrn.s32 d29, q2, #12 // t27a
+ vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a
+ vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a
+ vqrshrn.s32 d26, q3, #12 // t21a
+ vqrshrn.s32 d21, q4, #12 // t26a
+ vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a
+ vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a
+ vqrshrn.s32 d22, q2, #12 // t22a
+ vqrshrn.s32 d25, q3, #12 // t25a
+ vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a
+ vqrshrn.s32 d30, q4, #12 // t23a
+ vqrshrn.s32 d17, q2, #12 // t24a
+
+ vld1.16 {q0}, [r12, :128]
+
+ vqsub.s16 d2, d16, d24 // t17
+ vqadd.s16 d16, d16, d24 // t16
+ vqsub.s16 d3, d31, d23 // t30
+ vqadd.s16 d31, d31, d23 // t31
+ vqsub.s16 d24, d28, d20 // t18
+ vqadd.s16 d28, d28, d20 // t19
+ vqadd.s16 d23, d18, d26 // t20
+ vqsub.s16 d18, d18, d26 // t21
+ vqsub.s16 d20, d30, d22 // t22
+ vqadd.s16 d30, d30, d22 // t23
+ vqadd.s16 d26, d17, d25 // t24
+ vqsub.s16 d17, d17, d25 // t25
+ vqsub.s16 d22, d29, d21 // t26
+ vqadd.s16 d29, d29, d21 // t27
+ vqadd.s16 d25, d19, d27 // t28
+ vqsub.s16 d19, d19, d27 // t29
+
+ vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a
+ vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a
+ vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a
+ vqrshrn.s32 d21, q2, #12 // t17a
+ vqrshrn.s32 d27, q3, #12 // t30a
+ vneg.s32 q4, q4 // -> t18a
+ vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a
+ vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a
+ vqrshrn.s32 d19, q4, #12 // t18a
+ vqrshrn.s32 d24, q1, #12 // t29a
+ vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a
+ vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a
+ vqrshrn.s32 d22, q2, #12 // t21a
+ vqrshrn.s32 d18, q3, #12 // t26a
+ vneg.s32 q4, q4 // -> t22a
+ vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a
+ vqrshrn.s32 d17, q4, #12 // t22a
+ vqrshrn.s32 d20, q1, #12 // t25a
+
+ vqsub.s16 d2, d27, d24 // t29
+ vqadd.s16 d27, d27, d24 // t30
+ vqsub.s16 d3, d21, d19 // t18
+ vqadd.s16 d21, d21, d19 // t17
+ vqsub.s16 d24, d16, d28 // t19a
+ vqadd.s16 d16, d16, d28 // t16a
+ vqsub.s16 d19, d30, d23 // t20a
+ vqadd.s16 d30, d30, d23 // t23a
+ vqsub.s16 d28, d17, d22 // t21
+ vqadd.s16 d17, d17, d22 // t22
+ vqadd.s16 d23, d26, d29 // t24a
+ vqsub.s16 d26, d26, d29 // t27a
+ vqadd.s16 d22, d20, d18 // t25
+ vqsub.s16 d20, d20, d18 // t26
+ vqsub.s16 d29, d31, d25 // t28a
+ vqadd.s16 d31, d31, d25 // t31a
+
+ vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a
+ vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a
+ vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19
+ vqrshrn.s32 d18, q2, #12 // t18a
+ vqrshrn.s32 d25, q3, #12 // t29a
+ vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28
+ vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20
+ vqrshrn.s32 d29, q4, #12 // t19
+ vqrshrn.s32 d24, q1, #12 // t28
+ vneg.s32 q2, q2 // -> t20
+ vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27
+ vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a
+ vqrshrn.s32 d26, q2, #12 // t20
+ vqrshrn.s32 d19, q3, #12 // t27
+ vneg.s32 q4, q4 // -> t21a
+ vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a
+ vqrshrn.s32 d20, q4, #12 // t21a
+ vqrshrn.s32 d28, q1, #12 // t26a
+
+ vqsub.s16 d2, d16, d30 // t23
+ vqadd.s16 d16, d16, d30 // t16 = out16
+ vqsub.s16 d3, d31, d23 // t24
+ vqadd.s16 d31, d31, d23 // t31 = out31
+ vqsub.s16 d23, d21, d17 // t22a
+ vqadd.s16 d17, d21, d17 // t17a = out17
+ vqadd.s16 d30, d27, d22 // t30a = out30
+ vqsub.s16 d21, d27, d22 // t25a
+ vqsub.s16 d27, d18, d20 // t21
+ vqadd.s16 d18, d18, d20 // t18 = out18
+ vqadd.s16 d4, d29, d26 // t19a = out19
+ vqsub.s16 d26, d29, d26 // t20a
+ vqadd.s16 d29, d25, d28 // t29 = out29
+ vqsub.s16 d25, d25, d28 // t26
+ vqadd.s16 d28, d24, d19 // t28a = out28
+ vqsub.s16 d24, d24, d19 // t27a
+ vmov d19, d4 // out19
+
+ vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20
+ vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27
+ vqrshrn.s32 d20, q2, #12 // t20
+ vqrshrn.s32 d22, q3, #12 // t27
+
+ vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a
+ vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a
+ vmov d27, d22 // t27
+ vqrshrn.s32 d26, q2, #12 // t26a
+
+ vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22
+ vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25
+ vqrshrn.s32 d21, q3, #12 // t21a
+ vqrshrn.s32 d22, q12, #12 // t22
+ vqrshrn.s32 d25, q2, #12 // t25
+
+ vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a
+ vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a
+ vqrshrn.s32 d23, q2, #12 // t23a
+ vqrshrn.s32 d24, q3, #12 // t24a
+
+ bx lr
+endfunc
+
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ push {lr}
+ vmov.i16 d7, #0
+ lsl r8, r8, #1
+.if \scale
+ movw r12, #2896*8
+ vdup.16 d0, r12
+.endif
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+.if \scale
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct_4h_x16_neon
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+.macro store1 r0, r1, r2, r3
+ vst1.16 {\r0}, [r6, :64]!
+ vst1.16 {\r1}, [r6, :64]!
+ vst1.16 {\r2}, [r6, :64]!
+ vst1.16 {\r3}, [r6, :64]!
+ add r6, r6, #32
+.endm
+ store1 d16, d20, d24, d28
+ store1 d17, d21, d25, d29
+ store1 d18, d22, d26, d30
+ store1 d19, d23, d27, d31
+.purgem store1
+ sub r6, r6, #64*4
+
+ vmov.i16 d7, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64]
+ vst1.16 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in d0[1]
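+ // (presumably because idct_coeffs starts with 2896, 2896*8, ..., so after
+ // inv_dct_4h_x16_neon has reloaded d0, d0[1] again holds the 2896*8 scale
+ // factor set up at the top of this function)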
+ scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct32_odd_4h_x16_neon
+ transpose_4x4h q15, q14, d31, d30, d29, d28
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+ transpose_4x4h q11, q10, d23, d22, d21, d20
+ transpose_4x4h q9, q8, d19, d18, d17, d16
+.macro store2 r0, r1, r2, r3, shift
+ vld1.16 {q0, q1}, [r6, :128]
+ vqsub.s16 d7, d0, \r0
+ vqadd.s16 d0, d0, \r0
+ vqsub.s16 d6, d1, \r1
+ vqadd.s16 d1, d1, \r1
+ vqsub.s16 d5, d2, \r2
+ vqadd.s16 d2, d2, \r2
+ vqsub.s16 d4, d3, \r3
+ vqadd.s16 d3, d3, \r3
+ vrev64.16 q2, q2
+ vrev64.16 q3, q3
+ vrshr.s16 q0, q0, #\shift
+ vrshr.s16 q1, q1, #\shift
+ vrshr.s16 q2, q2, #\shift
+ vrshr.s16 q3, q3, #\shift
+ vst1.16 {q0, q1}, [r6, :128]!
+ vst1.16 {q2, q3}, [r6, :128]!
+.endm
+
+ store2 d31, d27, d23, d19, \shift
+ store2 d30, d26, d22, d18, \shift
+ store2 d29, d25, d21, d17, \shift
+ store2 d28, d24, d20, d16, \shift
+.purgem store2
+ pop {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_4x32_neon
+ push {r10-r11,lr}
+ lsl r8, r8, #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+
+ bl inv_dct_4h_x16_neon
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vst1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ sub r7, r7, r8, lsr #1
+ bl inv_dct32_odd_4h_x16_neon
+
+ neg r9, r8
+ mov r10, r6
+.macro combine r0, r1, r2, r3, op, stride
+ vld1.16 {d4}, [r7, :64], \stride
+ vld1.32 {d2[0]}, [r10, :32], r1
+ vld1.16 {d5}, [r7, :64], \stride
+ vld1.32 {d2[1]}, [r10, :32], r1
+ \op\().s16 d4, d4, \r0
+ vld1.16 {d6}, [r7, :64], \stride
+ vld1.32 {d3[0]}, [r10, :32], r1
+ \op\().s16 d5, d5, \r1
+ vld1.32 {d3[1]}, [r10, :32], r1
+ vrshr.s16 q2, q2, #4
+ \op\().s16 d6, d6, \r2
+ vld1.16 {d7}, [r7, :64], \stride
+ vaddw.u8 q2, q2, d2
+ \op\().s16 d7, d7, \r3
+ vqmovun.s16 d2, q2
+ vrshr.s16 q3, q3, #4
+ vst1.32 {d2[0]}, [r6, :32], r1
+ vaddw.u8 q3, q3, d3
+ vst1.32 {d2[1]}, [r6, :32], r1
+ vqmovun.s16 d3, q3
+ vst1.32 {d3[0]}, [r6, :32], r1
+ vst1.32 {d3[1]}, [r6, :32], r1
+.endm
+ combine d31, d30, d29, d28, vqadd, r8
+ combine d27, d26, d25, d24, vqadd, r8
+ combine d23, d22, d21, d20, vqadd, r8
+ combine d19, d18, d17, d16, vqadd, r8
+ sub r7, r7, r8
+ combine d16, d17, d18, d19, vqsub, r9
+ combine d20, d21, d22, d23, vqsub, r9
+ combine d24, d25, d26, d27, vqsub, r9
+ combine d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+ pop {r10-r11,pc}
+endfunc
+
+const eob_32x32
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+ // Contrary to the others, this one is only ever used in increments of 8x8
+ .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
+ push {r4-r7,lr}
+ vmov.i16 q0, #0
+ movrel_local r5, eob_32x32, 2
+
+ mov r6, #2*32
+1:
+ mov r12, #0
+ movrel_local r4, eob_32x32, 2
+2:
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r6
+.endr
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ load_add_store_8x8 r0, r7, shiftbits=2
+ ldrh lr, [r4], #4
+ sub r0, r0, r1, lsl #3
+ cmp r3, lr
+ add r0, r0, #8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12
+ add r0, r0, r1, lsl #3
+ mls r2, r6, r12, r2
+ add r2, r2, #2*8
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ push {r4-r7,lr}
+ movw r6, #2896*8
+ movw r7, #2*(5793-4096)*8
+ vdup.i16 d0, r6
+ movrel_local r5, eob_16x32\hshort, 2
+ vmov.16 d0[1], r7
+
+ mov r6, #2*\h
+1:
+ mov r12, #0
+ movrel_local r4, eob_16x32\wshort, 2
+2:
+ vmov.i16 q1, #0
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q1}, [r2, :128], r6
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+ // 16x32
+ identity_8x8_shift1 d0[1]
+.else
+ // 32x16
+ shift_8_regs vqshl.s16, 1
+ identity_8x8 d0[1]
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+.if \w == 16
+ load_add_store_8x8 r0, r7, shiftbits=2
+.else
+ load_add_store_8x8 r0, r7, shiftbits=4
+.endif
+ ldrh lr, [r4], #4
+ sub r0, r0, r1, lsl #3
+ cmp r3, lr
+ add r0, r0, #8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12
+ add r0, r0, r1, lsl #3
+ mls r2, r6, r12, r2
+ add r2, r2, #2*8
+ b 1b
+9:
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ push {r4-r5,lr}
+ vmov.i16 q0, #0
+ movrel_local r4, eob_8x32
+
+ mov r12, #2*\h
+1:
+ ldrh lr, [r4], #2
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r12
+.endr
+
+.if \w == 8
+ // 8x32
+ shift_8_regs vrshr.s16, 1
+.endif
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ cmp r3, lr
+.if \w == 8
+ load_add_store_8x8 r0, r5, shiftbits=2
+.else
+ load_add_store_8x8 r0, r5, shiftbits=3
+.endif
+
+ blt 9f
+.if \w == 8
+ sub r2, r2, r12, lsl #3
+ add r2, r2, #2*8
+.else
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #8
+.endif
+ b 1b
+
+9:
+ pop {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 2048
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 2048
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r4, inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, sp, #(\i*16*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*32
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r5, inv_dct_4h_x16_neon
+
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #2*16
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+ movrel_local r10, eob_8x32
+
+ mov r8, #2*32
+ mov r9, #32
+ mov r6, sp
+1:
+ vmov.i16 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q0}, [r2, :128], r8
+.endr
+ ldrh r11, [r10], #2
+ sub r2, r2, r8, lsl #3
+ sub r9, r9, #8
+ add r2, r2, #2*8
+
+ bl inv_dct_8h_x8_neon
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vrshr.s16 \i, \i, #2
+.endr
+
+ transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+ vst1.16 {q8, q9}, [r6, :128]!
+ cmp r3, r11
+ vst1.16 {q10, q11}, [r6, :128]!
+ vst1.16 {q12, q13}, [r6, :128]!
+ vst1.16 {q14, q15}, [r6, :128]!
+
+ bge 1b
+ cmp r9, #0
+ beq 3f
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r9, r9, #8
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4
+ add r6, r0, #(\i)
+ add r7, sp, #(\i*2)
+ mov r8, #8*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+.irp i, 0, 4
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*2)
+.if \i > 0
+ cmp r3, #10
+ blt 1f
+.endif
+ mov r8, #8*2
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+
+2:
+ mov r8, #2*32
+ mov r9, #0
+1:
+ add r6, r0, r9
+ add r7, sp, r9, lsl #1 // #(\i*2), with r9 taking the role of \i
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r7, :128], r8
+.endr
+ add r9, r9, #8
+
+ bl inv_dct_8h_x8_neon
+
+ cmp r9, #32
+
+ load_add_store_8x8 r6, r7
+
+ blt 1b
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ vld1.16 {d0, d1, d2}, [r12, :64]!
+
+ vqrdmulh.s16 d23, d16, d0[1] // t63a
+ vqrdmulh.s16 d16, d16, d0[0] // t32a
+ vqrdmulh.s16 d22, d17, d0[2] // t62a
+ vqrdmulh.s16 d17, d17, d0[3] // t33a
+ vqrdmulh.s16 d21, d18, d1[1] // t61a
+ vqrdmulh.s16 d18, d18, d1[0] // t34a
+ vqrdmulh.s16 d20, d19, d1[2] // t60a
+ vqrdmulh.s16 d19, d19, d1[3] // t35a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t33
+ vqsub.s16 d26, d19, d18 // t34
+ vqadd.s16 d27, d19, d18 // t35
+ vqadd.s16 d28, d20, d21 // t60
+ vqsub.s16 d29, d20, d21 // t61
+ vqsub.s16 d30, d23, d22 // t62
+ vqadd.s16 d31, d23, d22 // t63
+
+ vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a
+ vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a
+ vneg.s32 q2, q2 // t34a
+ vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a
+ vqrshrn.s32 d26, q2, #12 // t34a
+ vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a
+ vqrshrn.s32 d29, q3, #12 // t61a
+ vqrshrn.s32 d25, q4, #12 // t33a
+ vqrshrn.s32 d30, q2, #12 // t62a
+
+ vqadd.s16 d16, d24, d27 // t32a
+ vqsub.s16 d19, d24, d27 // t35a
+ vqadd.s16 d17, d25, d26 // t33
+ vqsub.s16 d18, d25, d26 // t34
+ vqsub.s16 d20, d31, d28 // t60a
+ vqadd.s16 d23, d31, d28 // t63a
+ vqsub.s16 d21, d30, d29 // t61
+ vqadd.s16 d22, d30, d29 // t62
+
+ vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a
+ vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a
+ vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60
+ vqrshrn.s32 d21, q2, #12 // t61a
+ vqrshrn.s32 d18, q3, #12 // t34a
+ vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35
+ vqrshrn.s32 d20, q4, #12 // t60
+ vqrshrn.s32 d19, q2, #12 // t35
+
+ vst1.16 {d16, d17, d18, d19}, [r6, :128]!
+ vst1.16 {d20, d21, d22, d23}, [r6, :128]!
+
+ bx lr
+endfunc
+
+function inv_dct64_step2_neon
+ movrel_local r12, idct_coeffs
+ vld1.16 {d0}, [r12, :64]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ vldr d16, [r6, #2*4*0] // t32a
+ vldr d17, [r9, #2*4*8] // t39a
+ vldr d18, [r9, #2*4*0] // t63a
+ vldr d19, [r6, #2*4*8] // t56a
+ vldr d20, [r6, #2*4*16] // t40a
+ vldr d21, [r9, #2*4*24] // t47a
+ vldr d22, [r9, #2*4*16] // t55a
+ vldr d23, [r6, #2*4*24] // t48a
+
+ vqadd.s16 d24, d16, d17 // t32
+ vqsub.s16 d25, d16, d17 // t39
+ vqadd.s16 d26, d18, d19 // t63
+ vqsub.s16 d27, d18, d19 // t56
+ vqsub.s16 d28, d21, d20 // t40
+ vqadd.s16 d29, d21, d20 // t47
+ vqadd.s16 d30, d23, d22 // t48
+ vqsub.s16 d31, d23, d22 // t55
+
+ vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a
+ vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a
+ vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a
+ vqrshrn.s32 d25, q2, #12 // t56a
+ vqrshrn.s32 d27, q3, #12 // t39a
+ vneg.s32 q4, q4 // t40a
+ vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a
+ vqrshrn.s32 d31, q4, #12 // t40a
+ vqrshrn.s32 d28, q2, #12 // t55a
+
+ vqadd.s16 d16, d24, d29 // t32a
+ vqsub.s16 d19, d24, d29 // t47a
+ vqadd.s16 d17, d27, d31 // t39
+ vqsub.s16 d18, d27, d31 // t40
+ vqsub.s16 d20, d26, d30 // t48a
+ vqadd.s16 d23, d26, d30 // t63a
+ vqsub.s16 d21, d25, d28 // t55
+ vqadd.s16 d22, d25, d28 // t56
+
+ vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a
+ vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a
+ vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47
+ vqrshrn.s32 d18, q2, #12 // t40a
+ vqrshrn.s32 d21, q3, #12 // t55a
+ vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48
+ vqrshrn.s32 d19, q4, #12 // t47
+ vqrshrn.s32 d20, q2, #12 // t48
+
+ vstr d16, [r6, #2*4*0] // t32a
+ vstr d17, [r9, #2*4*0] // t39
+ vstr d18, [r6, #2*4*8] // t40a
+ vstr d19, [r9, #2*4*8] // t47
+ vstr d20, [r6, #2*4*16] // t48
+ vstr d21, [r9, #2*4*16] // t55a
+ vstr d22, [r6, #2*4*24] // t56
+ vstr d23, [r9, #2*4*24] // t63a
+
+ add r6, r6, #2*4
+ sub r9, r9, #2*4
+ cmp r6, r9
+ blt 1b
+ bx lr
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+ vld1.16 {\i}, [\src, :64]
+ vst1.16 {\zero}, [\src, :64], \strd
+.else
+ vld1.16 {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+ vst1.16 {q8, q9}, [\dst, :128]!
+ vst1.16 {q10, q11}, [\dst, :128]!
+ vst1.16 {q12, q13}, [\dst, :128]!
+ vst1.16 {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8
+.irp i, q12, q13, q14, q15
+ vmov.i16 \i, #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond
+.if \cond
+ vmov.i16 \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+ movw \gpr, \val
+ vdup.16 \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond
+.if \cond
+ vst1.16 \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4h_x64_neon, export=1
+ mov r6, sp
+
+ push {r10-r11,lr}
+
+ lsl r8, r8, #2
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ add r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct_4h_x16_neon
+
+ store16 r6
+
+ movdup_if d0, r12, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ lsr r8, r8, #1
+ sub r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct32_odd_4h_x16_neon
+
+ add r10, r6, #8*15
+ sub r6, r6, #8*16
+
+ mov r9, #-8
+
+.macro store_addsub r0, r1, r2, r3
+ vld1.16 {d2}, [r6, :64]!
+ vld1.16 {d3}, [r6, :64]!
+ vqadd.s16 d6, d2, \r0
+ vqsub.s16 \r0, d2, \r0
+ vld1.16 {d4}, [r6, :64]!
+ vqadd.s16 d7, d3, \r1
+ vqsub.s16 \r1, d3, \r1
+ vld1.16 {d5}, [r6, :64]!
+ vqadd.s16 d2, d4, \r2
+ sub r6, r6, #8*4
+ vqsub.s16 \r2, d4, \r2
+ vst1.16 {d6}, [r6, :64]!
+ vst1.16 {\r0}, [r10, :64], r9
+ vqadd.s16 d3, d5, \r3
+ vqsub.s16 \r3, d5, \r3
+ vst1.16 {d7}, [r6, :64]!
+ vst1.16 {\r1}, [r10, :64], r9
+ vst1.16 {d2}, [r6, :64]!
+ vst1.16 {\r2}, [r10, :64], r9
+ vst1.16 {d3}, [r6, :64]!
+ vst1.16 {\r3}, [r10, :64], r9
+.endm
+ store_addsub d31, d30, d29, d28
+ store_addsub d27, d26, d25, d24
+ store_addsub d23, d22, d21, d20
+ store_addsub d19, d18, d17, d16
+.purgem store_addsub
+
+ add r6, r6, #2*4*16
+
+ movrel_local r12, idct64_coeffs
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ add r9, r7, r8, lsl #4 // offset 16
+ add r10, r7, r8, lsl #3 // offset 8
+ sub r9, r9, r8 // offset 15
+ sub r11, r10, r8 // offset 7
+ vld1.16 {d16}, [r7, :64] // in1 (offset 0)
+ vld1.16 {d17}, [r9, :64] // in31 (offset 15)
+ vld1.16 {d18}, [r10, :64] // in17 (offset 8)
+ vld1.16 {d19}, [r11, :64] // in15 (offset 7)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ add r7, r7, r8, lsl #2 // offset 4
+ sub r9, r9, r8, lsl #2 // offset 11
+ sub r10, r7, r8 // offset 3
+ add r11, r9, r8 // offset 12
+ vld1.16 {d16}, [r10, :64] // in7 (offset 3)
+ vld1.16 {d17}, [r11, :64] // in25 (offset 12)
+ vld1.16 {d18}, [r9, :64] // in23 (offset 11)
+ vld1.16 {d19}, [r7, :64] // in9 (offset 4)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8, lsl #1 // offset 1
+ sub r9, r9, r8, lsl #1 // offset 9
+ add r10, r10, r8 // offset 2
+ add r9, r9, r8 // offset 10
+ add r7, r7, r8 // offset 5
+ add r11, r11, r8 // offset 13
+ vld1.16 d16, [r10, :64] // in5 (offset 2)
+ vld1.16 d17, [r11, :64] // in27 (offset 13)
+ vld1.16 d18, [r9, :64] // in21 (offset 10)
+ vld1.16 d19, [r7, :64] // in11 (offset 5)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, #2896*8, \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8 // offset 1
+ sub r9, r9, r8 // offset 9
+ add r11, r11, r8 // offset 14
+ add r7, r7, r8 // offset 6
+ vld1.16 d16, [r10, :64] // in3 (offset 1)
+ vld1.16 d17, [r11, :64] // in29 (offset 14)
+ vld1.16 d18, [r9, :64] // in19 (offset 9)
+ vld1.16 d19, [r7, :64] // in13 (offset 6)
+ vst1_if d7, [r10, :64], \clear
+ vst1_if d7, [r11, :64], \clear
+ vst1_if d7, [r9, :64], \clear
+ vst1_if d7, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+
+ sub r6, r6, #2*4*32
+ add r9, r6, #2*4*7
+
+ bl inv_dct64_step2_neon
+
+ pop {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+function inv_txfm_horz_dct_64x4_neon
+ vdup.16 q3, r9
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, #2*56
+
+ push {r10-r11,lr}
+
+ mov r10, #2*64
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q15, q14, d31, d30, d29, d28
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q13, q12, d27, d26, d25, d24
+
+.macro store_addsub src0, src1, src2, src3
+ vqsub.s16 d3, \src0, \src1
+ vqsub.s16 d2, \src2, \src3
+ vqadd.s16 d0, \src0, \src1
+ vqadd.s16 d1, \src2, \src3
+ vrshl.s16 q1, q1, q3
+ vrshl.s16 q0, q0, q3
+ vrev64.16 q1, q1
+ vst1.16 {q0}, [r6, :128], r10
+ vst1.16 {q1}, [r9, :128], r10
+.endm
+ store_addsub d16, d31, d20, d27
+ store_addsub d17, d30, d21, d26
+ store_addsub d18, d29, d22, d25
+ store_addsub d19, d28, d23, d24
+.purgem store_addsub
+ sub r6, r6, r10, lsl #2
+ sub r9, r9, r10, lsl #2
+ add r6, r6, #16
+ sub r9, r9, #16
+
+ cmp r7, r8
+ blt 1b
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_vert_dct_4x64_neon
+ lsl r8, r8, #1
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, r1, lsl #6
+ sub r9, r9, r1
+
+ push {r10-r11,lr}
+
+ neg r10, r1
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+
+.macro add_dest_addsub src0, src1, src2, src3
+ vld1.32 {d0[0]}, [r6, :32], r1
+ vld1.32 {d1[0]}, [r9, :32], r10
+ vqadd.s16 d4, \src0, \src1
+ vld1.32 {d0[1]}, [r6, :32]
+ vqadd.s16 d5, \src2, \src3
+ vld1.32 {d1[1]}, [r9, :32]
+ vqsub.s16 d6, \src0, \src1
+ vqsub.s16 d7, \src2, \src3
+ sub r6, r6, r1
+ sub r9, r9, r10
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+ vaddw.u8 q2, q2, d0
+ vaddw.u8 q3, q3, d1
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q3
+ vst1.32 {d0[0]}, [r6, :32], r1
+ vst1.32 {d1[0]}, [r9, :32], r10
+ vst1.32 {d0[1]}, [r6, :32], r1
+ vst1.32 {d1[1]}, [r9, :32], r10
+.endm
+ add_dest_addsub d16, d31, d17, d30
+ add_dest_addsub d18, d29, d19, d28
+ add_dest_addsub d20, d27, d21, d26
+ add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+ cmp r7, r8
+ blt 1b
+
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_clear_scale_4h_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-1 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 32*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r7, r5, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 32*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 64*16*2+64*4*2
+ add r4, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+
+.irp i, 0, 4, 8, 12
+ add r6, r4, #(\i*64*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_dct_clear_4h_x64_neon
+ add r6, r4, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+ movrel_local r5, inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i)
+ add r7, r4, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 64*16*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4}
+
+ sub_sp_align 16*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+
+ movrel_local r4, inv_dct_4h_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r5, #(\i*16*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 28
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r7, r5, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_dct_4h_x64_neon
+ add r6, r0, #(\i)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 16*32*2+64*4*2
+ vpop {q4}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/itx16.S b/third_party/dav1d/src/arm/32/itx16.S
new file mode 100644
index 0000000000..aa6c272e71
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/itx16.S
@@ -0,0 +1,3625 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
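+//
+// As an illustrative sketch only (not taken from the sources), a caller is
+// assumed to use one of these functions roughly as:
+//     itxfm_add(dst, dst_stride, coeff, eob);
+// where pixel and coef are dav1d's bitdepth-dependent types, eob is the
+// index of the last nonzero coefficient, and the function both adds the
+// inverse transform to dst and clears the coefficients it consumes.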
+
+// Most of the functions use the following register layout:
+// r0-r3 external parameters
+// r4 function pointer to first transform
+// r5 function pointer to second transform
+// r6 output parameter for helper function
+// r7 input parameter for helper function
+// r8 input stride for helper function
+// r9 scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+// scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3 multiplication coefficients
+// d4-d7 scratch registers
+// d8-d15 unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+// A macro for cases where a thumb mov can express the constant in one
+// instruction, while arm mode requires a separate movw+movt pair (two
+// instructions).
+.macro mov_const reg, val
+#if CONFIG_THUMB
+ mov.w \reg, #\val
+#else
+ movw \reg, #((\val) & 0xffff)
+ movt \reg, #(((\val) >> 16) & 0xffff)
+#endif
+.endm
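+
+// For example (illustrative only), the `mov_const r12, 2896*8*(1<<16)` used
+// further below is a single mov.w in Thumb mode, while in ARM mode it
+// expands to:
+//     movw r12, #((2896*8*(1<<16)) & 0xffff)
+//     movt r12, #(((2896*8*(1<<16)) >> 16) & 0xffff)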
+
+const idct_coeffs, align=4
+ // idct4
+ .int 2896, 2896*8*(1<<16), 1567, 3784
+ // idct8
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ .int 1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .int 2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4
+ .int 4091, 201, 3973, 995
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
+.macro vmul_vmla d0, s0, s1, c0, c1
+ vmul.i32 \d0, \s0, \c0
+ vmla.i32 \d0, \s1, \c1
+.endm
+
+.macro vmul_vmls d0, s0, s1, c0, c1
+ vmul.i32 \d0, \s0, \c0
+ vmls.i32 \d0, \s1, \c1
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7
+ vqrdmulh.s32 \r0, \r0, \c
+ vqrdmulh.s32 \r1, \r1, \c
+.ifnb \r2
+ vqrdmulh.s32 \r2, \r2, \c
+ vqrdmulh.s32 \r3, \r3, \c
+.endif
+.ifnb \r4
+ vqrdmulh.s32 \r4, \r4, \c
+ vqrdmulh.s32 \r5, \r5, \c
+ vqrdmulh.s32 \r6, \r6, \c
+ vqrdmulh.s32 \r7, \r7, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
+.ifnb \load
+ vld1.16 {\load}, [\src, :128], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ vqadd.s16 \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+ vmax.s16 \max, \max, q6
+.endif
+.ifnb \min
+ vmin.s16 \min, \min, q7
+.endif
+.ifnb \store
+ vst1.16 {\store}, [\dst, :128], r1
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store q0, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store q1, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits
+ load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits
+ load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits
+ load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits
+ load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits
+ load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits
+ load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits
+ load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits
+ load_add_store , , , , , q15, q14, \dst, \src, \shiftbits
+ load_add_store , , , , , , q15, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ load_add_store q0, q8, , , , , , \dst, \src, \shiftbits
+ load_add_store q1, q9, , , , , , \dst, \src, \shiftbits
+ load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits
+ load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits
+ load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits
+ load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits
+ load_add_store , , , , , q11, q10, \dst, \src, \shiftbits
+ load_add_store , , , , , , q11, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load1, load2, shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4
+.ifnb \load1
+ vld1.16 {\load1}, [\src, :64], r1
+.endif
+.ifnb \shift
+ vrshr.s16 \shift, \shift, #\shiftbits
+.endif
+.ifnb \load2
+ vld1.16 {\load2}, [\src, :64], r1
+.endif
+.ifnb \addsrc
+ vqadd.s16 \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+ vmax.s16 \max, \max, q6
+.endif
+.ifnb \store1
+ vst1.16 {\store1}, [\dst, :64], r1
+.endif
+.ifnb \min
+ vmin.s16 \min, \min, q7
+.endif
+.ifnb \store2
+ vst1.16 {\store2}, [\dst, :64], r1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ mov \src, \dst
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src
+ load_add_store4 d2, d3, q9, , , , , , , \dst, \src
+ load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src
+ load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src
+ load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src
+ load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src
+ load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src
+ load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, \dst, \src
+ load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src
+ load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src
+ load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src
+ load_add_store4 , , , , , , q15, d28, d29, \dst, \src
+ load_add_store4 , , , , , , , d30, d31, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ mov \src, \dst
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits
+ load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits
+ load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits
+ load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits
+ load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , q11, d20, d21, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_4x4 dst, src, shiftbits=4
+ mov \src, \dst
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+ mov \src, \dst
+ load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits
+ load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits
+ load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits
+ load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits
+ load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits
+ load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits
+.endm
+
+.macro idct_dc w, h, shift
+ cmp r3, #0
+ bne 1f
+ vmov.i16 q14, #0
+ mov_const r12, 2896*8*(1<<16)
+ vld1.32 {d24[], d25[]}, [r2, :32]
+ vdup.32 d0, r12
+ vqrdmulh.s32 q13, q12, d0[0]
+ vst1.32 {d28[0]}, [r2, :32]
+.if (\w == 2*\h) || (2*\w == \h)
+ vqrdmulh.s32 q13, q13, d0[0]
+.endif
+.if \shift > 0
+ vqrshrn.s32 d24, q13, #\shift
+ vqrshrn.s32 d25, q13, #\shift
+.else
+ vqmovn.s32 d24, q13
+ vqmovn.s32 d25, q13
+.endif
+ vqrdmulh.s16 q12, q12, d0[1]
+ mov r3, #\h
+ vrshr.s16 q12, q12, #4
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vld1.16 {d2}, [r0, :64], r1
+ vld1.16 {d3}, [r0, :64], r1
+ subs r3, r3, #4
+ vqadd.s16 q0, q0, q12
+ sub r0, r0, r1, lsl #2
+ vqadd.s16 q1, q1, q12
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmin.s16 q0, q0, q15
+ vst1.16 {d0}, [r0, :64], r1
+ vmin.s16 q1, q1, q15
+ vst1.16 {d1}, [r0, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w8_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0}, [r0, :128], r1
+ subs r3, r3, #4
+ vld1.16 {q1}, [r0, :128], r1
+ vqadd.s16 q0, q0, q12
+ vld1.16 {q2}, [r0, :128], r1
+ vqadd.s16 q1, q1, q12
+ vld1.16 {q3}, [r0, :128], r1
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ sub r0, r0, r1, lsl #2
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vst1.16 {q0}, [r0, :128], r1
+ vmin.s16 q2, q2, q15
+ vst1.16 {q1}, [r0, :128], r1
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w16_neon
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128], r1
+ subs r3, r3, #2
+ vld1.16 {q2, q3}, [r0, :128], r1
+ vqadd.s16 q0, q0, q12
+ vqadd.s16 q1, q1, q12
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ sub r0, r0, r1, lsl #1
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w32_neon
+ sub r1, r1, #32
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.16 {q2, q3}, [r0, :128]
+ vqadd.s16 q0, q0, q12
+ vqadd.s16 q1, q1, q12
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ sub r0, r0, #32
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vst1.16 {q0, q1}, [r0, :128]!
+ vmin.s16 q3, q3, q15
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function idct_dc_w64_neon
+ sub r1, r1, #96
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r3, r3, #1
+ vld1.16 {q2, q3}, [r0, :128]!
+ vqadd.s16 q0, q0, q12
+ vld1.16 {q8, q9}, [r0, :128]!
+ vqadd.s16 q1, q1, q12
+ vld1.16 {q10, q11}, [r0, :128]
+ vqadd.s16 q2, q2, q12
+ vqadd.s16 q3, q3, q12
+ vqadd.s16 q8, q8, q12
+ vqadd.s16 q9, q9, q12
+ vqadd.s16 q10, q10, q12
+ vqadd.s16 q11, q11, q12
+ sub r0, r0, #96
+ vmax.s16 q0, q0, q14
+ vmax.s16 q1, q1, q14
+ vmax.s16 q2, q2, q14
+ vmax.s16 q3, q3, q14
+ vmax.s16 q8, q8, q14
+ vmax.s16 q9, q9, q14
+ vmax.s16 q10, q10, q14
+ vmax.s16 q11, q11, q14
+ vmin.s16 q0, q0, q15
+ vmin.s16 q1, q1, q15
+ vmin.s16 q2, q2, q15
+ vmin.s16 q3, q3, q15
+ vmin.s16 q8, q8, q15
+ vst1.16 {q0, q1}, [r0, :128]!
+ vmin.s16 q9, q9, q15
+ vst1.16 {q2, q3}, [r0, :128]!
+ vmin.s16 q10, q10, q15
+ vst1.16 {q8, q9}, [r0, :128]!
+ vmin.s16 q11, q11, q15
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bgt 1b
+ bx lr
+endfunc
+
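+// One pass of the inverse 4-point Walsh-Hadamard transform (the lossless
+// wht_wht 4x4 path), operating on 32-bit coefficients in q8-q11.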
+.macro iwht4
+ vadd.i32 q8, q8, q9
+ vsub.i32 q13, q10, q11
+ vsub.i32 q12, q8, q13
+ vshr.s32 q12, q12, #1
+ vsub.i32 q10, q12, q9
+ vsub.i32 q9, q12, q11
+ vadd.i32 q11, q13, q10
+ vsub.i32 q8, q8, q9
+.endm
+
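+// 4-point inverse DCT on 32-bit coefficients; the 4s variant works on four
+// lanes (q registers), the 2s variant on two lanes (d registers). The
+// vmul_vmla/vmul_vmls helpers form the rotation butterflies from the
+// idct_coeffs constants, followed by a rounding shift right by 12.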
+.macro idct_4s_x4 r0, r1, r2, r3
+ vmul_vmla q4, \r1, \r3, d1[1], d1[0]
+ vmul_vmla q2, \r0, \r2, d0[0], d0[0]
+ vmul_vmls q3, \r1, \r3, d1[0], d1[1]
+ vmul_vmls q5, \r0, \r2, d0[0], d0[0]
+ vrshr.s32 q4, q4, #12
+ vrshr.s32 q2, q2, #12
+ vrshr.s32 q3, q3, #12
+ vrshr.s32 q5, q5, #12
+ vqadd.s32 \r0, q2, q4
+ vqsub.s32 \r3, q2, q4
+ vqadd.s32 \r1, q5, q3
+ vqsub.s32 \r2, q5, q3
+.endm
+
+.macro idct_2s_x4 r0, r1, r2, r3
+ vmul_vmla d6, \r1, \r3, d1[1], d1[0]
+ vmul_vmla d4, \r0, \r2, d0[0], d0[0]
+ vmul_vmls d5, \r1, \r3, d1[0], d1[1]
+ vmul_vmls d7, \r0, \r2, d0[0], d0[0]
+ vrshr.s32 d6, d6, #12
+ vrshr.s32 d4, d4, #12
+ vrshr.s32 d5, d5, #12
+ vrshr.s32 d7, d7, #12
+ vqadd.s32 \r0, d4, d6
+ vqsub.s32 \r3, d4, d6
+ vqadd.s32 \r1, d7, d5
+ vqsub.s32 \r2, d7, d5
+.endm
+
+function inv_dct_4s_x4_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {d0, d1}, [r12, :128]
+ idct_4s_x4 q8, q9, q10, q11
+ bx lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel_local r12, iadst4_coeffs
+ vld1.32 {d0, d1}, [r12, :128]
+
+ vsub.i32 q1, q8, q10
+ vmul.i32 q2, q8, d0[0]
+ vmla.i32 q2, q10, d0[1]
+ vmla.i32 q2, q11, d1[0]
+ vmul.i32 q4, q9, d1[1]
+ vadd.i32 q1, q1, q11
+ vmul.i32 q3, q8, d1[0]
+ vmls.i32 q3, q10, d0[0]
+ vmls.i32 q3, q11, d0[1]
+
+ vadd.i32 \o3, q2, q3
+ vmul.i32 \o2, q1, d1[1]
+ vadd.i32 \o0, q2, q4
+ vadd.i32 \o1, q3, q4
+ vsub.i32 \o3, \o3, q4
+
+ vrshr.s32 \o0, \o0, #12
+ vrshr.s32 \o2, \o2, #12
+ vrshr.s32 \o1, \o1, #12
+ vrshr.s32 \o3, \o3, #12
+.endm
+
+function inv_adst_4s_x4_neon
+ iadst_4x4 q8, q9, q10, q11
+ bx lr
+endfunc
+
+function inv_flipadst_4s_x4_neon
+ iadst_4x4 q11, q10, q9, q8
+ bx lr
+endfunc
+
+function inv_identity_4s_x4_neon
+ mov r12, #0
+ movt r12, #(5793-4096)*8
+ vdup.32 d0, r12
+ vqrdmulh.s32 q1, q8, d0[0]
+ vqrdmulh.s32 q2, q9, d0[0]
+ vqrdmulh.s32 q3, q10, d0[0]
+ vqrdmulh.s32 q4, q11, d0[0]
+ vqadd.s32 q8, q8, q1
+ vqadd.s32 q9, q9, q2
+ vqadd.s32 q10, q10, q3
+ vqadd.s32 q11, q11, q4
+ bx lr
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q5}
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.32 {q14, q15}, [r2, :128]!
+ vshr.s16 q8, q8, #2
+ vld1.32 {q10, q11}, [r2, :128]
+ vshr.s16 q9, q9, #2
+ vshr.s16 q10, q10, #2
+ vshr.s16 q11, q11, #2
+
+ iwht4
+
+ vst1.32 {q14, q15}, [r2, :128]
+ transpose_4x4s q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+
+ iwht4
+
+ vld1.16 {d0}, [r0, :64], r1
+ vqmovn.s32 d16, q8
+ vld1.16 {d1}, [r0, :64], r1
+ vqmovn.s32 d17, q9
+ vld1.16 {d2}, [r0, :64], r1
+ vqmovn.s32 d18, q10
+ vld1.16 {d3}, [r0, :64], r1
+ vqmovn.s32 d19, q11
+
+ b L(itx_4x4_end)
+endfunc
+
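+// Generic 4x4 two-pass path: r4 holds the 32-bit first-pass transform and r5
+// the 16-bit second-pass transform (set up by def_fn_4x4 below); the
+// coefficient buffer is zeroed as it is consumed.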
+function inv_txfm_add_4x4_neon
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]!
+ vld1.32 {q10, q11}, [r2, :128]
+ vst1.16 {q14, q15}, [r2, :128]
+
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ blx r5
+
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d1}, [r0, :64], r1
+ vrshr.s16 q8, q8, #4
+ vld1.16 {d2}, [r0, :64], r1
+ vrshr.s16 q9, q9, #4
+ vld1.16 {d3}, [r0, :64], r1
+
+L(itx_4x4_end):
+ vmvn.i16 q15, #0xfc00 // 0x3ff
+ sub r0, r0, r1, lsl #2
+ vqadd.s16 q8, q8, q0
+ vqadd.s16 q9, q9, q1
+ vmax.s16 q8, q8, q14
+ vmax.s16 q9, q9, q14
+ vmin.s16 q8, q8, q15
+ vmin.s16 q9, q9, q15
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r0, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r0, :64], r1
+
+ vpop {q4-q5}
+ pop {r4-r5,pc}
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q4-q5}
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cmp r3, #0
+ bne 1f
+ vmov.i16 q14, #0
+ mov_const r12, 2896*8*(1<<16)
+ vld1.32 {d16[], d17[]}, [r2, :32]
+ vdup.32 d4, r12
+ vst1.32 {d28[0]}, [r2, :32]
+ vqrdmulh.s32 q8, q8, d4[0]
+ vld1.16 {d0}, [r0, :64], r1
+ vqmovn.s32 d20, q8
+ vqmovn.s32 d21, q8
+ vld1.16 {d1}, [r0, :64], r1
+ vqrdmulh.s16 q10, q10, d4[1]
+ vld1.16 {d2}, [r0, :64], r1
+ vrshr.s16 q8, q10, #4
+ vld1.16 {d3}, [r0, :64], r1
+ vrshr.s16 q9, q10, #4
+ b L(itx_4x4_end)
+1:
+.endif
+ movrel_local r4, inv_\txfm1\()_4s_x4_neon
+ movrel r5, X(inv_\txfm2\()_4h_x4_neon)
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
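+// 8-point inverse DCT: the even inputs reuse the 4-point transform, the odd
+// inputs go through the extra rotations, with intermediates clamped to the
+// 18-bit row range (row_clip_min/max) between stages.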
+.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4s_x4 \r0, \r2, \r4, \r6
+
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
+ vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
+ vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
+ vrshr.s32 \r1, q2, #12 // t4a
+ vrshr.s32 \r7, q3, #12 // t7a
+ vrshr.s32 \r3, q6, #12 // t5a
+ vrshr.s32 \r5, q7, #12 // t6a
+
+ vqadd.s32 q2, \r1, \r3 // t4
+ vqsub.s32 \r1, \r1, \r3 // t5a
+ vqadd.s32 q3, \r7, \r5 // t7
+ vqsub.s32 \r3, \r7, \r5 // t6a
+
+.irp r, q2, \r1, q3, \r3
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q2, \r1, q3, \r3
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
+ vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
+ vrshr.s32 q7, q7, #12 // t5
+ vrshr.s32 q5, q6, #12 // t6
+
+ vqsub.s32 \r7, \r0, q3 // out7
+ vqadd.s32 \r0, \r0, q3 // out0
+ vqadd.s32 \r1, \r2, q5 // out1
+ vqsub.s32 q6, \r2, q5 // out6
+ vqadd.s32 \r2, \r4, q7 // out2
+ vqsub.s32 \r5, \r4, q7 // out5
+ vqadd.s32 \r3, \r6, q2 // out3
+ vqsub.s32 \r4, \r6, q2 // out4
+ vmov \r6, q6 // out6
+.endm
+
+.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_2s_x4 \r0, \r2, \r4, \r6
+
+ vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
+ vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
+ vmul_vmla d7, \r5, \r3, d3[1], d3[0] // -> t6a
+ vrshr.s32 \r1, d4, #12 // t4a
+ vrshr.s32 \r7, d5, #12 // t7a
+ vrshr.s32 \r3, d6, #12 // t5a
+ vrshr.s32 \r5, d7, #12 // t6a
+
+ vqadd.s32 d4, \r1, \r3 // t4
+ vqsub.s32 \r1, \r1, \r3 // t5a
+ vqadd.s32 d5, \r7, \r5 // t7
+ vqsub.s32 \r3, \r7, \r5 // t6a
+
+.irp r, d4, \r1, d5, \r3
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, \r1, d5, \r3
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
+ vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
+ vrshr.s32 d6, d6, #12 // t5
+ vrshr.s32 d7, d7, #12 // t6
+
+ vqsub.s32 \r7, \r0, d5 // out7
+ vqadd.s32 \r0, \r0, d5 // out0
+ vqadd.s32 \r1, \r2, d7 // out1
+ vqsub.s32 d7, \r2, d7 // out6
+ vqadd.s32 \r2, \r4, d6 // out2
+ vqsub.s32 \r5, \r4, d6 // out5
+ vqadd.s32 \r3, \r6, d4 // out3
+ vqsub.s32 \r4, \r6, d4 // out4
+ vmov \r6, d7 // out6
+.endm
+
+function inv_dct_4s_x8_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0, q1}, [r12, :128]
+ idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15
+ bx lr
+endfunc
+
+.macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
+ movrel_local r12, iadst8_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vmul_vmla q2, q15, q8, d0[0], d0[1]
+ vmul_vmls q3, q15, q8, d0[1], d0[0]
+ vmul_vmla q4, q13, q10, d1[0], d1[1]
+ vrshr.s32 q8, q2, #12 // t0a
+ vrshr.s32 q15, q3, #12 // t1a
+ vmul_vmls q5, q13, q10, d1[1], d1[0]
+ vmul_vmla q6, q11, q12, d2[0], d2[1]
+ vrshr.s32 q10, q4, #12 // t2a
+ vrshr.s32 q13, q5, #12 // t3a
+ vmul_vmls q7, q11, q12, d2[1], d2[0]
+ vmul_vmla q2, q9, q14, d3[0], d3[1]
+ vrshr.s32 q12, q6, #12 // t4a
+ vrshr.s32 q11, q7, #12 // t5a
+ vmul_vmls q3, q9, q14, d3[1], d3[0]
+ vrshr.s32 q14, q2, #12 // t6a
+ vrshr.s32 q9, q3, #12 // t7a
+
+ vld1.32 {q0}, [r12]
+
+ vqadd.s32 q2, q8, q12 // t0
+ vqsub.s32 q3, q8, q12 // t4
+ vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vqadd.s32 q4, q15, q11 // t1
+ vqsub.s32 q5, q15, q11 // t5
+ vqadd.s32 q6, q10, q14 // t2
+ vqsub.s32 q7, q10, q14 // t6
+ vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ vqadd.s32 q10, q13, q9 // t3
+ vqsub.s32 q11, q13, q9 // t7
+
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmax.s32 \r, \r, q14
+.endr
+
+ vmul_vmla q8, q3, q5, d1[1], d1[0]
+ vmul_vmls q13, q3, q5, d1[0], d1[1]
+ vmul_vmls q14, q11, q7, d1[1], d1[0]
+
+ vrshr.s32 q3, q8, #12 // t4a
+ vrshr.s32 q5, q13, #12 // t5a
+
+ vmul_vmla q8, q11, q7, d1[0], d1[1]
+
+ vrshr.s32 q7, q14, #12 // t6a
+ vrshr.s32 q11, q8, #12 // t7a
+
+ vqadd.s32 \r0, q2, q6 // out0
+ vqsub.s32 q2, q2, q6 // t2
+ vqadd.s32 \r7, q4, q10 // out7
+ vqsub.s32 q4, q4, q10 // t3
+
+ vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ vqadd.s32 \r1, q3, q7 // out1
+ vqsub.s32 q3, q3, q7 // t6
+ vqadd.s32 \r6, q5, q11 // out6
+ vqsub.s32 q5, q5, q11 // t7
+
+ // The output registers computed above are not clipped, as they will be
+ // downshifted and narrowed afterwards anyway; only the temporaries that
+ // still feed the remaining multiplies are clipped here.
+.irp r, q2, q4, q3, q5
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q4, q3, q5
+ vmax.s32 \r, \r, q10
+.endr
+
+ vqneg.s32 \r7, \r7 // out7
+ vqneg.s32 \r1, \r1 // out1
+
+ vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
+ vmul_vmls q6, q2, q4, d0[0], d0[0] // -> out4 (q12 or q11)
+ vmul_vmls q12, q3, q5, d0[0], d0[0] // -> out5 (q13 or q10)
+ vrshr.s32 q2, q10, #12 // out3
+ vmul_vmla q10, q3, q5, d0[0], d0[0] // -> out2 (q10 or q13)
+ vrshr.s32 q3, q12, #12 // out5
+ vrshr.s32 \r2, q10, #12 // out2 (q10 or q13)
+ vrshr.s32 \r4, q6, #12 // out4 (q12 or q11)
+
+ vqneg.s32 \r3, q2 // out3
+ vqneg.s32 \r5, q3 // out5
+.endm
+
+function inv_adst_4s_x8_neon
+ iadst_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15
+ bx lr
+endfunc
+
+function inv_flipadst_4s_x8_neon
+ iadst_4s_x8 q15, q14, q13, q12, q11, q10, q9, q8
+ bx lr
+endfunc
+
+function inv_identity_4s_x8_neon
+ vqshl.s32 q8, q8, #1
+ vqshl.s32 q9, q9, #1
+ vqshl.s32 q10, q10, #1
+ vqshl.s32 q11, q11, #1
+ vqshl.s32 q12, q12, #1
+ vqshl.s32 q13, q13, #1
+ vqshl.s32 q14, q14, #1
+ vqshl.s32 q15, q15, #1
+ bx lr
+endfunc
+
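+// 8x8 add path: the 32-bit first pass handles the left 4 columns, and the
+// right 4 columns only if the eob in r3 reaches the threshold in r10
+// (otherwise they are treated as zero). Results are narrowed to 16 bits with
+// a 1-bit rounding shift and transposed before the 16-bit second pass.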
+function inv_txfm_add_8x8_neon
+ vmov.i32 q0, #0
+ mov r7, #8*4
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ blx r4
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q12, #1
+ vqrshrn.s32 d18, q9, #1
+ vqrshrn.s32 d19, q13, #1
+ vqrshrn.s32 d20, q10, #1
+ vqrshrn.s32 d21, q14, #1
+ vqrshrn.s32 d22, q11, #1
+ vqrshrn.s32 d23, q15, #1
+
+ cmp r3, r10
+ transpose_4x8h q8, q9, q10, q11
+
+ blt 1f
+
+ sub r2, r2, r7, lsl #3
+ vpush {q8-q11}
+
+ add r2, r2, #16
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ blx r4
+
+ vqrshrn.s32 d31, q15, #1
+ vqrshrn.s32 d30, q11, #1
+ vqrshrn.s32 d29, q14, #1
+ vqrshrn.s32 d28, q10, #1
+ vqrshrn.s32 d27, q13, #1
+ vqrshrn.s32 d26, q9, #1
+ vqrshrn.s32 d25, q12, #1
+ vqrshrn.s32 d24, q8, #1
+ vpop {q8-q11}
+
+ transpose_4x8h q12, q13, q14, q15
+
+ b 2f
+
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+
+2:
+ blx r5
+
+ load_add_store_8x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ push {r4-r5,r7,r10,lr}
+ vpush {q4-q7}
+ mov r10, #\eob_half
+ movrel_local r4, inv_\txfm1\()_4s_x8_neon
+ movrel r5, X(inv_\txfm2\()_8h_x8_neon)
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
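+// Rectangular 8x4/4x8 paths: the input is pre-scaled by 2896/4096
+// (about 1/sqrt(2)) via scale_input, the extra factor used when one
+// transform dimension is twice the other.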
+function inv_txfm_add_8x4_neon
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vld1.16 {q8, q9}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vdup.32 d4, r12
+ vld1.16 {q10, q11}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q12, q13}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q14, q15}, [r2, :128]
+ vst1.16 {q0, q1}, [r2, :128]!
+
+ scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ vqmovn.s32 d20, q12
+ vqmovn.s32 d21, q13
+ vqmovn.s32 d22, q14
+ vqmovn.s32 d23, q15
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ vswp d17, d20
+ vswp d19, d21
+ vswp d18, d20
+ vswp d21, d22
+
+ blx r5
+
+ load_add_store_8x4 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ cmp r3, r10
+ mov r7, #32
+ blt 1f
+
+ add r2, r2, #16
+ vdup.32 d2, r12
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+
+ scale_input d2[0], q8, q9, q10, q11
+ sub r2, r2, r7, lsl #2
+
+ blx r4
+
+ sub r2, r2, #16
+
+ vqmovn.s32 d24, q8
+ vqmovn.s32 d25, q9
+ vqmovn.s32 d26, q10
+ vqmovn.s32 d27, q11
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ b 2f
+
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+
+2:
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q0, #0
+ vdup.32 d2, r12
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r7
+.endr
+ scale_input d2[0], q8, q9, q10, q11
+ blx r4
+
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10
+ vqmovn.s32 d19, q11
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ vmov q10, q12
+ vmov q11, q13
+
+ blx r5
+
+ load_add_store_4x8 r0, r7
+ vpop {q4-q7}
+ pop {r4-r5,r7,r10,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ push {r4-r5,r7,r10,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+ mov r10, #\eob_half
+.endif
+ movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
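+// 16-point inverse DCT working on two 32-bit lanes (d registers) at a time;
+// used for the horizontal first pass of the 16-wide transforms, where only
+// two rows fit in the register file at once.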
+function inv_dct_2s_x16_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
+
+ // idct_8 leaves the row_clip_max/min constants in d9 and d8
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmax.s32 \r, \r, d8
+.endr
+
+ vld1.32 {q0, q1}, [r12, :128]
+ sub r12, r12, #32
+
+ vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a
+ vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a
+ vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a
+ vrshr.s32 d17, d4, #12 // t8a
+ vrshr.s32 d31, d5, #12 // t15a
+ vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a
+ vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a
+ vrshr.s32 d23, d6, #12 // t9a
+ vrshr.s32 d25, d4, #12 // t14a
+ vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a
+ vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a
+ vrshr.s32 d21, d5, #12 // t10a
+ vrshr.s32 d27, d6, #12 // t13a
+ vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a
+ vrshr.s32 d19, d4, #12 // t11a
+ vrshr.s32 d29, d5, #12 // t12a
+
+ vld1.32 {q0}, [r12, :128]
+
+ vqsub.s32 d4, d17, d23 // t9
+ vqadd.s32 d17, d17, d23 // t8
+ vqsub.s32 d5, d31, d25 // t14
+ vqadd.s32 d31, d31, d25 // t15
+ vqsub.s32 d23, d19, d21 // t10
+ vqadd.s32 d19, d19, d21 // t11
+ vqadd.s32 d25, d29, d27 // t12
+ vqsub.s32 d29, d29, d27 // t13
+
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
+ vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
+ vrshr.s32 d21, d6, #12 // t9a
+ vrshr.s32 d27, d7, #12 // t14a
+
+ vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a
+ vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a
+ vrshr.s32 d29, d6, #12 // t13a
+ vneg.s32 d7, d7
+ vrshr.s32 d23, d7, #12 // t10a
+
+ vqsub.s32 d4, d17, d19 // t11a
+ vqadd.s32 d17, d17, d19 // t8a
+ vqsub.s32 d5, d31, d25 // t12a
+ vqadd.s32 d31, d31, d25 // t15a
+ vqadd.s32 d19, d21, d23 // t9
+ vqsub.s32 d21, d21, d23 // t10
+ vqsub.s32 d25, d27, d29 // t13
+ vqadd.s32 d27, d27, d29 // t14
+
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmax.s32 \r, \r, d8
+.endr
+
+ vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
+ vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
+ vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
+
+ vrshr.s32 d6, d6, #12 // t11
+ vrshr.s32 d7, d7, #12 // t12
+ vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a
+ vrshr.s32 d4, d4, #12 // t10a
+ vrshr.s32 d5, d5, #12 // t13a
+
+ vqadd.s32 d8, d16, d31 // out0
+ vqsub.s32 d31, d16, d31 // out15
+ vmov d16, d8
+ vqadd.s32 d23, d30, d17 // out7
+ vqsub.s32 d9, d30, d17 // out8
+ vqadd.s32 d17, d18, d27 // out1
+ vqsub.s32 d30, d18, d27 // out14
+ vqadd.s32 d18, d20, d5 // out2
+ vqsub.s32 d29, d20, d5 // out13
+ vqadd.s32 d5, d28, d19 // out6
+ vqsub.s32 d25, d28, d19 // out9
+ vqadd.s32 d19, d22, d7 // out3
+ vqsub.s32 d28, d22, d7 // out12
+ vqadd.s32 d20, d24, d6 // out4
+ vqsub.s32 d27, d24, d6 // out11
+ vqadd.s32 d21, d26, d4 // out5
+ vqsub.s32 d26, d26, d4 // out10
+ vmov d24, d9
+ vmov d22, d5
+
+ bx lr
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel_local r12, iadst16_coeffs
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0
+ vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1
+ vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2
+ vrshr.s32 d16, d4, #12 // t0
+ vrshr.s32 d31, d6, #12 // t1
+ vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3
+ vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4
+ vrshr.s32 d18, d8, #12 // t2
+ vrshr.s32 d29, d4, #12 // t3
+ vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5
+ vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6
+ vrshr.s32 d20, d6, #12 // t4
+ vrshr.s32 d27, d8, #12 // t5
+ vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7
+ vld1.32 {q0, q1}, [r12, :128]
+ movrel_local r12, idct_coeffs
+ vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8
+ vrshr.s32 d22, d4, #12 // t6
+ vrshr.s32 d25, d6, #12 // t7
+ vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9
+ vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10
+ vrshr.s32 d23, d8, #12 // t8
+ vrshr.s32 d24, d4, #12 // t9
+ vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11
+ vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12
+ vrshr.s32 d21, d6, #12 // t10
+ vrshr.s32 d26, d8, #12 // t11
+ vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13
+ vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14
+ vrshr.s32 d19, d4, #12 // t12
+ vrshr.s32 d28, d6, #12 // t13
+ vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15
+ vrshr.s32 d17, d8, #12 // t14
+ vrshr.s32 d30, d4, #12 // t15
+
+ vld1.32 {q0, q1}, [r12, :128]
+
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ vqsub.s32 d5, d16, d23 // t8a
+ vqadd.s32 d16, d16, d23 // t0a
+ vqsub.s32 d7, d31, d24 // t9a
+ vqadd.s32 d31, d31, d24 // t1a
+ vqadd.s32 d23, d18, d21 // t2a
+ vqsub.s32 d18, d18, d21 // t10a
+ vqadd.s32 d24, d29, d26 // t3a
+ vqsub.s32 d29, d29, d26 // t11a
+ vqadd.s32 d21, d20, d19 // t4a
+ vqsub.s32 d20, d20, d19 // t12a
+ vqadd.s32 d26, d27, d28 // t5a
+ vqsub.s32 d27, d27, d28 // t13a
+ vqadd.s32 d19, d22, d17 // t6a
+ vqsub.s32 d22, d22, d17 // t14a
+ vqadd.s32 d28, d25, d30 // t7a
+ vqsub.s32 d25, d25, d30 // t15a
+
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
+ vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
+ vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
+ vrshr.s32 d17, d4, #12 // t8
+ vrshr.s32 d30, d6, #12 // t9
+ vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11
+ vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12
+ vrshr.s32 d18, d8, #12 // t10
+ vrshr.s32 d29, d4, #12 // t11
+ vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13
+ vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14
+ vrshr.s32 d27, d6, #12 // t12
+ vrshr.s32 d20, d8, #12 // t13
+ vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15
+ vrshr.s32 d25, d4, #12 // t14
+ vrshr.s32 d22, d6, #12 // t15
+
+ vqsub.s32 d2, d16, d21 // t4
+ vqadd.s32 d16, d16, d21 // t0
+ vqsub.s32 d3, d31, d26 // t5
+ vqadd.s32 d31, d31, d26 // t1
+ vqadd.s32 d21, d23, d19 // t2
+ vqsub.s32 d23, d23, d19 // t6
+ vqadd.s32 d26, d24, d28 // t3
+ vqsub.s32 d24, d24, d28 // t7
+ vqadd.s32 d19, d17, d27 // t8a
+ vqsub.s32 d17, d17, d27 // t12a
+ vqadd.s32 d28, d30, d20 // t9a
+ vqsub.s32 d30, d30, d20 // t13a
+ vqadd.s32 d27, d18, d25 // t10a
+ vqsub.s32 d18, d18, d25 // t14a
+ vqadd.s32 d20, d29, d22 // t11a
+ vqsub.s32 d29, d29, d22 // t15a
+
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
+ vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
+ vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
+ vrshr.s32 d22, d4, #12 // t4a
+ vrshr.s32 d25, d6, #12 // t5a
+ vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a
+ vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12
+ vrshr.s32 d24, d8, #12 // t6a
+ vrshr.s32 d23, d4, #12 // t7a
+ vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13
+ vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14
+ vrshr.s32 d17, d6, #12 // t12
+ vmul_vmla d6, d29, d18, d1[0], d1[1] // -> t15
+ vrshr.s32 d29, d8, #12 // t13
+ vrshr.s32 d30, d4, #12 // t14
+ vrshr.s32 d18, d6, #12 // t15
+
+ vqsub.s32 d2, d16, d21 // t2a
+.ifc \o0, d16
+ vqadd.s32 \o0, d16, d21 // out0
+ vqsub.s32 d21, d31, d26 // t3a
+ vqadd.s32 \o15,d31, d26 // out15
+.else
+ vqadd.s32 d4, d16, d21 // out0
+ vqsub.s32 d21, d31, d26 // t3a
+ vqadd.s32 \o15,d31, d26 // out15
+ vmov \o0, d4
+.endif
+
+ vqsub.s32 d3, d29, d18 // t15a
+ vqadd.s32 \o13,d29, d18 // out13
+ vqadd.s32 \o2, d17, d30 // out2
+ vqsub.s32 d26, d17, d30 // t14a
+
+ vqadd.s32 \o1, d19, d27 // out1
+ vqsub.s32 d27, d19, d27 // t10
+ vqadd.s32 \o14,d28, d20 // out14
+ vqsub.s32 d20, d28, d20 // t11
+
+ vqadd.s32 \o3, d22, d24 // out3
+ vqsub.s32 d22, d22, d24 // t6
+ vqadd.s32 \o12,d25, d23 // out12
+ vqsub.s32 d23, d25, d23 // t7
+
+ // The output registers computed above are not clipped, as they will be
+ // downshifted and narrowed afterwards anyway; only the temporaries that
+ // still feed the remaining multiplies are clipped here.
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmax.s32 \r, \r, d10
+.endr
+
+ vqneg.s32 \o15, \o15 // out15
+ vqneg.s32 \o13,\o13 // out13
+ vqneg.s32 \o1, \o1 // out1
+ vqneg.s32 \o3, \o3 // out3
+
+ vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
+ vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
+ vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)
+
+ vrshr.s32 d24, d24, #12 // out8
+ vrshr.s32 d4, d4, #12 // out7
+ vrshr.s32 d5, d6, #12 // out5
+ vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
+ vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+ vrshr.s32 d26, d8, #12 // out10
+
+ vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+ vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+ vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+ vrshr.s32 \o4, d2, #12 // out4
+ vrshr.s32 d7, d6, #12 // out9
+ vrshr.s32 d6, d8, #12 // out11
+ vrshr.s32 \o6, d22, #12 // out6
+
+.ifc \o8, d23
+ vmov \o8, d24
+ vmov \o10,d26
+.endif
+
+ vqneg.s32 \o7, d4 // out7
+ vqneg.s32 \o5, d5 // out5
+ vqneg.s32 \o11,d6 // out11
+ vqneg.s32 \o9, d7 // out9
+.endm
+
+function inv_adst_2s_x16_neon
+ iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ bx lr
+endfunc
+
+function inv_flipadst_2s_x16_neon
+ iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+ bx lr
+endfunc
+
+function inv_identity_2s_x16_neon
+ mov r12, #0
+ movt r12, #2*(5793-4096)*8
+ vdup.32 d0, r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q1, \i, d0[0]
+ vqadd.s32 \i, \i, \i
+ vqadd.s32 \i, \i, q1
+.endr
+ bx lr
+endfunc
+
+.macro identity_8x4_shift1 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q2, \i, \c
+ vrshr.s32 q2, q2, #1
+ vqadd.s32 \i, \i, q2
+.endr
+.endm
+
+.macro identity_8x4 c
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vqrdmulh.s32 q2, \i, \c
+ vqadd.s32 \i, \i, \i
+ vqadd.s32 \i, \i, q2
+.endr
+.endm
+
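+// Horizontal first-pass helper: load (and clear) a 16x2 slice of 32-bit
+// coefficients from r7, optionally pre-scale by 2896/4096, run the transform
+// in r4, round-narrow by \shift bits and store the rearranged 16-bit result
+// to the intermediate buffer in r6 for the vertical pass.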
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x2_neon
+ push {lr}
+ vmov.i32 d7, #0
+.if \scale
+ mov_const r12, 2896*8*(1<<16)
+ vdup.32 d1, r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ blx r4
+ vqrshrn.s32 d16, q8, #\shift
+ vqrshrn.s32 d17, q9, #\shift
+ vqrshrn.s32 d18, q10, #\shift
+ vqrshrn.s32 d19, q11, #\shift
+ vqrshrn.s32 d20, q12, #\shift
+ vqrshrn.s32 d21, q13, #\shift
+ vqrshrn.s32 d22, q14, #\shift
+ vqrshrn.s32 d23, q15, #\shift
+ vuzp.16 q8, q9
+ vuzp.16 q10, q11
+
+.irp i, q8, q10, q9, q11
+ vst1.16 {\i}, [r6, :128]!
+.endr
+
+ pop {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_4x16_neon
+ push {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ blx r5
+ load_add_store_4x16 r6, r7
+ pop {pc}
+endfunc
+
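+// 16x16: transform the coefficients in 16x2 horizontal slices into a
+// 512-byte stack buffer, zero-filling the remaining slices once the eob in
+// r3 falls below the next threshold from r10, then run the vertical pass
+// over 4-column strips and add into the destination.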
+function inv_txfm_add_16x16_neon
+ sub_sp_align 512
+ ldrh r11, [r10], #2
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 14
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4
+ bl inv_txfm_horz_16x2_neon
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #32
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
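+// eob thresholds for the horizontal pass: each entry is compared against the
+// block's eob in r3, and once r3 falls below the next entry the remaining
+// slices are treated as all-zero. Separate tables are used when one of the
+// two transforms is an identity.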
+const eob_16x16
+ .short 3, 10, 21, 36, 55, 78, 105, 256
+endconst
+
+const eob_16x16_identity
+ .short 2, 4, 6, 8, 10, 12, 14, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ movrel_local r4, inv_\txfm1\()_2s_x16_neon
+ movrel r5, X(inv_\txfm2\()_4h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16
+.else
+ movrel_local r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_16x16_identity
+.else
+ movrel_local r10, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+function inv_txfm_add_16x4_neon
+ cmp r3, r10
+ mov r11, #16
+ blt 1f
+
+ add r6, r2, #8
+ vmov.i32 d4, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r6, :64]
+ vst1.32 {d4}, [r6, :64], r11
+.endr
+ blx r4
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ vuzp.16 q8, q9
+ mov r6, sp
+ vuzp.16 q10, q11
+ vpush {q8-q11}
+
+ b 2f
+
+1:
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+ mov r6, sp
+ vpush {q8-q9}
+ vpush {q8-q9}
+
+2:
+ vmov.i32 d4, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r2, :64]
+ vst1.32 {d4}, [r2, :64], r11
+.endr
+
+ blx r4
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ vuzp.16 q8, q9
+ mov r6, sp
+ vuzp.16 q10, q11
+
+ vmov q12, q10
+ vmov q13, q11
+
+ vpop {q10-q11}
+ blx r5
+ mov r6, r0
+ load_add_store_8x4 r6, r7
+
+ vpop {q10-q11}
+ vmov q8, q12
+ vmov q9, q13
+ blx r5
+ add r6, r0, #16
+ load_add_store_8x4 r6, r7
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_4x16_neon
+ ldrh r9, [r10, #4]
+
+ mov r11, #64
+ cmp r3, r9
+ ldrh r9, [r10, #2]
+ blt 1f
+
+ add r6, r2, #48
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d28, q8, #1
+ vqrshrn.s32 d29, q9, #1
+ vqrshrn.s32 d30, q10, #1
+ vqrshrn.s32 d31, q11, #1
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+ b 2f
+1:
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ cmp r3, r9
+ ldrh r9, [r10]
+ blt 1f
+
+ add r6, r2, #32
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d24, q8, #1
+ vqrshrn.s32 d25, q9, #1
+ vqrshrn.s32 d26, q10, #1
+ vqrshrn.s32 d27, q11, #1
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ b 2f
+1:
+ vmov.i16 q12, #0
+ vmov.i16 q13, #0
+2:
+ cmp r3, r9
+ blt 1f
+
+ add r6, r2, #16
+ vmov.i32 q2, #0
+.irp i, q8, q9, q10, q11
+ vld1.32 {\i}, [r6, :128]
+ vst1.32 {q2}, [r6, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+
+ b 2f
+1:
+ vmov.i16 q8, #0
+ vmov.i16 q9, #0
+2:
+ vmov.i16 q2, #0
+ vpush {q8-q9}
+.irp i, q8, q9, q10, q11
+ vld1.16 {\i}, [r2, :128]
+ vst1.16 {q2}, [r2, :128], r11
+.endr
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ vpop {q10-q11}
+
+ blx r5
+
+ load_add_store_4x16 r0, r6
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+const eob_4x16
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+ .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_16x4
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 4
+ movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel r5, X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_4x16
+.else
+ movrel_local r10, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_4x16_identity2
+.else
+ movrel_local r10, eob_4x16
+.endif
+.endif
+.else
+ mov r10, #\eob_16x4
+ movrel_local r4, inv_\txfm1\()_2s_x\w\()_neon
+ movrel r5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 3
+def_fn_416 \w, \h, identity, identity, 3
+def_fn_416 \w, \h, dct, adst, 3
+def_fn_416 \w, \h, dct, flipadst, 3
+def_fn_416 \w, \h, dct, identity, 2
+def_fn_416 \w, \h, adst, dct, 3
+def_fn_416 \w, \h, adst, adst, 3
+def_fn_416 \w, \h, adst, flipadst, 3
+def_fn_416 \w, \h, flipadst, dct, 3
+def_fn_416 \w, \h, flipadst, adst, 3
+def_fn_416 \w, \h, flipadst, flipadst, 3
+def_fn_416 \w, \h, identity, dct, 2
+def_fn_416 \w, \h, adst, identity, 2
+def_fn_416 \w, \h, flipadst, identity, 2
+def_fn_416 \w, \h, identity, adst, 2
+def_fn_416 \w, \h, identity, flipadst, 2
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+function inv_txfm_add_16x8_neon
+ sub_sp_align 256
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6
+ add r6, sp, #(\i*16*2)
+.if \i > 0
+ mov r8, #(8 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 6
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #8*4
+ bl inv_txfm_horz_scale_16x2_neon
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+
+.irp i, 0, 8
+ add r7, sp, #(\i*2)
+ mov r8, #32
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\j}, [r7, :128], r8
+.endr
+ blx r5
+
+ add r6, r0, #(\i*2)
+ load_add_store_8x8 r6, r7
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_8x16_neon
+ add r10, r10, #2
+ sub_sp_align 256
+ ldrh r11, [r10], #4
+
+.irp i, 0, 4, 8, 12
+ add r6, sp, #(\i*8*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 12
+ ldrh r11, [r10], #4
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4
+
+ mov_const r12, 2896*8*(1<<16)
+ vmov.i32 q2, #0
+ vdup.32 d0, r12
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\j}, [r7, :128]
+ vst1.32 {q2}, [r7, :128], r8
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+ blx r4
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q9, #1
+ vqrshrn.s32 d18, q10, #1
+ vqrshrn.s32 d19, q11, #1
+ vqrshrn.s32 d20, q12, #1
+ vqrshrn.s32 d21, q13, #1
+ vqrshrn.s32 d22, q14, #1
+ vqrshrn.s32 d23, q15, #1
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+.irp j, d16, d20, d17, d21, d18, d22, d19, d23
+ vst1.16 {\j}, [r6, :64]!
+.endr
+.endr
+ b 3f
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #4
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+3:
+
+.irp i, 0, 4
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #16
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 256
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+const eob_8x16
+ .short 3, 10, 21, 43, 59, 75, 91, 128
+endconst
+
+const eob_8x16_identity1
+ .short 2, 4, 6, 64, 80, 96, 112, 128
+endconst
+
+const eob_8x16_identity2
+ .short 2, 4, 6, 8, 10, 12, 14, 128
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ push {r4-r11,lr}
+ vpush {q4-q7}
+.if \w == 8
+ movrel_local r4, inv_\txfm1\()_4s_x8_neon
+ movrel r5, X(inv_\txfm2\()_4h_x16_neon)
+.else
+ movrel_local r4, inv_\txfm1\()_2s_x16_neon
+ movrel r5, X(inv_\txfm2\()_8h_x8_neon)
+.endif
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel_local r10, eob_8x16
+.else
+ movrel_local r10, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel_local r10, eob_8x16_identity2
+.else
+ movrel_local r10, eob_8x16
+.endif
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
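+// Odd half of the 32-point inverse DCT (inputs 1, 3, 5, ..., 31), two lanes
+// at a time; combined with the even half (inv_dct_2s_x16_neon) by the 32x2
+// horizontal helper below.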
+function inv_dct32_odd_2s_x16_neon
+ movrel_local r12, idct_coeffs, 4*16
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a
+ vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a
+ vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a
+ vrshr.s32 d16, d4, #12 // t16a
+ vrshr.s32 d31, d6, #12 // t31a
+ vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a
+ vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a
+ vrshr.s32 d24, d8, #12 // t17a
+ vrshr.s32 d23, d4, #12 // t30a
+ vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a
+ vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a
+ vrshr.s32 d20, d6, #12 // t18a
+ vrshr.s32 d27, d8, #12 // t29a
+ vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a
+ vld1.32 {q0, q1}, [r12, :128]
+ sub r12, r12, #4*24
+ vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a
+ vrshr.s32 d28, d4, #12 // t19a
+ vrshr.s32 d19, d6, #12 // t28a
+ vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a
+ vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a
+ vrshr.s32 d18, d8, #12 // t20a
+ vrshr.s32 d29, d4, #12 // t27a
+ vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a
+ vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a
+ vrshr.s32 d26, d6, #12 // t21a
+ vrshr.s32 d21, d8, #12 // t26a
+ vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a
+ vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a
+ vrshr.s32 d22, d4, #12 // t22a
+ vrshr.s32 d25, d6, #12 // t25a
+ vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a
+ vrshr.s32 d30, d8, #12 // t23a
+ vrshr.s32 d17, d4, #12 // t24a
+
+ vld1.32 {q0, q1}, [r12, :128]
+
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ vqsub.s32 d5, d16, d24 // t17
+ vqadd.s32 d16, d16, d24 // t16
+ vqsub.s32 d7, d31, d23 // t30
+ vqadd.s32 d31, d31, d23 // t31
+ vqsub.s32 d24, d28, d20 // t18
+ vqadd.s32 d28, d28, d20 // t19
+ vqadd.s32 d23, d18, d26 // t20
+ vqsub.s32 d18, d18, d26 // t21
+ vqsub.s32 d20, d30, d22 // t22
+ vqadd.s32 d30, d30, d22 // t23
+ vqadd.s32 d26, d17, d25 // t24
+ vqsub.s32 d17, d17, d25 // t25
+ vqsub.s32 d22, d29, d21 // t26
+ vqadd.s32 d29, d29, d21 // t27
+ vqadd.s32 d25, d19, d27 // t28
+ vqsub.s32 d19, d19, d27 // t29
+
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
+ vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
+ vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
+ vrshr.s32 d21, d4, #12 // t17a
+ vrshr.s32 d27, d6, #12 // t30a
+ vneg.s32 d8, d8 // -> t18a
+ vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a
+ vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a
+ vrshr.s32 d19, d8, #12 // t18a
+ vrshr.s32 d24, d5, #12 // t29a
+ vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a
+ vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a
+ vrshr.s32 d22, d4, #12 // t21a
+ vrshr.s32 d18, d6, #12 // t26a
+ vneg.s32 d8, d8 // -> t22a
+ vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a
+ vrshr.s32 d17, d8, #12 // t22a
+ vrshr.s32 d20, d5, #12 // t25a
+
+ vqsub.s32 d2, d27, d24 // t29
+ vqadd.s32 d27, d27, d24 // t30
+ vqsub.s32 d3, d21, d19 // t18
+ vqadd.s32 d21, d21, d19 // t17
+ vqsub.s32 d24, d16, d28 // t19a
+ vqadd.s32 d16, d16, d28 // t16a
+ vqsub.s32 d19, d30, d23 // t20a
+ vqadd.s32 d30, d30, d23 // t23a
+ vqsub.s32 d28, d17, d22 // t21
+ vqadd.s32 d17, d17, d22 // t22
+ vqadd.s32 d23, d26, d29 // t24a
+ vqsub.s32 d26, d26, d29 // t27a
+ vqadd.s32 d22, d20, d18 // t25
+ vqsub.s32 d20, d20, d18 // t26
+ vqsub.s32 d29, d31, d25 // t28a
+ vqadd.s32 d31, d31, d25 // t31a
+
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
+ vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
+ vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
+ vrshr.s32 d18, d4, #12 // t18a
+ vrshr.s32 d25, d6, #12 // t29a
+ vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28
+ vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20
+ vrshr.s32 d29, d8, #12 // t19
+ vrshr.s32 d24, d5, #12 // t28
+ vneg.s32 d4, d4 // -> t20
+ vmul_vmls d6, d26, d19, d1[0], d1[1] // -> t27
+ vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a
+ vrshr.s32 d26, d4, #12 // t20
+ vrshr.s32 d19, d6, #12 // t27
+ vneg.s32 d8, d8 // -> t21a
+ vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a
+ vrshr.s32 d20, d8, #12 // t21a
+ vrshr.s32 d28, d5, #12 // t26a
+
+ vqsub.s32 d2, d16, d30 // t23
+ vqadd.s32 d16, d16, d30 // t16 = out16
+ vqsub.s32 d3, d31, d23 // t24
+ vqadd.s32 d31, d31, d23 // t31 = out31
+ vqsub.s32 d23, d21, d17 // t22a
+ vqadd.s32 d17, d21, d17 // t17a = out17
+ vqadd.s32 d30, d27, d22 // t30a = out30
+ vqsub.s32 d21, d27, d22 // t25a
+ vqsub.s32 d27, d18, d20 // t21
+ vqadd.s32 d18, d18, d20 // t18 = out18
+ vqadd.s32 d4, d29, d26 // t19a = out19
+ vqsub.s32 d26, d29, d26 // t20a
+ vqadd.s32 d29, d25, d28 // t29 = out29
+ vqsub.s32 d25, d25, d28 // t26
+ vqadd.s32 d28, d24, d19 // t28a = out28
+ vqsub.s32 d24, d24, d19 // t27a
+ vmov d19, d4 // out19
+
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmax.s32 \r, \r, d10
+.endr
+
+ vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
+ vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
+ vrshr.s32 d20, d4, #12 // t20
+ vrshr.s32 d22, d6, #12 // t27
+
+ vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a
+ vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a
+ vmov d27, d22 // t27
+ vrshr.s32 d26, d4, #12 // t26a
+
+ vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22
+ vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25
+ vrshr.s32 d21, d6, #12 // t21a
+ vrshr.s32 d22, d24, #12 // t22
+ vrshr.s32 d25, d4, #12 // t25
+
+ vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a
+ vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a
+ vrshr.s32 d23, d4, #12 // t23a
+ vrshr.s32 d24, d6, #12 // t24a
+
+ bx lr
+endfunc
+
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x2_neon
+ push {lr}
+ vmov.i32 d7, #0
+ lsl r8, r8, #1
+.if \scale
+ mov_const r12, 2896*8*(1<<16)
+ vdup.32 d0, r12
+.endif
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+.if \scale
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
+ vtrn.32 d16, d17
+ vtrn.32 d18, d19
+ vtrn.32 d20, d21
+ vtrn.32 d22, d23
+ vtrn.32 d24, d25
+ vtrn.32 d26, d27
+ vtrn.32 d28, d29
+ vtrn.32 d30, d31
+
+.macro store1 r0, r1, r2, r3
+ vst1.16 {\r0}, [r6, :64]!
+ vst1.16 {\r1}, [r6, :64]!
+ vst1.16 {\r2}, [r6, :64]!
+ vst1.16 {\r3}, [r6, :64]!
+.endm
+ store1 d16, d18, d20, d22
+ store1 d24, d26, d28, d30
+ store1 d17, d19, d21, d23
+ store1 d25, d27, d29, d31
+.purgem store1
+ sub r6, r6, #64*2
+
+ vmov.i32 d7, #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.32 {\i}, [r7, :64]
+ vst1.32 {d7}, [r7, :64], r8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in d0[1]
+ scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
+.endif
+ bl inv_dct32_odd_2s_x16_neon
+ vtrn.32 d31, d30
+ vtrn.32 d29, d28
+ vtrn.32 d27, d26
+ vtrn.32 d25, d24
+ vtrn.32 d23, d22
+ vtrn.32 d21, d20
+ vtrn.32 d19, d18
+ vtrn.32 d17, d16
+.macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift
+ vld1.32 {q0, q1}, [r6, :128]!
+ vld1.32 {q2, q3}, [r6, :128]
+ sub r6, r6, #32
+ vqsub.s32 d15, d0, \r0
+ vqadd.s32 d0, d0, \r0
+ vqsub.s32 d14, d1, \r1
+ vqadd.s32 d1, d1, \r1
+ vqsub.s32 d13, d2, \r2
+ vqadd.s32 d2, d2, \r2
+ vqsub.s32 d12, d3, \r3
+ vqadd.s32 d3, d3, \r3
+ vqsub.s32 d11, d4, \r4
+ vqadd.s32 d4, d4, \r4
+ vqsub.s32 d10, d5, \r5
+ vqadd.s32 d5, d5, \r5
+ vqsub.s32 d9, d6, \r6
+ vqadd.s32 d6, d6, \r6
+ vqsub.s32 d8, d7, \r7
+ vqadd.s32 d7, d7, \r7
+ vqrshrn.s32 d0, q0, #\shift
+ vqrshrn.s32 d1, q1, #\shift
+ vqrshrn.s32 d2, q2, #\shift
+ vqrshrn.s32 d3, q3, #\shift
+ vqrshrn.s32 d4, q4, #\shift
+ vqrshrn.s32 d5, q5, #\shift
+ vqrshrn.s32 d6, q6, #\shift
+ vqrshrn.s32 d7, q7, #\shift
+ vrev32.16 q2, q2
+ vrev32.16 q3, q3
+ vst1.16 {q0, q1}, [r6, :128]!
+ vst1.16 {q2, q3}, [r6, :128]!
+.endm
+
+ store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift
+ store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift
+.purgem store2
+ pop {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
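+// Vertical pass for 32-tall DCT columns: run the 16-point even and odd
+// halves on a 4-column strip, then combine them with add/sub butterflies
+// while adding into the destination and clamping to [0, 0x3ff].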
+function inv_txfm_add_vert_dct_4x32_neon
+ push {r10-r11,lr}
+ lsl r8, r8, #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+
+ bl X(inv_dct_4h_x16_neon)
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vst1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ add r7, r7, r8, lsr #1
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ vld1.16 {\i}, [r7, :64], r8
+.endr
+ sub r7, r7, r8, lsl #4
+ sub r7, r7, r8, lsr #1
+ bl X(inv_dct32_odd_4h_x16_neon)
+
+ neg r9, r8
+ mov r10, r6
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride
+ vld1.16 {d4}, [r7, :64], \stride
+ vld1.16 {d0}, [r10, :64], r1
+ vld1.16 {d5}, [r7, :64], \stride
+ vld1.16 {d1}, [r10, :64], r1
+ \op\().s16 d4, d4, \r0
+ vld1.16 {d6}, [r7, :64], \stride
+ vld1.16 {d2}, [r10, :64], r1
+ \op\().s16 d5, d5, \r1
+ vld1.16 {d3}, [r10, :64], r1
+ vrshr.s16 q2, q2, #4
+ \op\().s16 d6, d6, \r2
+ vld1.16 {d7}, [r7, :64], \stride
+ vqadd.s16 q0, q0, q2
+ \op\().s16 d7, d7, \r3
+ vmax.s16 q0, q0, q6
+ vrshr.s16 q3, q3, #4
+ vmin.s16 q0, q0, q7
+ vqadd.s16 q1, q1, q3
+ vst1.16 {d0}, [r6, :64], r1
+ vmax.s16 q1, q1, q6
+ vst1.16 {d1}, [r6, :64], r1
+ vmin.s16 q1, q1, q7
+ vst1.16 {d2}, [r6, :64], r1
+ vst1.16 {d3}, [r6, :64], r1
+.endm
+ combine d31, d30, d29, d28, vqadd, r8
+ combine d27, d26, d25, d24, vqadd, r8
+ combine d23, d22, d21, d20, vqadd, r8
+ combine d19, d18, d17, d16, vqadd, r8
+ sub r7, r7, r8
+ combine d16, d17, d18, d19, vqsub, r9
+ combine d20, d21, d22, d23, vqsub, r9
+ combine d24, d25, d26, d27, vqsub, r9
+ combine d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+ pop {r10-r11,pc}
+endfunc
+
+const eob_32x32
+ .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024
+endconst
+
+const eob_16x32
+ .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512
+endconst
+
+const eob_16x32_shortside
+ .short 3, 10, 21, 36, 55, 78, 105, 512
+endconst
+
+const eob_8x32
+ .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256
+endconst
+
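+// identity_identity 32x32: no butterflies are needed, so each 8x4 tile of
+// coefficients is simply narrowed to 16 bits, transposed and added to the
+// destination with a 2-bit rounding shift, with the eob tables deciding how
+// many tiles actually contain coefficients.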
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+ push {r4-r7,lr}
+ vpush {q6-q7}
+ movrel_local r5, eob_32x32, 2
+
+ mov r6, #4*32
+1:
+ mov r12, #0
+ movrel_local r4, eob_32x32, 6
+2:
+ vmov.i32 q0, #0
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r6
+.endr
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q12
+ vqmovn.s32 d18, q9
+ vqmovn.s32 d19, q13
+ vqmovn.s32 d20, q10
+ vqmovn.s32 d21, q14
+ vqmovn.s32 d22, q11
+ vqmovn.s32 d23, q15
+ transpose_4x8h q8, q9, q10, q11
+
+ load_add_store_8x4 r0, r7, shiftbits=2
+ ldrh lr, [r4], #8
+ sub r0, r0, r1, lsl #2
+ cmp r3, lr
+ add r0, r0, #2*8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12, lsl #1
+ add r0, r0, r1, lsl #2
+ mls r2, r6, r12, r2
+ add r2, r2, #4*4
+ b 1b
+9:
+ vpop {q6-q7}
+ pop {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ push {r4-r9,lr}
+ vpush {q6-q7}
+ mov r9, #0
+ mov_const r8, 2896*8*(1<<16)
+ movt r9, #2*(5793-4096)*8
+ movrel_local r5, eob_16x32\hshort, 2
+
+ mov r6, #4*\h
+1:
+ mov r12, #0
+ movrel_local r4, eob_16x32\wshort, 6
+2:
+ vdup.i32 d0, r8
+ vmov.i32 q1, #0
+ vmov.32 d0[1], r9
+ add r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q1}, [r2, :128], r6
+.endr
+ scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+ // 16x32
+ identity_8x4_shift1 d0[1]
+.else
+ // 32x16
+ shift_8_regs vqshl.s32, 1
+ identity_8x4 d0[1]
+.endif
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q12
+ vqmovn.s32 d18, q9
+ vqmovn.s32 d19, q13
+ vqmovn.s32 d20, q10
+ vqmovn.s32 d21, q14
+ vqmovn.s32 d22, q11
+ vqmovn.s32 d23, q15
+ transpose_4x8h q8, q9, q10, q11
+
+.if \w == 16
+ load_add_store_8x4 r0, r7, shiftbits=2
+.else
+ load_add_store_8x4 r0, r7, shiftbits=4
+.endif
+ ldrh lr, [r4], #8
+ sub r0, r0, r1, lsl #2
+ cmp r3, lr
+ add r0, r0, #2*8
+ bge 2b
+
+ ldrh lr, [r5], #4
+ cmp r3, lr
+ blt 9f
+
+ sub r0, r0, r12, lsl #1
+ add r0, r0, r1, lsl #2
+ mls r2, r6, r12, r2
+ add r2, r2, #4*4
+ b 1b
+9:
+ vpop {q6-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ push {r4-r5,lr}
+ vpush {q6-q7}
+ movrel_local r4, eob_8x32, 2
+
+ mov r12, #4*\h
+1:
+ ldrh lr, [r4], #4
+.if \w == 8
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r12
+.endr
+
+ vqrshrn.s32 d16, q8, #1
+ vqrshrn.s32 d17, q12, #1
+ vqrshrn.s32 d18, q9, #1
+ vqrshrn.s32 d19, q13, #1
+ vqrshrn.s32 d20, q10, #1
+ vqrshrn.s32 d21, q14, #1
+ vqrshrn.s32 d22, q11, #1
+ vqrshrn.s32 d23, q15, #1
+
+ transpose_4x8h q8, q9, q10, q11
+
+ cmp r3, lr
+ load_add_store_8x4 r0, r5, shiftbits=2
+ blt 9f
+ sub r2, r2, r12, lsl #3
+ add r2, r2, #4*4
+.else
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+ vld1.32 {q8, q9}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q10, q11}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q12, q13}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vld1.32 {q14, q15}, [r2, :128]
+ vst1.32 {q0, q1}, [r2, :128], r12
+ vqmovn.s32 d16, q8
+ vqmovn.s32 d17, q10
+ vqmovn.s32 d20, q9
+ vqmovn.s32 d21, q11
+ vqmovn.s32 d18, q12
+ vqmovn.s32 d19, q14
+ vqmovn.s32 d22, q13
+ vqmovn.s32 d23, q15
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+
+ cmp r3, lr
+ load_add_store_4x8 r0, r5, shiftbits=3
+ blt 9f
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #2*4
+.endif
+ b 1b
+
+9:
+ vpop {q6-q7}
+ pop {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 2048
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, sp, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_horz_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 2048
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel_local r4, inv_dct_2s_x16_neon
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, sp, #(\i*16*2)
+ add r7, r2, #(\i*4)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #4*32
+ bl inv_txfm_horz_scale_16x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #16*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 1024
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+ movrel r5, X(inv_dct_4h_x16_neon)
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*4)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 14
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #4*16
+ bl inv_txfm_horz_scale_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #32*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 1024
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ sub_sp_align 512
+
+ movrel_local r10, eob_8x32, 2
+
+ mov r8, #4*32
+ mov r9, #32
+ mov r6, sp
+1:
+ vmov.i32 q0, #0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.32 {\i}, [r2, :128]
+ vst1.32 {q0}, [r2, :128], r8
+.endr
+ ldrh r11, [r10], #4
+ sub r2, r2, r8, lsl #3
+ sub r9, r9, #4
+ add r2, r2, #4*4
+
+ bl inv_dct_4s_x8_neon
+
+ vqrshrn.s32 d16, q8, #2
+ vqrshrn.s32 d18, q9, #2
+ vqrshrn.s32 d20, q10, #2
+ vqrshrn.s32 d22, q11, #2
+ vqrshrn.s32 d17, q12, #2
+ vqrshrn.s32 d19, q13, #2
+ vqrshrn.s32 d21, q14, #2
+ vqrshrn.s32 d23, q15, #2
+
+ transpose_4x8h q8, q9, q10, q11
+
+ vst1.16 {q8, q9}, [r6, :128]!
+ cmp r3, r11
+ vst1.16 {q10, q11}, [r6, :128]!
+
+ bge 1b
+ cmp r9, #0
+ beq 3f
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r9, r9, #4
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4
+ add r6, r0, #(\i*2)
+ add r7, sp, #(\i*2)
+ mov r8, #8*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ movrel_local r10, eob_8x32
+ sub_sp_align 512
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6
+ add r6, sp, #(\i*32*2)
+ add r7, r2, #(\i*4)
+.if \i > 0
+ cmp r3, r11
+ mov r8, #(8 - \i)
+ blt 1f
+.if \i < 6
+ ldrh r11, [r10], #2
+.endif
+.endif
+ mov r8, #8*4
+ bl inv_txfm_horz_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+ mov r8, #2*32
+ mov r9, #0
+1:
+ add r6, r0, r9, lsl #1
+ add r7, sp, r9, lsl #1 // #(\i*2)
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+ vld1.16 {\i}, [r7, :128], r8
+.endr
+ add r9, r9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ cmp r9, #32
+
+ load_add_store_8x8 r6, r7
+
+ blt 1b
+
+ add_sp_align 512
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
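+ // Each call handles one group of 4 input coefficients, loaded into
+ // d16-d19 by the caller: they are turned into 8 of the t32..t63
+ // intermediates of the 64 point DCT, using twiddle factors read from
+ // r12, and stored to the temporary buffer at r6 (r12 and r6 advance).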
+
+ vld1.32 {q0, q1}, [r12, :128]!
+
+ vqrdmulh.s32 d23, d16, d0[1] // t63a
+ vqrdmulh.s32 d16, d16, d0[0] // t32a
+ vqrdmulh.s32 d22, d17, d1[0] // t62a
+ vqrdmulh.s32 d17, d17, d1[1] // t33a
+ vqrdmulh.s32 d21, d18, d2[1] // t61a
+ vqrdmulh.s32 d18, d18, d2[0] // t34a
+ vqrdmulh.s32 d20, d19, d3[0] // t60a
+ vqrdmulh.s32 d19, d19, d3[1] // t35a
+
+ vld1.32 {q0}, [r12, :128]!
+
+ vqadd.s32 d24, d16, d17 // t32
+ vqsub.s32 d25, d16, d17 // t33
+ vqsub.s32 d26, d19, d18 // t34
+ vqadd.s32 d27, d19, d18 // t35
+ vqadd.s32 d28, d20, d21 // t60
+ vqsub.s32 d29, d20, d21 // t61
+ vqsub.s32 d30, d23, d22 // t62
+ vqadd.s32 d31, d23, d22 // t63
+
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
+ vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
+ vneg.s32 d4, d4 // t34a
+ vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
+ vrshr.s32 d26, d4, #12 // t34a
+ vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
+ vrshr.s32 d29, d6, #12 // t61a
+ vrshr.s32 d25, d7, #12 // t33a
+ vrshr.s32 d30, d4, #12 // t62a
+
+ vqadd.s32 d16, d24, d27 // t32a
+ vqsub.s32 d19, d24, d27 // t35a
+ vqadd.s32 d17, d25, d26 // t33
+ vqsub.s32 d18, d25, d26 // t34
+ vqsub.s32 d20, d31, d28 // t60a
+ vqadd.s32 d23, d31, d28 // t63a
+ vqsub.s32 d21, d30, d29 // t61
+ vqadd.s32 d22, d30, d29 // t62
+
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
+ vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
+ vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
+ vrshr.s32 d21, d4, #12 // t61a
+ vrshr.s32 d18, d6, #12 // t34a
+ vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
+ vrshr.s32 d20, d7, #12 // t60
+ vrshr.s32 d19, d4, #12 // t35
+
+ vst1.32 {d16, d17, d18, d19}, [r6, :128]!
+ vst1.32 {d20, d21, d22, d23}, [r6, :128]!
+
+ bx lr
+endfunc
+
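+// Second combination step: pairs up the values written by inv_dct64_step1,
+// walking r6 forwards and r9 backwards through the buffer until the two
+// pointers meet, rewriting the values in place.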
+function inv_dct64_step2_neon
+ movrel_local r12, idct_coeffs
+ vld1.32 {q0}, [r12, :128]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ vldr d16, [r6, #4*2*0] // t32a
+ vldr d17, [r9, #4*2*8] // t39a
+ vldr d18, [r9, #4*2*0] // t63a
+ vldr d19, [r6, #4*2*8] // t56a
+ vldr d20, [r6, #4*2*16] // t40a
+ vldr d21, [r9, #4*2*24] // t47a
+ vldr d22, [r9, #4*2*16] // t55a
+ vldr d23, [r6, #4*2*24] // t48a
+
+ vqadd.s32 d24, d16, d17 // t32
+ vqsub.s32 d25, d16, d17 // t39
+ vqadd.s32 d26, d18, d19 // t63
+ vqsub.s32 d27, d18, d19 // t56
+ vqsub.s32 d28, d21, d20 // t40
+ vqadd.s32 d29, d21, d20 // t47
+ vqadd.s32 d30, d23, d22 // t48
+ vqsub.s32 d31, d23, d22 // t55
+
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
+ vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
+ vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
+ vrshr.s32 d25, d4, #12 // t56a
+ vrshr.s32 d27, d6, #12 // t39a
+ vneg.s32 d7, d7 // t40a
+ vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
+ vrshr.s32 d31, d7, #12 // t40a
+ vrshr.s32 d28, d4, #12 // t55a
+
+ vqadd.s32 d16, d24, d29 // t32a
+ vqsub.s32 d19, d24, d29 // t47a
+ vqadd.s32 d17, d27, d31 // t39
+ vqsub.s32 d18, d27, d31 // t40
+ vqsub.s32 d20, d26, d30 // t48a
+ vqadd.s32 d23, d26, d30 // t63a
+ vqsub.s32 d21, d25, d28 // t55
+ vqadd.s32 d22, d25, d28 // t56
+
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
+ vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
+ vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
+ vrshr.s32 d18, d4, #12 // t40a
+ vrshr.s32 d21, d6, #12 // t55a
+ vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
+ vrshr.s32 d19, d7, #12 // t47
+ vrshr.s32 d20, d4, #12 // t48
+
+ vstr d16, [r6, #4*2*0] // t32a
+ vstr d17, [r9, #4*2*0] // t39
+ vstr d18, [r6, #4*2*8] // t40a
+ vstr d19, [r9, #4*2*8] // t47
+ vstr d20, [r6, #4*2*16] // t48
+ vstr d21, [r9, #4*2*16] // t55a
+ vstr d22, [r6, #4*2*24] // t56
+ vstr d23, [r9, #4*2*24] // t63a
+
+ add r6, r6, #4*2
+ sub r9, r9, #4*2
+ cmp r6, r9
+ blt 1b
+ bx lr
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+ vld1.32 {\i}, [\src, :64]
+ vst1.32 {\zero}, [\src, :64], \strd
+.else
+ vld1.32 {\i}, [\src, :64], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+ vst1.32 {q8, q9}, [\dst, :128]!
+ vst1.32 {q10, q11}, [\dst, :128]!
+ vst1.32 {q12, q13}, [\dst, :128]!
+ vst1.32 {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8
+.irp i, q12, q13, q14, q15
+ vmov.i32 \i, #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond
+.if \cond
+ vmov.i32 \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+ mov_const \gpr, \val
+ vdup.32 \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond
+.if \cond
+ vst1.32 \regs, \dst, \dstalign
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
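+// Generate a 2-lane, 64 point DCT function. With clear=1 the input
+// coefficients are zeroed as they are read, leaving the coefficient buffer
+// cleared; with scale=1 the inputs are first multiplied by 2896/4096
+// (roughly 1/sqrt(2)), as used by the rectangular transform sizes.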
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_2s_x64_neon
+ mov r6, sp
+
+ push {r10-r11,lr}
+
+ lsl r8, r8, #2
+
+ movdup_if d0, r12, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ add r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
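+ // For example, with bdmax = 0x3ff (10 bit):
+ // ~0x3ff = 0xfffffc00, shifted left by 7 gives 0xfffe0000, and
+ // inverting that gives 0x0001ffff.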
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
+ store16 r6
+
+ movdup_if d0, r12, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ load8 r7, r8, d7, \clear
+ clear_upper8
+ sub r7, r7, r8, lsl #3
+ lsr r8, r8, #1
+ sub r7, r7, r8, lsr #1
+ scale_if \scale, d0[0], q8, q9, q10, q11
+
+ bl inv_dct32_odd_2s_x16_neon
+
+ add r10, r6, #8*15
+ sub r6, r6, #8*16
+
+ mov r9, #-8
+
+ vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.macro store_addsub r0, r1, r2, r3
+ vld1.32 {d2}, [r6, :64]!
+ vld1.32 {d3}, [r6, :64]!
+ vqadd.s32 d6, d2, \r0
+ vqsub.s32 \r0, d2, \r0
+ vld1.32 {d4}, [r6, :64]!
+ vqadd.s32 d7, d3, \r1
+ vqsub.s32 \r1, d3, \r1
+ vmin.s32 d6, d6, d1
+ vmin.s32 \r0, \r0, d1
+ vld1.32 {d5}, [r6, :64]!
+ vqadd.s32 d2, d4, \r2
+ sub r6, r6, #8*4
+ vmax.s32 d6, d6, d0
+ vmax.s32 \r0, \r0, d0
+ vqsub.s32 \r2, d4, \r2
+ vmin.s32 d7, d7, d1
+ vmin.s32 \r1, \r1, d1
+ vst1.32 {d6}, [r6, :64]!
+ vst1.32 {\r0}, [r10, :64], r9
+ vmin.s32 d2, d2, d1
+ vmin.s32 \r2, \r2, d1
+ vmax.s32 d7, d7, d0
+ vmax.s32 \r1, \r1, d0
+ vqadd.s32 d3, d5, \r3
+ vqsub.s32 \r3, d5, \r3
+ vmax.s32 d2, d2, d0
+ vmax.s32 \r2, \r2, d0
+ vmin.s32 d3, d3, d1
+ vmin.s32 \r3, \r3, d1
+ vst1.32 {d7}, [r6, :64]!
+ vst1.32 {\r1}, [r10, :64], r9
+ vmax.s32 d3, d3, d0
+ vmax.s32 \r3, \r3, d0
+ vst1.32 {d2}, [r6, :64]!
+ vst1.32 {\r2}, [r10, :64], r9
+ vst1.32 {d3}, [r6, :64]!
+ vst1.32 {\r3}, [r10, :64], r9
+.endm
+ store_addsub d31, d30, d29, d28
+ store_addsub d27, d26, d25, d24
+ store_addsub d23, d22, d21, d20
+ store_addsub d19, d18, d17, d16
+.purgem store_addsub
+
+ add r6, r6, #2*4*16
+
+ movrel_local r12, idct64_coeffs
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ add r9, r7, r8, lsl #4 // offset 16
+ add r10, r7, r8, lsl #3 // offset 8
+ sub r9, r9, r8 // offset 15
+ sub r11, r10, r8 // offset 7
+ vld1.32 {d16}, [r7, :64] // in1 (offset 0)
+ vld1.32 {d17}, [r9, :64] // in31 (offset 15)
+ vld1.32 {d18}, [r10, :64] // in17 (offset 8)
+ vld1.32 {d19}, [r11, :64] // in15 (offset 7)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ add r7, r7, r8, lsl #2 // offset 4
+ sub r9, r9, r8, lsl #2 // offset 11
+ sub r10, r7, r8 // offset 3
+ add r11, r9, r8 // offset 12
+ vld1.32 {d16}, [r10, :64] // in7 (offset 3)
+ vld1.32 {d17}, [r11, :64] // in25 (offset 12)
+ vld1.32 {d18}, [r9, :64] // in23 (offset 11)
+ vld1.32 {d19}, [r7, :64] // in9 (offset 4)
+ vst1_if {d7}, [r7, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8, lsl #1 // offset 1
+ sub r9, r9, r8, lsl #1 // offset 9
+ add r10, r10, r8 // offset 2
+ add r9, r9, r8 // offset 10
+ add r7, r7, r8 // offset 5
+ add r11, r11, r8 // offset 13
+ vld1.32 {d16}, [r10, :64] // in5 (offset 2)
+ vld1.32 {d17}, [r11, :64] // in27 (offset 13)
+ vld1.32 {d18}, [r9, :64] // in21 (offset 10)
+ vld1.32 {d19}, [r7, :64] // in11 (offset 5)
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+ movdup_if d0, lr, 2896*8*(1<<16), \scale
+ vmov_if d7, #0, \clear
+ sub r10, r10, r8 // offset 1
+ sub r9, r9, r8 // offset 9
+ add r11, r11, r8 // offset 14
+ add r7, r7, r8 // offset 6
+ vld1.32 {d16}, [r10, :64] // in3 (offset 1)
+ vld1.32 {d17}, [r11, :64] // in29 (offset 14)
+ vld1.32 {d18}, [r9, :64] // in19 (offset 9)
+ vld1.32 {d19}, [r7, :64] // in13 (offset 6)
+ vst1_if {d7}, [r10, :64], \clear
+ vst1_if {d7}, [r11, :64], \clear
+ vst1_if {d7}, [r9, :64], \clear
+ vst1_if {d7}, [r7, :64], \clear
+ scale_if \scale, d0[0], q8, q9
+ bl inv_dct64_step1_neon
+
+ sub r6, r6, #2*4*32
+ add r9, r6, #2*4*7
+
+ bl inv_dct64_step2_neon
+
+ pop {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
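+// Final horizontal butterfly for the 64 point DCT: combines the first and
+// mirrored second halves produced above with add/sub, applies the rounding
+// shift passed in via r9 (a negative amount, captured into q4 and used with
+// vrshl), narrows to 16 bit and writes a pair of 64 wide rows to r6.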
+function inv_txfm_horz_dct_64x2_neon
+ vdup.32 q4, r9
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, #2*56
+
+ push {r10-r11,lr}
+
+ mov r10, #2*64
+ mov r11, #-2*4*4
+
+1:
+ vld1.32 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.32 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.32 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.32 {d24, d25, d26, d27}, [r8, :128], r11
+ vtrn.32 d16, d17
+ vtrn.32 d18, d19
+ vtrn.32 d20, d21
+ vtrn.32 d22, d23
+ vtrn.32 d31, d30
+ vtrn.32 d29, d28
+ vtrn.32 d27, d26
+ vtrn.32 d25, d24
+
+.macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7
+ vqsub.s32 d7, \src0, \src1
+ vqsub.s32 d6, \src2, \src3
+ vqsub.s32 d5, \src4, \src5
+ vqsub.s32 d4, \src6, \src7
+ vqadd.s32 d0, \src0, \src1
+ vqadd.s32 d1, \src2, \src3
+ vqadd.s32 d2, \src4, \src5
+ vqadd.s32 d3, \src6, \src7
+ vrshl.s32 q3, q3, q4
+ vrshl.s32 q2, q2, q4
+ vrshl.s32 q0, q0, q4
+ vrshl.s32 q1, q1, q4
+ vqmovn.s32 d7, q3
+ vqmovn.s32 d6, q2
+ vqmovn.s32 d0, q0
+ vqmovn.s32 d1, q1
+ vrev32.16 q3, q3
+ vst1.16 {q0}, [r6, :128], r10
+ vst1.16 {q3}, [r9, :128], r10
+.endm
+ store_addsub d16, d31, d18, d29, d20, d27, d22, d25
+ store_addsub d17, d30, d19, d28, d21, d26, d23, d24
+.purgem store_addsub
+ sub r6, r6, r10, lsl #1
+ sub r9, r9, r10, lsl #1
+ add r6, r6, #16
+ sub r9, r9, #16
+
+ cmp r7, r8
+ blt 1b
+ pop {r10-r11,pc}
+endfunc
+
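+// Vertical counterpart of the above: combines mirrored halves of the 16 bit
+// column data on the stack, rounds with a fixed shift of 4, adds the result
+// to the 4 pixel wide destination columns (r6 walking down, r9 walking up)
+// and clamps it to the [0, 0x3ff] range set up in q6/q7.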
+function inv_txfm_add_vert_dct_4x64_neon
+ lsl r8, r8, #1
+
+ mov r7, sp
+ add r8, sp, #2*4*(64 - 4)
+ add r9, r6, r1, lsl #6
+ sub r9, r9, r1
+
+ push {r10-r11,lr}
+
+ neg r10, r1
+ mov r11, #-2*4*4
+
+1:
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]!
+ vld1.16 {d28, d29, d30, d31}, [r8, :128], r11
+ vld1.16 {d20, d21, d22, d23}, [r7, :128]!
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11
+
+ vmov.i16 q6, #0
+ vmvn.i16 q7, #0xfc00 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3
+ vld1.16 {d0}, [r6, :64], r1
+ vld1.16 {d1}, [r9, :64], r10
+ vqadd.s16 d4, \src0, \src1
+ vld1.16 {d2}, [r6, :64]
+ vqsub.s16 d5, \src0, \src1
+ vld1.16 {d3}, [r9, :64]
+ vqadd.s16 d6, \src2, \src3
+ vqsub.s16 d7, \src2, \src3
+ sub r6, r6, r1
+ sub r9, r9, r10
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+ vqadd.s16 q2, q2, q0
+ vqadd.s16 q3, q3, q1
+ vmax.s16 q2, q2, q6
+ vmax.s16 q3, q3, q6
+ vmin.s16 q2, q2, q7
+ vmin.s16 q3, q3, q7
+ vst1.16 {d4}, [r6, :64], r1
+ vst1.16 {d5}, [r9, :64], r10
+ vst1.16 {d6}, [r6, :64], r1
+ vst1.16 {d7}, [r9, :64], r10
+.endm
+ add_dest_addsub d16, d31, d17, d30
+ add_dest_addsub d18, d29, d19, d28
+ add_dest_addsub d20, d27, d21, d26
+ add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+ cmp r7, r8
+ blt 1b
+
+ pop {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_dct_clear_2s_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*64*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_dct_clear_scale_2s_x64_neon
+ add r6, r5, #(\i*64*2)
+ mov r9, #-1 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i*2)
+ add r7, r5, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+ add_sp_align 64*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 32*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_32x32
+ ldrh r11, [r10], #2
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*32*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_horz_scale_dct_32x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 4
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r7, r5, #(\i*2)
+ mov r8, #32*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 32*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 64*16*2+64*4*2
+ add r4, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r6, r4, #(\i*64*2)
+.if \i > 0
+ mov r8, #(16 - \i)
+ cmp r3, r11
+ blt 1f
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #16*4
+ bl inv_txfm_dct_clear_2s_x64_neon
+ add r6, r4, #(\i*64*2)
+ mov r9, #-2 // shift
+ bl inv_txfm_horz_dct_64x2_neon
+.if \i < 8
+ ldrh r11, [r10], #2
+.endif
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 8
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+ movrel r5, X(inv_dct_4h_x16_neon)
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
+ add r6, r0, #(\i*2)
+ add r7, r4, #(\i*2)
+ mov r8, #64*2
+ bl inv_txfm_add_vert_4x16_neon
+.endr
+
+ add_sp_align 64*16*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ push {r4-r11,lr}
+ vpush {q4-q7}
+
+ sub_sp_align 16*32*2+64*4*2
+ add r5, sp, #64*4*2
+
+ movrel_local r10, eob_16x32
+ ldrh r11, [r10], #2
+
+ movrel_local r4, inv_dct_2s_x16_neon
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r6, r5, #(\i*16*2)
+.if \i > 0
+ mov r8, #(32 - \i)
+ cmp r3, r11
+ blt 1f
+.if \i < 30
+ ldrh r11, [r10], #2
+.endif
+.endif
+ add r7, r2, #(\i*4)
+ mov r8, #32*4
+ bl inv_txfm_horz_16x2_neon
+.endr
+ b 3f
+
+1:
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+2:
+ subs r8, r8, #2
+.rept 2
+ vst1.16 {q2, q3}, [r6, :128]!
+.endr
+ bgt 2b
+
+3:
+.irp i, 0, 4, 8, 12
+ add r7, r5, #(\i*2)
+ mov r8, #16*2
+ bl X(inv_txfm_dct_4h_x64_neon)
+ add r6, r0, #(\i*2)
+ bl inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+ add_sp_align 16*32*2+64*4*2
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/loopfilter.S b/third_party/dav1d/src/arm/32/loopfilter.S
new file mode 100644
index 0000000000..97b960534f
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/loopfilter.S
@@ -0,0 +1,868 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
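+// Loop filter core, operating on 8 pixels at a time: the pixels p6..p0 and
+// q0..q6 (as far as needed for the given width) are expected in d17..d23 and
+// d24..d30, the E, I and H thresholds in d10, d11 and d12, and the filter
+// width masks in d13 (wd >= 4), d14 (wd > 4) and d15 (wd == 16).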
+.macro loop_filter wd
+function lpf_8_wd\wd\()_neon
+ vabd.u8 d0, d22, d23 // abs(p1 - p0)
+ vabd.u8 d1, d25, d24 // abs(q1 - q0)
+ vabd.u8 d2, d23, d24 // abs(p0 - q0)
+ vabd.u8 d3, d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+ vabd.u8 d4, d21, d22 // abs(p2 - p1)
+ vabd.u8 d5, d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ vabd.u8 d6, d20, d21 // abs(p3 - p2)
+ vabd.u8 d7, d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ vmax.u8 d4, d4, d5
+.endif
+ vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2
+.if \wd >= 8
+ vmax.u8 d6, d6, d7
+.endif
+ vshr.u8 d3, d3, #1
+.if \wd >= 8
+ vmax.u8 d4, d4, d6
+.endif
+.if \wd >= 6
+ vand d4, d4, d14
+.endif
+ vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
+ vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ vmax.u8 d4, d0, d4
+ vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+ vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ vand d1, d1, d2 // fm
+ vand d1, d1, d13 // fm && wd >= 4
+.if \wd >= 6
+ vand d14, d14, d1 // fm && wd > 4
+.endif
+.if \wd >= 16
+ vand d15, d15, d1 // fm && wd == 16
+.endif
+
+ vmov r10, r11, d1
+ orrs r10, r10, r11
+ beq 9f // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+ vmov.i8 d10, #1
+ vabd.u8 d2, d21, d23 // abs(p2 - p0)
+ vabd.u8 d3, d22, d23 // abs(p1 - p0)
+ vabd.u8 d4, d25, d24 // abs(q1 - q0)
+ vabd.u8 d5, d26, d24 // abs(q2 - q0)
+.if \wd >= 8
+ vabd.u8 d6, d20, d23 // abs(p3 - p0)
+ vabd.u8 d7, d27, d24 // abs(q3 - q0)
+.endif
+ vmax.u8 d2, d2, d3
+ vmax.u8 d4, d4, d5
+.if \wd >= 8
+ vmax.u8 d6, d6, d7
+.endif
+ vmax.u8 d2, d2, d4
+.if \wd >= 8
+ vmax.u8 d2, d2, d6
+.endif
+
+.if \wd == 16
+ vabd.u8 d3, d17, d23 // abs(p6 - p0)
+ vabd.u8 d4, d18, d23 // abs(p5 - p0)
+ vabd.u8 d5, d19, d23 // abs(p4 - p0)
+.endif
+ vcge.u8 d2, d10, d2 // flat8in
+.if \wd == 16
+ vabd.u8 d6, d28, d24 // abs(q4 - q0)
+ vabd.u8 d7, d29, d24 // abs(q5 - q0)
+ vabd.u8 d8, d30, d24 // abs(q6 - q0)
+.endif
+ vand d14, d2, d14 // flat8in && fm && wd > 4
+ vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ vmax.u8 d3, d3, d4
+ vmax.u8 d5, d5, d6
+.endif
+ vmov r10, r11, d1
+.if \wd == 16
+ vmax.u8 d7, d7, d8
+ vmax.u8 d3, d3, d5
+ vmax.u8 d3, d3, d7
+ vcge.u8 d3, d10, d3 // flat8out
+.endif
+ orrs r10, r10, r11
+.if \wd == 16
+ vand d15, d15, d3 // flat8out && fm && wd == 16
+ vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+ vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ beq 1f // skip wd == 4 case
+.endif
+
+ vsubl.u8 q1, d22, d25 // p1 - q1
+ vcgt.u8 d0, d0, d12 // hev
+ vqmovn.s16 d2, q1
+ vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
+ vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
+ vsubl.u8 q1, d24, d23
+ vmov.i16 q3, #3
+ vmul.i16 q1, q1, q3
+ vmov.i8 d6, #4
+ vaddw.s8 q1, q1, d4
+ vmov.i8 d7, #3
+ vqmovn.s16 d2, q1 // f
+ vqadd.s8 d4, d6, d2 // imin(f + 4, 127)
+ vqadd.s8 d5, d7, d2 // imin(f + 3, 127)
+ vshr.s8 d4, d4, #3 // f1
+ vshr.s8 d5, d5, #3 // f2
+ vmovl.u8 q1, d23 // p0
+ vmovl.u8 q3, d24 // q0
+ vaddw.s8 q1, q1, d5
+ vsubw.s8 q3, q3, d4
+ vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1
+ vqmovun.s16 d2, q1 // out p0
+ vqmovun.s16 d6, q3 // out q0
+ vbit d23, d2, d1 // if (fm && wd >= 4)
+ vmovl.u8 q1, d22 // p1
+ vbit d24, d6, d1 // if (fm && wd >= 4)
+ vmovl.u8 q3, d25 // q1
+ vaddw.s8 q1, q1, d4
+ vsubw.s8 q3, q3, d4
+ vqmovun.s16 d2, q1 // out p1
+ vqmovun.s16 d6, q3 // out q1
+ vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
+ vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 2f // skip if there's no flat8in
+
+ vaddl.u8 q0, d21, d21 // p2 * 2
+ vaddl.u8 q1, d21, d22 // p2 + p1
+ vaddl.u8 q2, d22, d23 // p1 + p0
+ vaddl.u8 q3, d23, d24 // p0 + q0
+ vadd.i16 q4, q0, q1
+ vadd.i16 q5, q2, q3
+ vaddl.u8 q6, d24, d25 // q0 + q1
+ vadd.i16 q4, q4, q5
+ vsub.i16 q6, q6, q0
+ vaddl.u8 q5, d25, d26 // q1 + q2
+ vrshrn.i16 d0, q4, #3 // out p1
+
+ vadd.i16 q4, q4, q6
+ vsub.i16 q5, q5, q1
+ vaddl.u8 q6, d26, d26 // q2 + q2
+ vrshrn.i16 d1, q4, #3 // out p0
+
+ vadd.i16 q4, q4, q5
+ vsub.i16 q6, q6, q2
+ vrshrn.i16 d2, q4, #3 // out q0
+
+ vbit d22, d0, d14 // p1 if (flat8in)
+ vadd.i16 q4, q4, q6
+ vbit d23, d1, d14 // p0 if (flat8in)
+ vrshrn.i16 d3, q4, #3 // out q1
+ vbit d24, d2, d14 // q0 if (flat8in)
+ vbit d25, d3, d14 // q1 if (flat8in)
+.elseif \wd >= 8
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+.if \wd == 8
+ beq 8f // skip if there's no flat8in
+.else
+ beq 2f // skip if there's no flat8in
+.endif
+
+ vaddl.u8 q0, d20, d21 // p3 + p2
+ vaddl.u8 q1, d22, d25 // p1 + q1
+ vaddl.u8 q2, d20, d22 // p3 + p1
+ vaddl.u8 q3, d23, d26 // p0 + q2
+ vadd.i16 q4, q0, q0 // 2 * (p3 + p2)
+ vaddw.u8 q4, q4, d23 // + p0
+ vaddw.u8 q4, q4, d24 // + q0
+ vadd.i16 q4, q4, q2 // + p3 + p1
+ vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2
+ vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1
+ vrshrn.i16 d10, q4, #3 // out p2
+
+ vadd.i16 q4, q4, q1
+ vaddl.u8 q0, d20, d23 // p3 + p0
+ vaddl.u8 q1, d24, d27 // q0 + q3
+ vrshrn.i16 d11, q4, #3 // out p1
+
+ vadd.i16 q4, q4, q3
+ vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0
+ vaddl.u8 q2, d21, d24 // p2 + q0
+ vaddl.u8 q3, d25, d27 // q1 + q3
+ vrshrn.i16 d12, q4, #3 // out p0
+
+ vadd.i16 q4, q4, q1
+ vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0
+ vaddl.u8 q0, d22, d25 // p1 + q1
+ vaddl.u8 q1, d26, d27 // q2 + q3
+ vrshrn.i16 d13, q4, #3 // out q0
+
+ vadd.i16 q4, q4, q3
+ vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1
+ vrshrn.i16 d0, q4, #3 // out q1
+
+ vadd.i16 q4, q4, q1
+
+ vbit d21, d10, d14
+ vbit d22, d11, d14
+ vbit d23, d12, d14
+ vrshrn.i16 d1, q4, #3 // out q2
+ vbit d24, d13, d14
+ vbit d25, d0, d14
+ vbit d26, d1, d14
+.endif
+2:
+.if \wd == 16
+ vmov r10, r11, d15
+ orrs r10, r10, r11
+ bne 1f // check if flat8out is needed
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ vaddl.u8 q1, d17, d17 // p6 + p6
+ vaddl.u8 q2, d17, d18 // p6 + p5
+ vaddl.u8 q3, d17, d19 // p6 + p4
+ vaddl.u8 q4, d17, d20 // p6 + p3
+ vadd.i16 q6, q1, q2
+ vadd.i16 q5, q3, q4
+ vaddl.u8 q3, d17, d21 // p6 + p2
+ vadd.i16 q6, q6, q5
+ vaddl.u8 q4, d17, d22 // p6 + p1
+ vaddl.u8 q5, d18, d23 // p5 + p0
+ vadd.i16 q3, q3, q4
+ vaddl.u8 q4, d19, d24 // p4 + q0
+ vadd.i16 q6, q6, q3
+ vadd.i16 q5, q5, q4
+ vaddl.u8 q3, d20, d25 // p3 + q1
+ vadd.i16 q6, q6, q5
+ vsub.i16 q3, q3, q1
+ vaddl.u8 q1, d21, d26 // p2 + q2
+ vrshrn.i16 d0, q6, #4 // out p5
+ vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1)
+ vsub.i16 q1, q1, q2
+ vaddl.u8 q2, d22, d27 // p1 + q3
+ vaddl.u8 q3, d17, d19 // p6 + p4
+ vrshrn.i16 d1, q6, #4 // out p4
+ vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2)
+ vsub.i16 q2, q2, q3
+ vaddl.u8 q3, d23, d28 // p0 + q4
+ vaddl.u8 q4, d17, d20 // p6 + p3
+ vrshrn.i16 d2, q6, #4 // out p3
+ vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3)
+ vsub.i16 q3, q3, q4
+ vaddl.u8 q4, d24, d29 // q0 + q5
+ vaddl.u8 q2, d17, d21 // p6 + p2
+ vrshrn.i16 d3, q6, #4 // out p2
+ vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4)
+ vsub.i16 q4, q4, q2
+ vaddl.u8 q3, d25, d30 // q1 + q6
+ vaddl.u8 q5, d17, d22 // p6 + p1
+ vrshrn.i16 d4, q6, #4 // out p1
+ vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5)
+ vsub.i16 q3, q3, q5
+ vaddl.u8 q4, d26, d30 // q2 + q6
+ vbif d0, d18, d15 // out p5
+ vaddl.u8 q5, d18, d23 // p5 + p0
+ vrshrn.i16 d5, q6, #4 // out p0
+ vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6)
+ vsub.i16 q4, q4, q5
+ vaddl.u8 q5, d27, d30 // q3 + q6
+ vbif d1, d19, d15 // out p4
+ vaddl.u8 q9, d19, d24 // p4 + q0
+ vrshrn.i16 d6, q6, #4 // out q0
+ vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6)
+ vsub.i16 q5, q5, q9
+ vaddl.u8 q4, d28, d30 // q4 + q6
+ vbif d2, d20, d15 // out p3
+ vaddl.u8 q9, d20, d25 // p3 + q1
+ vrshrn.i16 d7, q6, #4 // out q1
+ vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6)
+ vsub.i16 q9, q4, q9
+ vaddl.u8 q5, d29, d30 // q5 + q6
+ vbif d3, d21, d15 // out p2
+ vaddl.u8 q10, d21, d26 // p2 + q2
+ vrshrn.i16 d8, q6, #4 // out q2
+ vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6)
+ vsub.i16 q5, q5, q10
+ vaddl.u8 q9, d30, d30 // q6 + q6
+ vbif d4, d22, d15 // out p1
+ vaddl.u8 q10, d22, d27 // p1 + q3
+ vrshrn.i16 d9, q6, #4 // out q3
+ vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6)
+ vsub.i16 q9, q9, q10
+ vbif d5, d23, d15 // out p0
+ vrshrn.i16 d10, q6, #4 // out q4
+ vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6)
+ vrshrn.i16 d11, q6, #4 // out q5
+ vbif d6, d24, d15 // out q0
+ vbif d7, d25, d15 // out q1
+ vbif d8, d26, d15 // out q2
+ vbif d9, d27, d15 // out q3
+ vbif d10, d28, d15 // out q4
+ vbif d11, d29, d15 // out q5
+.endif
+
+ bx lr
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ bx r8
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ bx r9
+.endif
+9:
+ // Return directly without writing back any pixels
+ bx r12
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_8_wd16
+ adr r8, 7f + CONFIG_THUMB
+ adr r9, 8f + CONFIG_THUMB
+ bl lpf_8_wd16_neon
+.endm
+
+.macro lpf_8_wd8
+ adr r9, 8f + CONFIG_THUMB
+ bl lpf_8_wd8_neon
+.endm
+
+.macro lpf_8_wd6
+ bl lpf_8_wd6_neon
+.endm
+
+.macro lpf_8_wd4
+ bl lpf_8_wd4_neon
+.endm
+
+function lpf_v_4_8_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+
+ lpf_8_wd4
+
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_4_8_neon
+ mov r12, lr
+ sub r10, r0, #2
+ add r0, r10, r1, lsl #2
+ vld1.32 {d22[0]}, [r10], r1
+ vld1.32 {d22[1]}, [r0], r1
+ vld1.32 {d23[0]}, [r10], r1
+ vld1.32 {d23[1]}, [r0], r1
+ vld1.32 {d24[0]}, [r10], r1
+ vld1.32 {d24[1]}, [r0], r1
+ vld1.32 {d25[0]}, [r10], r1
+ vld1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+
+ lpf_8_wd4
+
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+function lpf_v_6_8_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+
+ lpf_8_wd6
+
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_6_8_neon
+ mov r12, lr
+ sub r10, r0, #4
+ add r0, r10, r1, lsl #2
+ vld1.8 {d20}, [r10], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r10], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r10], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r10], r1
+ vld1.8 {d27}, [r0], r1
+ add r0, r0, #4
+
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ lpf_8_wd6
+
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+function lpf_v_8_8_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #2
+ vld1.8 {d20}, [r10, :64], r1 // p3
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d27}, [r0, :64], r1 // q3
+ sub r0, r0, r1, lsl #2
+
+ lpf_8_wd8
+
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vst1.8 {d21}, [r10, :64], r1 // p2
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_8_8_neon
+ mov r12, lr
+ sub r10, r0, #4
+ add r0, r10, r1, lsl #2
+ vld1.8 {d20}, [r10], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r10], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r10], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r10], r1
+ vld1.8 {d27}, [r0], r1
+ add r0, r0, #4
+
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ lpf_8_wd8
+
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #4
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+ add r0, r10, r1, lsl #2
+
+ vst1.8 {d20}, [r10], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r10], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r10], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r10], r1
+ vst1.8 {d27}, [r0], r1
+ add r0, r0, #4
+ bx r12
+8:
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+function lpf_v_16_8_neon
+ mov r12, lr
+
+ sub r10, r0, r1, lsl #3
+ add r10, r10, r1
+ vld1.8 {d17}, [r10, :64], r1 // p6
+ vld1.8 {d24}, [r0, :64], r1 // q0
+ vld1.8 {d18}, [r10, :64], r1 // p5
+ vld1.8 {d25}, [r0, :64], r1 // q1
+ vld1.8 {d19}, [r10, :64], r1 // p4
+ vld1.8 {d26}, [r0, :64], r1 // q2
+ vld1.8 {d20}, [r10, :64], r1 // p3
+ vld1.8 {d27}, [r0, :64], r1 // q3
+ vld1.8 {d21}, [r10, :64], r1 // p2
+ vld1.8 {d28}, [r0, :64], r1 // q4
+ vld1.8 {d22}, [r10, :64], r1 // p1
+ vld1.8 {d29}, [r0, :64], r1 // q5
+ vld1.8 {d23}, [r10, :64], r1 // p0
+ vld1.8 {d30}, [r0, :64], r1 // q6
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+ lpf_8_wd16
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, r1, lsl #1
+ vst1.8 {d0}, [r10, :64], r1 // p5
+ vst1.8 {d6}, [r0, :64], r1 // q0
+ vst1.8 {d1}, [r10, :64], r1 // p4
+ vst1.8 {d7}, [r0, :64], r1 // q1
+ vst1.8 {d2}, [r10, :64], r1 // p3
+ vst1.8 {d8}, [r0, :64], r1 // q2
+ vst1.8 {d3}, [r10, :64], r1 // p2
+ vst1.8 {d9}, [r0, :64], r1 // q3
+ vst1.8 {d4}, [r10, :64], r1 // p1
+ vst1.8 {d10}, [r0, :64], r1 // q4
+ vst1.8 {d5}, [r10, :64], r1 // p0
+ vst1.8 {d11}, [r0, :64], r1 // q5
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ bx r12
+7:
+ sub r10, r0, r1
+ sub r10, r10, r1, lsl #1
+ vst1.8 {d21}, [r10, :64], r1 // p2
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.8 {d22}, [r10, :64], r1 // p1
+ vst1.8 {d24}, [r0, :64], r1 // q0
+ vst1.8 {d23}, [r10, :64], r1 // p0
+ vst1.8 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_16_8_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.8 {d16}, [r10, :64], r1
+ vld1.8 {d24}, [r0, :64], r1
+ vld1.8 {d17}, [r10, :64], r1
+ vld1.8 {d25}, [r0, :64], r1
+ vld1.8 {d18}, [r10, :64], r1
+ vld1.8 {d26}, [r0, :64], r1
+ vld1.8 {d19}, [r10, :64], r1
+ vld1.8 {d27}, [r0, :64], r1
+ vld1.8 {d20}, [r10, :64], r1
+ vld1.8 {d28}, [r0, :64], r1
+ vld1.8 {d21}, [r10, :64], r1
+ vld1.8 {d29}, [r0, :64], r1
+ vld1.8 {d22}, [r10, :64], r1
+ vld1.8 {d30}, [r0, :64], r1
+ vld1.8 {d23}, [r10, :64], r1
+ vld1.8 {d31}, [r0, :64], r1
+
+ transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+ transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31
+
+ lpf_8_wd16
+
+ sub r0, r0, r1, lsl #3
+ sub r10, r0, #8
+
+ transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5
+ transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31
+
+ vst1.8 {d16}, [r10, :64], r1
+ vst1.8 {d6}, [r0, :64], r1
+ vst1.8 {d17}, [r10, :64], r1
+ vst1.8 {d7}, [r0, :64], r1
+ vst1.8 {d0}, [r10, :64], r1
+ vst1.8 {d8}, [r0, :64], r1
+ vst1.8 {d1}, [r10, :64], r1
+ vst1.8 {d9}, [r0, :64], r1
+ vst1.8 {d2}, [r10, :64], r1
+ vst1.8 {d10}, [r0, :64], r1
+ vst1.8 {d3}, [r10, :64], r1
+ vst1.8 {d11}, [r0, :64], r1
+ vst1.8 {d4}, [r10, :64], r1
+ vst1.8 {d30}, [r0, :64], r1
+ vst1.8 {d5}, [r10, :64], r1
+ vst1.8 {d31}, [r0, :64], r1
+ bx r12
+
+7:
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #4
+ transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+ add r0, r10, r1, lsl #2
+
+ vst1.8 {d20}, [r10], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r10], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r10], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r10], r1
+ vst1.8 {d27}, [r0], r1
+ add r0, r0, #4
+ bx r12
+8:
+ sub r10, r0, r1, lsl #3
+ sub r10, r10, #2
+ transpose_4x8b q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #2
+
+ vst1.32 {d22[0]}, [r10], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r10], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r10], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r10], r1
+ vst1.32 {d25[1]}, [r0], r1
+ add r0, r0, #2
+ bx r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w)
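+//
+// Each iteration of the loop below covers 8 pixels of the edge (two 4 pixel
+// filter units, i.e. two bits of each vmask word): vmask[2] (luma only)
+// selects the wd16 filter, vmask[1] the wd8 (luma) or wd6 (chroma) filter,
+// and vmask[0] whether a unit is filtered with at least the wd4 filter.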
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [r2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr r2, [r2, #8] // vmask[2]
+.endif
+ add r5, r5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr r7, r7, r2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub r4, r3, r4, lsl #2
+.else
+ sub r3, r3, #4
+ lsl r4, r4, #2
+.endif
+ orr r6, r6, r7 // vmask[0] |= vmask[1]
+
+1:
+ tst r6, #0x03
+.ifc \dir, v
+ vld1.8 {d0}, [r4]!
+ vld1.8 {d1}, [r3]!
+.else
+ vld2.32 {d0[0], d1[0]}, [r3], r4
+ vld2.32 {d0[1], d1[1]}, [r3], r4
+.endif
+ beq 7f // if (!(vm & bits)) continue;
+
+ vld1.8 {d5[]}, [r5] // sharp[0]
+ add r5, r5, #8
+ vmov.i32 d2, #0xff
+ vdup.32 d13, r6 // vmask[0]
+
+ vand d0, d0, d2 // Keep only lowest byte in each 32 bit word
+ vand d1, d1, d2
+ vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0]
+ vmov.i8 d4, #1
+ vld1.8 {d6[]}, [r5] // sharp[1]
+ sub r5, r5, #8
+ vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0]
+ vtst.32 d2, d1, d2 // L != 0
+ vmul.i32 d1, d1, d4 // L
+.ifc \type, y
+ vdup.32 d15, r2 // vmask[2]
+.endif
+ vdup.32 d14, r7 // vmask[1]
+ vmov r10, r11, d2
+ orrs r10, r10, r11
+ beq 7f // if (!L) continue;
+ vneg.s8 d5, d5 // -sharp[0]
+ movrel_local r10, word_12
+ vshr.u8 d12, d1, #4 // H
+ vld1.32 {d16}, [r10, :64]
+ vshl.s8 d3, d1, d5 // L >> sharp[0]
+.ifc \type, y
+ vtst.32 d15, d15, d16 // if (vmask[2] & bits)
+.endif
+ vmov.i8 d7, #2
+ vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1])
+ vadd.i8 d0, d1, d7 // L + 2
+ vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I
+ vadd.u8 d0, d0, d0 // 2*(L + 2)
+ vtst.32 d14, d14, d16 // if (vmask[1] & bits)
+ vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E
+ vtst.32 d13, d13, d16 // if (vmask[0] & bits)
+ vand d13, d13, d2 // vmask[0] &= L != 0
+
+.ifc \type, y
+ tst r2, #0x03
+ beq 2f
+ // wd16
+ bl lpf_\dir\()_16_8_neon
+ b 8f
+2:
+.endif
+ tst r7, #0x03
+ beq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_8_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_8_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_8_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment r0.
+ // If the whole function is skipped, increment it here instead.
+ add r0, r0, r1, lsl #3
+.else
+7:
+.endif
+8:
+ lsrs r6, r6, #2 // vmask[0] >>= 2
+ lsr r7, r7, #2 // vmask[1] >>= 2
+.ifc \type, y
+ lsr r2, r2, #2 // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+ add r0, r0, #8
+.else
+ // For dir h, r0 is returned incremented
+.endif
+ bne 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
+const word_12, align=4
+ .word 1, 2
+endconst
diff --git a/third_party/dav1d/src/arm/32/loopfilter16.S b/third_party/dav1d/src/arm/32/loopfilter16.S
new file mode 100644
index 0000000000..d7daf21f1a
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/loopfilter16.S
@@ -0,0 +1,859 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
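+// Loop filter core for 16 bpc, operating on 4 pixels at a time: the pixels
+// p6..p0 and q0..q6 (as far as needed for the given width) are expected in
+// d17..d23 and d24..d30, the E, I and H thresholds in d10, d11 and d12,
+// and bitdepth_max/bitdepth_min_8 in r8/r9.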
+.macro loop_filter wd
+function lpf_4_wd\wd\()_neon
+ vabd.u16 d0, d22, d23 // abs(p1 - p0)
+ vabd.u16 d1, d25, d24 // abs(q1 - q0)
+ vabd.u16 d2, d23, d24 // abs(p0 - q0)
+ vabd.u16 d3, d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+ vabd.u16 d4, d21, d22 // abs(p2 - p1)
+ vabd.u16 d5, d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ vabd.u16 d6, d20, d21 // abs(p3 - p2)
+ vabd.u16 d7, d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ vmax.u16 d4, d4, d5
+.endif
+ vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vshr.u16 d3, d3, #1
+.if \wd >= 8
+ vmax.u16 d4, d4, d6
+.endif
+ vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
+ vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ vmax.u16 d4, d0, d4
+ vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+ vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ vand d1, d1, d2 // fm && wd >= 4 (implicit)
+.if \wd >= 6
+ vmov d14, d1 // fm && wd > 4 (implicit)
+.endif
+.if \wd >= 16
+ vmov d15, d1 // fm && wd == 16 (implicit)
+.endif
+
+ vmov r10, r11, d1
+ orrs r10, r10, r11
+ beq 9f // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+ vmov.i16 d10, #1
+ vabd.u16 d2, d21, d23 // abs(p2 - p0)
+ vabd.u16 d3, d22, d23 // abs(p1 - p0)
+ vabd.u16 d4, d25, d24 // abs(q1 - q0)
+ vabd.u16 d5, d26, d24 // abs(q2 - q0)
+ vdup.16 d9, r9 // bitdepth_min_8
+.if \wd >= 8
+ vabd.u16 d6, d20, d23 // abs(p3 - p0)
+ vabd.u16 d7, d27, d24 // abs(q3 - q0)
+.endif
+ vmax.u16 d2, d2, d3
+ vmax.u16 d4, d4, d5
+.if \wd >= 8
+ vmax.u16 d6, d6, d7
+.endif
+ vmax.u16 d2, d2, d4
+ vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8
+.if \wd >= 8
+ vmax.u16 d2, d2, d6
+.endif
+
+.if \wd == 16
+ vabd.u16 d3, d17, d23 // abs(p6 - p0)
+ vabd.u16 d4, d18, d23 // abs(p5 - p0)
+ vabd.u16 d5, d19, d23 // abs(p4 - p0)
+.endif
+ vcge.u16 d2, d10, d2 // flat8in
+.if \wd == 16
+ vabd.u16 d6, d28, d24 // abs(q4 - q0)
+ vabd.u16 d7, d29, d24 // abs(q5 - q0)
+ vabd.u16 d8, d30, d24 // abs(q6 - q0)
+.endif
+ vand d14, d2, d14 // flat8in && fm && wd > 4
+ vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ vmax.u16 d3, d3, d4
+ vmax.u16 d5, d5, d6
+.endif
+ vmov r10, r11, d1
+.if \wd == 16
+ vmax.u16 d7, d7, d8
+ vmax.u16 d3, d3, d5
+ vmax.u16 d3, d3, d7
+ vcge.u16 d3, d10, d3 // flat8out
+.endif
+ orrs r10, r10, r11
+.if \wd == 16
+ vand d15, d15, d3 // flat8out && fm && wd == 16
+ vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+ vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ beq 1f // skip wd == 4 case
+.endif
+
+ vdup.16 d3, r8 // bitdepth_max
+ vsub.u16 d2, d22, d25 // p1 - q1
+ vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1
+ vcgt.u16 d0, d0, d12 // hev
+ vmvn d9, d3 // - 128 * (1 << bitdepth_min_8)
+ vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1)
+ vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1)
+ vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
+ vsub.u16 d2, d24, d23
+ vmov.i16 d6, #3
+ vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
+ vmul.i16 d2, d2, d6
+ vmov.i16 d7, #4
+ vadd.i16 d2, d2, d4
+ vmin.s16 d2, d2, d3 // f = iclip_diff()
+ vmax.s16 d2, d2, d9 // f = iclip_diff()
+ vqadd.s16 d4, d7, d2 // f + 4
+ vqadd.s16 d5, d6, d2 // f + 3
+ vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1)
+ vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1)
+ vshr.s16 d4, d4, #3 // f1
+ vshr.s16 d5, d5, #3 // f2
+ vmov.i16 d9, #0
+ vdup.16 d3, r8 // bitdepth_max
+ vqadd.s16 d2, d23, d5 // p0 + f2
+ vqsub.s16 d6, d24, d4 // q0 - f1
+ vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1
+ vmin.s16 d2, d2, d3 // out p0 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q0 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p0 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q0 = iclip_pixel()
+ vbit d23, d2, d1 // if (fm && wd >= 4)
+ vbit d24, d6, d1 // if (fm && wd >= 4)
+ vqadd.s16 d2, d22, d4 // p1 + f
+ vqsub.s16 d6, d25, d4 // q1 - f
+ vmin.s16 d2, d2, d3 // out p1 = iclip_pixel()
+ vmin.s16 d6, d6, d3 // out q1 = iclip_pixel()
+ vmax.s16 d2, d2, d9 // out p1 = iclip_pixel()
+ vmax.s16 d6, d6, d9 // out q1 = iclip_pixel()
+ vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
+ vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 2f // skip if there's no flat8in
+
+ vadd.i16 d0, d21, d21 // p2 * 2
+ vadd.i16 d2, d21, d22 // p2 + p1
+ vadd.i16 d4, d22, d23 // p1 + p0
+ vadd.i16 d6, d23, d24 // p0 + q0
+ vadd.i16 d8, d0, d2
+ vadd.i16 d10, d4, d6
+ vadd.i16 d12, d24, d25 // q0 + q1
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d0
+ vadd.i16 d10, d25, d26 // q1 + q2
+ vrshr.u16 d0, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d12
+ vsub.i16 d10, d10, d2
+ vadd.i16 d12, d26, d26 // q2 + q2
+ vrshr.u16 d1, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d10
+ vsub.i16 d12, d12, d4
+ vrshr.u16 d2, d8, #3 // out q0
+
+ vbit d22, d0, d14 // p1 if (flat8in)
+ vadd.i16 d8, d8, d12
+ vbit d23, d1, d14 // p0 if (flat8in)
+ vrshr.u16 d3, d8, #3 // out q1
+ vbit d24, d2, d14 // q0 if (flat8in)
+ vbit d25, d3, d14 // q1 if (flat8in)
+.elseif \wd >= 8
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+.if \wd == 8
+ beq 8f // skip if there's no flat8in
+.else
+ beq 2f // skip if there's no flat8in
+.endif
+
+ vadd.i16 d0, d20, d21 // p3 + p2
+ vadd.i16 d2, d22, d25 // p1 + q1
+ vadd.i16 d4, d20, d22 // p3 + p1
+ vadd.i16 d6, d23, d26 // p0 + q2
+ vadd.i16 d8, d0, d0 // 2 * (p3 + p2)
+ vadd.i16 d9, d23, d24 // p0 + q0
+ vadd.i16 d8, d8, d4 // + p3 + p1
+ vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2
+ vadd.i16 d8, d8, d9 // + p0 + q0
+ vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1
+ vrshr.u16 d10, d8, #3 // out p2
+
+ vadd.i16 d8, d8, d2
+ vadd.i16 d0, d20, d23 // p3 + p0
+ vadd.i16 d2, d24, d27 // q0 + q3
+ vrshr.u16 d11, d8, #3 // out p1
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0
+ vadd.i16 d4, d21, d24 // p2 + q0
+ vadd.i16 d6, d25, d27 // q1 + q3
+ vrshr.u16 d12, d8, #3 // out p0
+
+ vadd.i16 d8, d8, d2
+ vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0
+ vadd.i16 d0, d22, d25 // p1 + q1
+ vadd.i16 d2, d26, d27 // q2 + q3
+ vrshr.u16 d13, d8, #3 // out q0
+
+ vadd.i16 d8, d8, d6
+ vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1
+ vrshr.u16 d0, d8, #3 // out q1
+
+ vadd.i16 d8, d8, d2
+
+ vbit d21, d10, d14
+ vbit d22, d11, d14
+ vbit d23, d12, d14
+ vrshr.u16 d1, d8, #3 // out q2
+ vbit d24, d13, d14
+ vbit d25, d0, d14
+ vbit d26, d1, d14
+.endif
+2:
+.if \wd == 16
+ vmov r10, r11, d15
+ orrs r10, r10, r11
+ bne 1f // check if flat8out is needed
+ vmov r10, r11, d14
+ orrs r10, r10, r11
+ beq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ vadd.i16 d2, d17, d17 // p6 + p6
+ vadd.i16 d4, d17, d18 // p6 + p5
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vadd.i16 d12, d2, d4
+ vadd.i16 d10, d6, d8
+ vadd.i16 d6, d17, d21 // p6 + p2
+ vadd.i16 d12, d12, d10
+ vadd.i16 d8, d17, d22 // p6 + p1
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vadd.i16 d6, d6, d8
+ vadd.i16 d8, d19, d24 // p4 + q0
+ vadd.i16 d12, d12, d6
+ vadd.i16 d10, d10, d8
+ vadd.i16 d6, d20, d25 // p3 + q1
+ vadd.i16 d12, d12, d10
+ vsub.i16 d6, d6, d2
+ vadd.i16 d2, d21, d26 // p2 + q2
+ vrshr.u16 d0, d12, #4 // out p5
+ vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1)
+ vsub.i16 d2, d2, d4
+ vadd.i16 d4, d22, d27 // p1 + q3
+ vadd.i16 d6, d17, d19 // p6 + p4
+ vrshr.u16 d1, d12, #4 // out p4
+ vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2)
+ vsub.i16 d4, d4, d6
+ vadd.i16 d6, d23, d28 // p0 + q4
+ vadd.i16 d8, d17, d20 // p6 + p3
+ vrshr.u16 d2, d12, #4 // out p3
+ vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3)
+ vsub.i16 d6, d6, d8
+ vadd.i16 d8, d24, d29 // q0 + q5
+ vadd.i16 d4, d17, d21 // p6 + p2
+ vrshr.u16 d3, d12, #4 // out p2
+ vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4)
+ vsub.i16 d8, d8, d4
+ vadd.i16 d6, d25, d30 // q1 + q6
+ vadd.i16 d10, d17, d22 // p6 + p1
+ vrshr.u16 d4, d12, #4 // out p1
+ vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5)
+ vsub.i16 d6, d6, d10
+ vadd.i16 d8, d26, d30 // q2 + q6
+ vbif d0, d18, d15 // out p5
+ vadd.i16 d10, d18, d23 // p5 + p0
+ vrshr.u16 d5, d12, #4 // out p0
+ vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6)
+ vsub.i16 d8, d8, d10
+ vadd.i16 d10, d27, d30 // q3 + q6
+ vbif d1, d19, d15 // out p4
+ vadd.i16 d18, d19, d24 // p4 + q0
+ vrshr.u16 d6, d12, #4 // out q0
+ vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6)
+ vsub.i16 d10, d10, d18
+ vadd.i16 d8, d28, d30 // q4 + q6
+ vbif d2, d20, d15 // out p3
+ vadd.i16 d18, d20, d25 // p3 + q1
+ vrshr.u16 d7, d12, #4 // out q1
+ vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6)
+ vsub.i16 d18, d8, d18
+ vadd.i16 d10, d29, d30 // q5 + q6
+ vbif d3, d21, d15 // out p2
+ vadd.i16 d20, d21, d26 // p2 + q2
+ vrshr.u16 d8, d12, #4 // out q2
+ vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6)
+ vsub.i16 d10, d10, d20
+ vadd.i16 d18, d30, d30 // q6 + q6
+ vbif d4, d22, d15 // out p1
+ vadd.i16 d20, d22, d27 // p1 + q3
+ vrshr.u16 d9, d12, #4 // out q3
+ vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6)
+ vsub.i16 d18, d18, d20
+ vbif d5, d23, d15 // out p0
+ vrshr.u16 d10, d12, #4 // out q4
+ vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6)
+ vrshr.u16 d11, d12, #4 // out q5
+ vbif d6, d24, d15 // out q0
+ vbif d7, d25, d15 // out q1
+ vbif d8, d26, d15 // out q2
+ vbif d9, d27, d15 // out q3
+ vbif d10, d28, d15 // out q4
+ vbif d11, d29, d15 // out q5
+.endif
+
+ bx lr
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ bx r6
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ bx r7
+.endif
+9:
+ // Return directly without writing back any pixels
+ bx r12
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_4_wd16
+ adr r6, 7f + CONFIG_THUMB
+ adr r7, 8f + CONFIG_THUMB
+ bl lpf_4_wd16_neon
+.endm
+
+.macro lpf_4_wd8
+ adr r7, 8f + CONFIG_THUMB
+ bl lpf_4_wd8_neon
+.endm
+
+.macro lpf_4_wd6
+ bl lpf_4_wd6_neon
+.endm
+
+.macro lpf_4_wd4
+ bl lpf_4_wd4_neon
+.endm
+
+function lpf_v_4_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_4_4_neon
+ mov r12, lr
+ sub r10, r0, #4
+ add r0, r10, r1, lsl #1
+ vld1.16 {d22}, [r10], r1
+ vld1.16 {d24}, [r0], r1
+ vld1.16 {d23}, [r10], r1
+ vld1.16 {d25}, [r0], r1
+ add r0, r0, #4
+
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+
+ lpf_4_wd4
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ add r0, r10, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_6_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+
+ lpf_4_wd6
+
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_6_4_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.16 {d20}, [r10, :64], r1
+ vld1.16 {d24}, [r0, :64], r1
+ vld1.16 {d21}, [r10, :64], r1
+ vld1.16 {d25}, [r0, :64], r1
+ vld1.16 {d22}, [r10, :64], r1
+ vld1.16 {d26}, [r0, :64], r1
+ vld1.16 {d23}, [r10, :64], r1
+ vld1.16 {d27}, [r0, :64], r1
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ lpf_4_wd6
+
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_8_4_neon
+ mov r12, lr
+ sub r10, r0, r1, lsl #2
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ sub r0, r0, r1, lsl #2
+
+ lpf_4_wd8
+
+ sub r10, r0, r1, lsl #1
+ sub r10, r10, r1
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_8_4_neon
+ mov r12, lr
+ sub r10, r0, #8
+ vld1.16 {d20}, [r10, :64], r1
+ vld1.16 {d24}, [r0, :64], r1
+ vld1.16 {d21}, [r10, :64], r1
+ vld1.16 {d25}, [r0, :64], r1
+ vld1.16 {d22}, [r10, :64], r1
+ vld1.16 {d26}, [r0, :64], r1
+ vld1.16 {d23}, [r10, :64], r1
+ vld1.16 {d27}, [r0, :64], r1
+
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+
+ lpf_4_wd8
+
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ sub r10, r0, #8
+
+ vst1.16 {d20}, [r10, :64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r10, :64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r10, :64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r10, :64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ bx r12
+8:
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+function lpf_v_16_4_neon
+ mov r12, lr
+
+ sub r10, r0, r1, lsl #3
+ add r10, r10, r1
+ vld1.16 {d17}, [r10, :64], r1 // p6
+ vld1.16 {d24}, [r0, :64], r1 // q0
+ vld1.16 {d18}, [r10, :64], r1 // p5
+ vld1.16 {d25}, [r0, :64], r1 // q1
+ vld1.16 {d19}, [r10, :64], r1 // p4
+ vld1.16 {d26}, [r0, :64], r1 // q2
+ vld1.16 {d20}, [r10, :64], r1 // p3
+ vld1.16 {d27}, [r0, :64], r1 // q3
+ vld1.16 {d21}, [r10, :64], r1 // p2
+ vld1.16 {d28}, [r0, :64], r1 // q4
+ vld1.16 {d22}, [r10, :64], r1 // p1
+ vld1.16 {d29}, [r0, :64], r1 // q5
+ vld1.16 {d23}, [r10, :64], r1 // p0
+ vld1.16 {d30}, [r0, :64], r1 // q6
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+ lpf_4_wd16
+
+ sub r10, r0, r1, lsl #2
+ sub r10, r10, r1, lsl #1
+ vst1.16 {d0}, [r10, :64], r1 // p5
+ vst1.16 {d6}, [r0, :64], r1 // q0
+ vst1.16 {d1}, [r10, :64], r1 // p4
+ vst1.16 {d7}, [r0, :64], r1 // q1
+ vst1.16 {d2}, [r10, :64], r1 // p3
+ vst1.16 {d8}, [r0, :64], r1 // q2
+ vst1.16 {d3}, [r10, :64], r1 // p2
+ vst1.16 {d9}, [r0, :64], r1 // q3
+ vst1.16 {d4}, [r10, :64], r1 // p1
+ vst1.16 {d10}, [r0, :64], r1 // q4
+ vst1.16 {d5}, [r10, :64], r1 // p0
+ vst1.16 {d11}, [r0, :64], r1 // q5
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ bx r12
+7:
+ sub r10, r0, r1
+ sub r10, r10, r1, lsl #1
+ vst1.16 {d21}, [r10, :64], r1 // p2
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d26}, [r0, :64], r1 // q2
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx r12
+
+8:
+ sub r10, r0, r1, lsl #1
+ vst1.16 {d22}, [r10, :64], r1 // p1
+ vst1.16 {d24}, [r0, :64], r1 // q0
+ vst1.16 {d23}, [r10, :64], r1 // p0
+ vst1.16 {d25}, [r0, :64], r1 // q1
+ sub r0, r0, r1, lsl #1
+ bx r12
+endfunc
+
+function lpf_h_16_4_neon
+ mov r12, lr
+ sub r10, r0, #16
+ sub r0, r0, #8
+ vld1.16 {d16}, [r10, :64], r1
+ vld1.16 {d20}, [r0, :64], r1
+ vld1.16 {d17}, [r10, :64], r1
+ vld1.16 {d21}, [r0, :64], r1
+ vld1.16 {d18}, [r10, :64], r1
+ vld1.16 {d22}, [r0, :64], r1
+ vld1.16 {d19}, [r10, :64], r1
+ vld1.16 {d23}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16
+ add r0, r0, #16
+ vld1.16 {d24}, [r10, :64], r1
+ vld1.16 {d28}, [r0, :64], r1
+ vld1.16 {d25}, [r10, :64], r1
+ vld1.16 {d29}, [r0, :64], r1
+ vld1.16 {d26}, [r10, :64], r1
+ vld1.16 {d30}, [r0, :64], r1
+ vld1.16 {d27}, [r10, :64], r1
+ vld1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8
+
+ transpose_4x4h q8, q9, d16, d17, d18, d19
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ transpose_4x4h q14, q15, d28, d29, d30, d31
+
+ lpf_4_wd16
+
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q8, q0, d16, d17, d0, d1
+ transpose_4x4h q1, q2, d2, d3, d4, d5
+ transpose_4x4h q3, q4, d6, d7, d8, d9
+ transpose_4x4h q5, q15, d10, d11, d30, d31
+ sub r10, r0, #16
+ sub r0, r0, #8
+
+ vst1.16 {d16}, [r10, :64], r1
+ vst1.16 {d2}, [r0, :64], r1
+ vst1.16 {d17}, [r10, :64], r1
+ vst1.16 {d3}, [r0, :64], r1
+ vst1.16 {d0}, [r10, :64], r1
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d1}, [r10, :64], r1
+ vst1.16 {d5}, [r0, :64], r1
+ sub r10, r10, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r10, r10, #16
+ add r0, r0, #16
+ vst1.16 {d6}, [r10, :64], r1
+ vst1.16 {d10}, [r0, :64], r1
+ vst1.16 {d7}, [r10, :64], r1
+ vst1.16 {d11}, [r0, :64], r1
+ vst1.16 {d8}, [r10, :64], r1
+ vst1.16 {d30}, [r0, :64], r1
+ vst1.16 {d9}, [r10, :64], r1
+ vst1.16 {d31}, [r0, :64], r1
+ sub r0, r0, #8
+
+ bx r12
+
+7:
+ sub r0, r0, r1, lsl #2
+ transpose_4x4h q10, q11, d20, d21, d22, d23
+ transpose_4x4h q12, q13, d24, d25, d26, d27
+ sub r10, r0, #8
+
+ vst1.16 {d20}, [r10, :64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r10, :64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r10, :64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r10, :64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ bx r12
+8:
+ sub r0, r0, #4
+ transpose_4x4h q11, q12, d22, d23, d24, d25
+ sub r10, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+
+ vst1.16 {d22}, [r10], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r10], r1
+ vst1.16 {d25}, [r0], r1
+ add r0, r0, #4
+ bx r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w,
+// const int bitdepth_max)
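+//
+// A rough C sketch (imin/imax are assumed helper names) of how the scalar
+// setup below turns the filter level L and the sharpness entries of the lut
+// into the per-edge thresholds, before scaling them for the bitdepth:
+//   int L = l[0][0] ? l[0][0] : l[offset][0];
+//   int I = imax(imin(L >> sharp[0], sharp[1]), 1); // limit
+//   int E = 2 * (L + 2) + I;                        // edge threshold
+//   int H = L >> 4;                                 // hev threshold
+//   E <<= bitdepth_min_8; I <<= bitdepth_min_8; H <<= bitdepth_min_8;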
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded
+ sub sp, sp, #8
+ clz r9, r8
+ rsb r9, r9, #24 // bitdepth_min_8
+ ldrd r6, r7, [r2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr r2, [r2, #8] // vmask[2]
+.endif
+ add r5, r5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr r7, r7, r2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub r4, r3, r4, lsl #2
+.else
+ sub r3, r3, #4
+ lsl r4, r4, #2
+.endif
+ orr r6, r6, r7 // vmask[0] |= vmask[1]
+
+1:
+ tst r6, #0x01
+ strd r6, r7, [sp]
+.ifc \dir, v
+ ldrb r10, [r4], #4
+ ldrb r11, [r3], #4
+.else
+ ldrb r10, [r3]
+ ldrb r11, [r3, #4]
+ add r3, r3, r4
+.endif
+ beq 7f // if (!(vm & bits)) continue;
+
+ orrs r12, r10, r11
+ vdup.16 d31, r9 // bitdepth_min_8
+ beq 7f // if (!(l[0][0] | l[offset][0])) continue;
+ cmp r11, #0 // Check for nonzero values in l[0][0]
+ ldrb r6, [r5], #8 // sharp[0]
+ it eq
+ moveq r11, r10 // if (!l[0][0]) L = l[offset][0]
+ ldrb r12, [r5] // sharp[1]
+ lsr r6, r11, r6 // L >> sharp[0]
+ sub r5, r5, #8
+ cmp r12, r6
+ lsr r10, r11, #4 // H
+ add r11, r11, #2 // L + 2
+ it lt
+ movlt r6, r12 // imin(L >> sharp[0], sharp[1])
+ add r11, r11, r11 // 2*(L + 2)
+ cmp r6, #1
+ lsl r10, r10, r9 // H << bitdepth_min_8
+ it lt
+ movlt r6, #1 // imax(imin(), 1) = limit = I
+ vdup.16 d12, r10 // H << bitdepth_min_8
+ add r11, r11, r6 // 2*(L + 2) + limit = E
+ lsl r6, r6, r9 // I << bitdepth_min_8
+ lsl r11, r11, r9 // E << bitdepth_min_8
+ vdup.16 d11, r6 // I << bitdepth_min_8
+ vdup.16 d10, r11 // E << bitdepth_min_8
+
+.ifc \type, y
+ tst r2, #0x01
+ beq 2f
+ // wd16
+ bl lpf_\dir\()_16_4_neon
+ b 8f
+2:
+.endif
+ tst r7, #0x01
+ beq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_4_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_4_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_4_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment r0.
+ // If the whole function is skipped, increment it here instead.
+ add r0, r0, r1, lsl #2
+.else
+7:
+.endif
+8:
+ ldrd r6, r7, [sp]
+.ifc \type, y
+ lsr r2, r2, #1 // vmask[2] >>= 1
+.endif
+.ifc \dir, v
+ add r0, r0, #8
+.else
+ // For dir h, r0 is returned incremented
+.endif
+ lsrs r6, r6, #1 // vmask[0] >>= 1
+ lsr r7, r7, #1 // vmask[1] >>= 1
+ bne 1b
+
+ add sp, sp, #8
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
diff --git a/third_party/dav1d/src/arm/32/looprestoration.S b/third_party/dav1d/src/arm/32/looprestoration.S
new file mode 100644
index 0000000000..be5c658d6d
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration.S
@@ -0,0 +1,791 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
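+
+// The buffer above is 32 bytes of 0x00 followed by 32 bytes of 0xff. Loading
+// a vector from right_ext_mask - w gives a mask whose first w bytes are 0x00
+// and whose remaining bytes are 0xff; vbit then overwrites exactly the lanes
+// from w onwards with the right padding pixel (the callers below fuse extra
+// offsets, and a *2 for 16-bit elements, into the pointer). In rough C terms
+// (pad_right is a hypothetical name), the masked vbit is equivalent to:
+//   static void pad_right(uint8_t *row, int w, int n, uint8_t pad) {
+//       for (int i = w; i < n; i++)
+//           row[i] = pad; // keep the first w lanes, replace the rest
+//   }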
+
+// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[8], intptr_t w,
+// int h, enum LrEdgeFlags edges);
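+//
+// The 7-tap Wiener filter is even-symmetric, so the code adds the mirrored
+// pixel pairs first and only multiplies by four distinct taps. Two input
+// rows are filtered per iteration, and the intermediate rows are stored as
+// int16 with a stride of 2*((w+7)&~7) bytes for the vertical pass. A loose
+// per-pixel C sketch (the exact scaling/bias is the <<7 / >>3 / +2048
+// sequence in the loop below; scale_and_bias is a hypothetical helper):
+//   int sum = 0;
+//   for (int i = 0; i < 7; i++)
+//       sum += fh[i] * src[x + i - 3];
+//   mid[x] = scale_and_bias(sum);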
+function wiener_filter_h_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ mov r8, r5
+ vld1.16 {q0}, [r4, :128]
+ movw r9, #(1 << 14) - (1 << 2)
+ vdup.16 q14, r9
+ vmov.s16 q15, #2048
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7
+ lsl r10, r10, #1
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10
+ lsl r10, r10, #1
+ add lr, r2, r3
+ lsl r3, r3, #1
+
+ // Subtract the aligned width from mid_stride
+ add r11, r5, #7
+ bic r11, r11, #7
+ sub r10, r10, r11, lsl #1
+
+ // Subtract the number of pixels read from the source stride
+ add r11, r11, #8
+ sub r3, r3, r11
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #3
+ sub lr, lr, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #3
+
+
+1: // Loop vertically
+ vld1.8 {q2}, [r2]!
+ vld1.8 {q9}, [lr]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[1]}, [r1]!
+ // Move r2/lr back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #3
+ sub lr, lr, #3
+ vld1.32 {d17[1]}, [r1]!
+ vext.8 q2, q1, q2, #13
+ vext.8 q9, q8, q9, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q2 to have 3x the first byte at the front.
+ vdup.8 q1, d4[0]
+ vdup.8 q8, d18[0]
+ // Move r2 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub r2, r2, #3
+ sub lr, lr, #3
+ vext.8 q2, q1, q2, #13
+ vext.8 q9, q8, q9, #13
+
+2:
+ vmovl.u8 q1, d4
+ vmovl.u8 q2, d5
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14
+ ldrb r11, [r2, r9]
+ ldrb r9, [lr, r9]
+ // Fill q12/q13 with the right padding pixel
+ vdup.16 q12, r11
+ vdup.16 q13, r9
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel_local r4, right_ext_mask, -6
+ sub r4, r4, r5, lsl #1
+ vld1.8 {q10, q11}, [r4]
+
+ vbit q1, q12, q10
+ vbit q2, q12, q11
+ vbit q8, q13, q10
+ vbit q9, q13, q11
+
+4: // Loop horizontally
+ vext.8 q11, q1, q2, #4
+ vext.8 q5, q1, q2, #8
+ vext.8 q10, q1, q2, #2
+ vext.8 q6, q1, q2, #10
+ vext.8 q7, q1, q2, #12
+ vext.8 q4, q1, q2, #6
+ vadd.i16 q5, q5, q11
+ vadd.i16 q6, q6, q10
+ vadd.i16 q7, q7, q1
+ vmul.s16 q3, q4, d0[3]
+ vmla.s16 q3, q5, d1[0]
+ vmla.s16 q3, q6, d1[1]
+ vmla.s16 q3, q7, d1[2]
+
+ vext.8 q4, q8, q9, #4
+ vext.8 q6, q8, q9, #8
+ vext.8 q11, q8, q9, #2
+ vext.8 q7, q8, q9, #10
+ vadd.i16 q6, q6, q4
+ vext.8 q4, q8, q9, #12
+ vext.8 q5, q8, q9, #6
+ vadd.i16 q7, q7, q11
+ vadd.i16 q4, q4, q8
+ vmul.s16 q10, q5, d0[3]
+ vmla.s16 q10, q6, d1[0]
+ vmla.s16 q10, q7, d1[1]
+ vmla.s16 q10, q4, d1[2]
+
+ vext.8 q1, q1, q2, #6
+ vext.8 q8, q8, q9, #6
+ vshl.s16 q1, q1, #7
+ vshl.s16 q8, q8, #7
+ vsub.s16 q1, q1, q14
+ vsub.s16 q8, q8, q14
+ vqadd.s16 q3, q3, q1
+ vqadd.s16 q10, q10, q8
+ vshr.s16 q3, q3, #3
+ vshr.s16 q10, q10, #3
+ vadd.s16 q3, q3, q15
+ vadd.s16 q10, q10, q15
+ subs r5, r5, #8
+ vst1.16 {q3}, [r0, :128]!
+ vst1.16 {q10}, [r12, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q1, q2
+ vmov q8, q9
+ vld1.8 {d4}, [r2]!
+ vld1.8 {d18}, [lr]!
+ vmovl.u8 q2, d4
+ vmovl.u8 q9, d18
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[8], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride);
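+//
+// The vertical pass applies the same even-symmetric 7-tap filter across the
+// int16 rows written by the horizontal pass, then narrows with a rounding
+// shift and saturates to 8-bit output. Roughly, per pixel (clip_u8 and the
+// mid_row indexing are sketch-level assumptions):
+//   int sum = 0;
+//   for (int i = 0; i < 7; i++)
+//       sum += fv[i] * mid_row[i][x];  // the 7 rows centered on the output
+//   dst[x] = clip_u8((sum + (1 << 10)) >> 11); // cf. vqrshrun #11 below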
+function wiener_filter_v_8bpc_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q6}
+ ldrd r4, r5, [sp, #68]
+ ldrd r6, r7, [sp, #76]
+ mov lr, r4
+ vld1.16 {q0}, [r5, :128]
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8
+ vld1.16 {q11}, [r2, :128], r7
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ vadd.i16 q4, q10, q12
+ vadd.i16 q5, q9, q13
+ vadd.i16 q6, q8, q14
+ vmull.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d8, d1[0]
+ vmlal.s16 q2, d10, d1[1]
+ vmlal.s16 q2, d12, d1[2]
+ vmull.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d9, d1[0]
+ vmlal.s16 q3, d11, d1[1]
+ vmlal.s16 q3, d13, d1[2]
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vqmovun.s16 d4, q2
+ vst1.8 {d4}, [r0, :64], r1
+.if \compare
+ cmp r4, #4
+.else
+ ble 9f
+.endif
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+ vmov q13, q12
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-q15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0
+ mls r2, r7, r12, r2
+ add r0, r0, #8
+ add r2, r2, #16
+ mov r4, lr
+ b 1b
+
+0:
+ vpop {q4-q6}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
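+//
+// Per output element this is a horizontal 3-tap box sum of the pixels and of
+// their squares, written to SUM_STRIDE-spaced rows, two input rows per
+// iteration (the box5 variant below does the same with 5 taps). In rough C,
+// for one row and ignoring the left/right edge handling done around it:
+//   sum[x]   = s[x - 1] + s[x] + s[x + 1];
+//   sumsq[x] = s[x - 1] * s[x - 1] + s[x] * s[x] + s[x + 1] * s[x + 1];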
+function sgr_box3_h_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, lr, #8
+ sub r4, r4, lr
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #2
+ sub r12, r12, #2
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #2
+
+
+1: // Loop vertically
+ vld1.8 {q0}, [r3]!
+ vld1.8 {q4}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[]}, [r2]!
+ // Move r3/r12 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #2
+ sub r12, r12, #2
+ vld1.32 {d11[]}, [r2]!
+ vext.8 q0, q1, q0, #14
+ vext.8 q4, q5, q4, #14
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q0 to have 2x the first byte at the front.
+ vdup.8 q1, d0[0]
+ vdup.8 q5, d8[0]
+ // Move r3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub r3, r3, #2
+ sub r12, r12, #2
+ vext.8 q0, q1, q0, #14
+ vext.8 q4, q5, q4, #14
+
+2:
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1)
+ ldrb r11, [r3, lr]
+ ldrb lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.8 q14, r11
+ vdup.8 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
+ // again; it's not strictly needed in that case (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in q0/4.b[w] onwards
+ movrel_local lr, right_ext_mask
+ sub lr, lr, r5
+ vld1.8 {q13}, [lr]
+
+ vbit q0, q14, q13
+ vbit q4, q15, q13
+
+ // Update the precalculated squares
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+4: // Loop horizontally
+ vext.8 d16, d0, d1, #1
+ vext.8 d17, d0, d1, #2
+ vext.8 d18, d8, d9, #1
+ vext.8 d19, d8, d9, #2
+ vaddl.u8 q3, d0, d16
+ vaddw.u8 q3, q3, d17
+ vaddl.u8 q7, d8, d18
+ vaddw.u8 q7, q7, d19
+
+ vext.8 q8, q1, q2, #2
+ vext.8 q9, q1, q2, #4
+ vext.8 q10, q5, q6, #2
+ vext.8 q11, q5, q6, #4
+
+ vaddl.u16 q12, d2, d16
+ vaddl.u16 q13, d3, d17
+ vaddw.u16 q12, q12, d18
+ vaddw.u16 q13, q13, d19
+
+ vaddl.u16 q8, d10, d20
+ vaddl.u16 q9, d11, d21
+ vaddw.u16 q8, q8, d22
+ vaddw.u16 q9, q9, d23
+
+ subs r5, r5, #8
+ vst1.16 {q3}, [r1, :128]!
+ vst1.16 {q7}, [r11, :128]!
+ vst1.32 {q12, q13}, [r0, :128]!
+ vst1.32 {q8, q9}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vld1.8 {d6}, [r3]!
+ vld1.8 {d14}, [r12]!
+ vmov q1, q2
+ vmov q5, q6
+ vext.8 q0, q0, q3, #8
+ vext.8 q4, q4, q7, #8
+ vmull.u8 q2, d6, d6
+ vmull.u8 q6, d14, d14
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+ add lr, lr, #8
+ sub r4, r4, lr
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #3
+ sub r12, r12, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #3
+
+1: // Loop vertically
+ vld1.8 {q0}, [r3]!
+ vld1.8 {q4}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.32 {d3[]}, [r2]!
+ // Move r3/r12 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #3
+ sub r12, r12, #3
+ vld1.32 {d11[]}, [r2]!
+ vext.8 q0, q1, q0, #13
+ vext.8 q4, q5, q4, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+ // and shift q0 to have 3x the first byte at the front.
+ vdup.8 q1, d0[0]
+ vdup.8 q5, d8[0]
+ // Move r3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub r3, r3, #3
+ sub r12, r12, #3
+ vext.8 q0, q1, q0, #13
+ vext.8 q4, q5, q4, #13
+
+2:
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 3 + 1)
+ ldrb r11, [r3, lr]
+ ldrb lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.8 q14, r11
+ vdup.8 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel_local lr, right_ext_mask, -1
+ sub lr, lr, r5
+ vld1.8 {q13}, [lr]
+
+ vbit q0, q14, q13
+ vbit q4, q15, q13
+
+ // Update the precalculated squares
+ vmull.u8 q1, d0, d0
+ vmull.u8 q2, d1, d1
+ vmull.u8 q5, d8, d8
+ vmull.u8 q6, d9, d9
+
+4: // Loop horizontally
+ vext.8 d16, d0, d1, #1
+ vext.8 d17, d0, d1, #2
+ vext.8 d18, d0, d1, #3
+ vext.8 d19, d0, d1, #4
+ vext.8 d20, d8, d9, #1
+ vext.8 d21, d8, d9, #2
+ vext.8 d22, d8, d9, #3
+ vext.8 d23, d8, d9, #4
+ vaddl.u8 q3, d0, d16
+ vaddl.u8 q12, d17, d18
+ vaddl.u8 q7, d8, d20
+ vaddl.u8 q13, d21, d22
+ vaddw.u8 q3, q3, d19
+ vaddw.u8 q7, q7, d23
+ vadd.u16 q3, q3, q12
+ vadd.u16 q7, q7, q13
+
+ vext.8 q8, q1, q2, #2
+ vext.8 q9, q1, q2, #4
+ vext.8 q10, q1, q2, #6
+ vext.8 q11, q1, q2, #8
+ vaddl.u16 q12, d2, d16
+ vaddl.u16 q13, d3, d17
+ vaddl.u16 q8, d18, d20
+ vaddl.u16 q9, d19, d21
+ vaddw.u16 q12, q12, d22
+ vaddw.u16 q13, q13, d23
+ vadd.i32 q12, q12, q8
+ vadd.i32 q13, q13, q9
+ vext.8 q8, q5, q6, #2
+ vext.8 q9, q5, q6, #4
+ vext.8 q10, q5, q6, #6
+ vext.8 q11, q5, q6, #8
+ vaddl.u16 q1, d10, d16
+ vaddl.u16 q5, d11, d17
+ vaddl.u16 q8, d18, d20
+ vaddl.u16 q9, d19, d21
+ vaddw.u16 q1, q1, d22
+ vaddw.u16 q5, q5, d23
+ vadd.i32 q10, q1, q8
+ vadd.i32 q11, q5, q9
+
+ subs r5, r5, #8
+ vst1.16 {q3}, [r1, :128]!
+ vst1.16 {q7}, [r11, :128]!
+ vst1.32 {q12, q13}, [r0, :128]!
+ vst1.32 {q10, q11}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vld1.8 {d6}, [r3]!
+ vld1.8 {d14}, [r12]!
+ vmov q1, q2
+ vmov q5, q6
+ vext.8 q0, q0, q3, #8
+ vext.8 q4, q4, q7, #8
+ vmull.u8 q2, d6, d6
+ vmull.u8 q6, d14, d14
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+sgr_funcs 8
diff --git a/third_party/dav1d/src/arm/32/looprestoration16.S b/third_party/dav1d/src/arm/32/looprestoration16.S
new file mode 100644
index 0000000000..d699617a87
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration16.S
@@ -0,0 +1,801 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+// const pixel *src, ptrdiff_t stride,
+// const int16_t fh[7], const intptr_t w,
+// int h, enum LrEdgeFlags edges,
+// const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ ldr r8, [sp, #116] // bitdepth_max
+ vld1.16 {q0}, [r4, :128]
+ clz r8, r8
+ vmov.i32 q14, #1
+ sub r9, r8, #38 // -(bitdepth + 6)
+ sub r8, r8, #25 // -round_bits_h
+ neg r9, r9 // bitdepth + 6
+ vdup.32 q1, r9
+ vdup.32 q13, r8 // -round_bits_h
+ vmov.i16 q15, #8192
+ vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
+ mov r8, r5
+ // Calculate mid_stride
+ add r10, r5, #7
+ bic r10, r10, #7
+ lsl r10, r10, #1
+
+ // Set up pointers for reading/writing alternate rows
+ add r12, r0, r10
+ lsl r10, r10, #1
+ add lr, r2, r3
+ lsl r3, r3, #1
+
+ // Subtract the aligned width from mid_stride
+ add r11, r5, #7
+ bic r11, r11, #7
+ sub r10, r10, r11, lsl #1
+
+ // Subtract the number of pixels read from the source stride
+ add r11, r11, #8
+ sub r3, r3, r11, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r1, #0
+ bne 0f
+ // left == NULL
+ sub r2, r2, #6
+ sub lr, lr, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r3, r3, #6
+
+
+1: // Loop vertically
+ vld1.16 {q2, q3}, [r2]!
+ vld1.16 {q4, q5}, [lr]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r1, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d3}, [r1]!
+ // Move r2/lr back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vld1.16 {d13}, [r1]!
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+ // and shift q2/q3 to have 3x the first pixel at the front.
+ vdup.16 q1, d4[0]
+ vdup.16 q6, d8[0]
+ // Move r2 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r2, r2, #6
+ sub lr, lr, #6
+ vext.8 q3, q2, q3, #10
+ vext.8 q2, q1, q2, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub r9, r5, #14
+ lsl r9, r9, #1
+ ldrh r11, [r2, r9]
+ ldrh r9, [lr, r9]
+ // Fill q11/q12 with the right padding pixel
+ vdup.16 q11, r11
+ vdup.16 q12, r9
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel_local r4, right_ext_mask, -6
+ sub r4, r4, r5, lsl #1
+ vld1.8 {q9, q10}, [r4]
+
+ vbit q2, q11, q9
+ vbit q3, q11, q10
+ vbit q4, q12, q9
+ vbit q5, q12, q10
+
+4: // Loop horizontally
+ vext.8 q7, q2, q3, #4
+ vext.8 q8, q2, q3, #8
+ vext.8 q6, q2, q3, #2
+ vext.8 q9, q2, q3, #10
+ vadd.i16 q8, q8, q7
+ vadd.i16 q9, q9, q6
+ vext.8 q6, q2, q3, #12
+ vext.8 q7, q2, q3, #6
+ vadd.i16 q2, q2, q6
+ vmull.s16 q6, d14, d0[3]
+ vmlal.s16 q6, d16, d1[0]
+ vmlal.s16 q6, d18, d1[1]
+ vmlal.s16 q6, d4, d1[2]
+ vmull.s16 q7, d15, d0[3]
+ vmlal.s16 q7, d17, d1[0]
+ vmlal.s16 q7, d19, d1[1]
+ vmlal.s16 q7, d5, d1[2]
+
+ vext.8 q8, q4, q5, #4
+ vext.8 q10, q4, q5, #8
+ vext.8 q9, q4, q5, #2
+ vext.8 q2, q4, q5, #10
+ vadd.i16 q10, q10, q8
+ vadd.i16 q2, q2, q9
+ vext.8 q8, q4, q5, #12
+ vext.8 q9, q4, q5, #6
+ vadd.i16 q4, q4, q8
+ vmull.s16 q8, d18, d0[3]
+ vmlal.s16 q8, d20, d1[0]
+ vmlal.s16 q8, d4, d1[1]
+ vmlal.s16 q8, d8, d1[2]
+ vmull.s16 q9, d19, d0[3]
+ vmlal.s16 q9, d21, d1[0]
+ vmlal.s16 q9, d5, d1[1]
+ vmlal.s16 q9, d9, d1[2]
+
+ vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q14
+ vadd.i32 q8, q8, q14
+ vadd.i32 q9, q9, q14
+ vrshl.s32 q6, q6, q13
+ vrshl.s32 q7, q7, q13
+ vrshl.s32 q8, q8, q13
+ vrshl.s32 q9, q9, q13
+ vqmovun.s32 d12, q6
+ vqmovun.s32 d13, q7
+ vqmovun.s32 d14, q8
+ vqmovun.s32 d15, q9
+ vmin.u16 q6, q6, q10
+ vmin.u16 q7, q7, q10
+ vsub.i16 q6, q6, q15
+ vsub.i16 q7, q7, q15
+ subs r5, r5, #8
+ vst1.16 {q6}, [r0, :128]!
+ vst1.16 {q7}, [r12, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q2, q3
+ vmov q4, q5
+ vld1.16 {q3}, [r2]!
+ vld1.16 {q5}, [lr]!
+ bne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r10
+ add r12, r12, r10
+ add r2, r2, r3
+ add lr, lr, r3
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+// const int16_t *mid, int w, int h,
+// const int16_t fv[7], enum LrEdgeFlags edges,
+// ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q5}
+ ldrd r4, r5, [sp, #52]
+ ldrd r6, r7, [sp, #60]
+ ldr lr, [sp, #68] // bitdepth_max
+ vld1.16 {q0}, [r5, :128]
+ vdup.16 q5, lr
+ clz lr, lr
+ sub lr, lr, #11 // round_bits_v
+ vdup.32 q4, lr
+ mov lr, r4
+ vneg.s32 q4, q4 // -round_bits_v
+
+ // Calculate the number of rows to move back when looping vertically
+ mov r12, r4
+ tst r6, #4 // LR_HAVE_TOP
+ beq 0f
+ sub r2, r2, r7, lsl #1
+ add r12, r12, #2
+0:
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+ // Load rows into q8-q11 and pad properly.
+ tst r6, #4 // LR_HAVE_TOP
+ vld1.16 {q8}, [r2, :128], r7
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.16 {q10}, [r2, :128], r7
+ vmov q9, q8
+ vld1.16 {q11}, [r2, :128], r7
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+
+3:
+ cmp r4, #4
+ blt 5f
+ // Start filtering normally; fill in q12-q14 with unique rows.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+ subs r4, r4, #1
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d20, d0[2]
+ vmlal.s16 q2, d22, d0[3]
+ vmlal.s16 q2, d24, d1[0]
+ vmlal.s16 q2, d26, d1[1]
+ vmlal.s16 q2, d28, d1[2]
+ vmull.s16 q3, d17, d0[0]
+ vmlal.s16 q3, d19, d0[1]
+ vmlal.s16 q3, d21, d0[2]
+ vmlal.s16 q3, d23, d0[3]
+ vmlal.s16 q3, d25, d1[0]
+ vmlal.s16 q3, d27, d1[1]
+ vmlal.s16 q3, d29, d1[2]
+ vrshl.s32 q2, q2, q4 // round_bits_v
+ vrshl.s32 q3, q3, q4
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q5 // bitdepth_max
+ vst1.16 {q2}, [r0, :128], r1
+.if \compare
+ cmp r4, #4
+.else
+ ble 9f
+.endif
+ vmov q8, q9
+ vmov q9, q10
+ vmov q10, q11
+ vmov q11, q12
+ vmov q12, q13
+ vmov q13, q14
+.endm
+ filter 1
+ blt 7f
+ vld1.16 {q14}, [r2, :128], r7
+ b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 6f
+ // LR_HAVE_BOTTOM
+ cmp r4, #2
+ // We load at least 2 rows in all cases.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ bgt 53f // 3 rows in total
+ beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+ vmov q13, q12
+ b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vmov q15, q14
+ b 8f
+53:
+ // 3 rows in total, q11 already loaded, load q12 and q13 with content
+ // and 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+
+6:
+ // !LR_HAVE_BOTTOM
+ cmp r4, #2
+ bgt 63f // 3 rows in total
+ beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+ vmov q12, q11
+ vmov q13, q11
+ vmov q14, q11
+ b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+ vld1.16 {q12}, [r2, :128], r7
+ vmov q13, q12
+ vmov q14, q12
+ vmov q15, q12
+ b 8f
+63:
+ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+ vld1.16 {q12}, [r2, :128], r7
+ vld1.16 {q13}, [r2, :128], r7
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+ b 8f
+
+7:
+ // All registers up to q13 are filled already, 3 valid rows left.
+ // < 4 valid rows left; fill in padding and filter the last
+ // few rows.
+ tst r6, #8 // LR_HAVE_BOTTOM
+ beq 71f
+ // LR_HAVE_BOTTOM; load 2 rows of edge.
+ vld1.16 {q14}, [r2, :128], r7
+ vld1.16 {q15}, [r2, :128], r7
+ vmov q1, q15
+ b 8f
+71:
+ // !LR_HAVE_BOTTOM, pad 3 rows
+ vmov q14, q13
+ vmov q15, q13
+ vmov q1, q13
+
+8: // At this point, all registers up to q14-q15,q1 are loaded with
+ // edge/padding (depending on how many rows are left).
+ filter 0 // This branches to 9f when done
+ vmov q14, q15
+ vmov q15, q1
+ b 8b
+
+9: // End of one vertical slice.
+ subs r3, r3, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ mls r0, r1, lr, r0
+ mls r2, r7, r12, r2
+ add r0, r0, #16
+ add r2, r2, #16
+ mov r4, lr
+ b 1b
+
+0:
+ vpop {q4-q5}
+ pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Subtract the number of pixels read from the input from the stride
+ add lr, lr, #8
+ sub r4, r4, lr, lsl #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #4
+ sub r12, r12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #4
+
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]!
+ // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vld1.16 {d13}, [r2]!
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 2x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #4
+ sub r12, r12, #4
+ vext.8 q1, q0, q1, #12
+ vext.8 q0, q2, q0, #12
+ vext.8 q5, q4, q5, #12
+ vext.8 q4, q6, q4, #12
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 2 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #10
+ bge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
+ // again; it's not strictly needed in that case (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in q0/1.h[w] onwards
+ movrel_local lr, right_ext_mask
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr]
+
+ vbit q0, q14, q12
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2
+ vext.8 q10, q4, q5, #2
+ vext.8 q9, q0, q1, #4
+ vext.8 q11, q4, q5, #4
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+
+ vmull.u16 q6, d0, d0
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+ subs r5, r5, #8
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ add r5, r5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add r10, r0, #(4*SUM_STRIDE) // sumsq
+ add r11, r1, #(2*SUM_STRIDE) // sum
+ add r12, r3, r4 // src
+ lsl r4, r4, #1
+ mov r9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add lr, r5, #7
+ bic lr, lr, #7
+ sub r9, r9, lr, lsl #1
+ add lr, lr, #8
+ sub r4, r4, lr, lsl #1
+
+ // Store the width for the vertical loop
+ mov r8, r5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 2f
+ // LR_HAVE_LEFT
+ cmp r2, #0
+ bne 0f
+ // left == NULL
+ sub r3, r3, #6
+ sub r12, r12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add r4, r4, #6
+
+1: // Loop vertically
+ vld1.16 {q0, q1}, [r3]!
+ vld1.16 {q4, q5}, [r12]!
+
+ tst r7, #1 // LR_HAVE_LEFT
+ beq 0f
+ cmp r2, #0
+ beq 2f
+ // LR_HAVE_LEFT, left != NULL
+ vld1.16 {d5}, [r2]!
+ // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vld1.16 {d13}, [r2]!
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+ // and shift q0 to have 3x the first pixel at the front.
+ vdup.16 q2, d0[0]
+ vdup.16 q6, d8[0]
+ // Move r3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub r3, r3, #6
+ sub r12, r12, #6
+ vext.8 q1, q0, q1, #10
+ vext.8 q0, q2, q0, #10
+ vext.8 q5, q4, q5, #10
+ vext.8 q4, q6, q4, #10
+
+2:
+ tst r7, #2 // LR_HAVE_RIGHT
+ bne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub lr, r5, #(2 + 16 - 3 + 1)
+ lsl lr, lr, #1
+ ldrh r11, [r3, lr]
+ ldrh lr, [r12, lr]
+ // Fill q14/q15 with the right padding pixel
+ vdup.16 q14, r11
+ vdup.16 q15, lr
+ // Restore r11 after using it for a temporary value
+ add r11, r1, #(2*SUM_STRIDE)
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp r5, #11
+ bge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel_local lr, right_ext_mask, -2
+ sub lr, lr, r5, lsl #1
+ vld1.8 {q12, q13}, [lr]
+
+ vbit q0, q14, q12
+ vbit q1, q14, q13
+ vbit q4, q15, q12
+ vbit q5, q15, q13
+
+4: // Loop horizontally
+ vext.8 q8, q0, q1, #2
+ vext.8 q10, q4, q5, #2
+ vext.8 q9, q0, q1, #4
+ vext.8 q11, q4, q5, #4
+ vadd.i16 q2, q0, q8
+ vadd.i16 q3, q4, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+
+ vmull.u16 q6, d0, d0
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d18, d18
+ vmull.u16 q12, d8, d8
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d22, d22
+ vmull.u16 q7, d1, d1
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmull.u16 q13, d9, d9
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ vext.8 q8, q0, q1, #6
+ vext.8 q10, q4, q5, #6
+ vext.8 q9, q0, q1, #8
+ vext.8 q11, q4, q5, #8
+ vadd.i16 q2, q2, q8
+ vadd.i16 q3, q3, q10
+ vadd.i16 q2, q2, q9
+ vadd.i16 q3, q3, q11
+
+ vmlal.u16 q6, d16, d16
+ vmlal.u16 q6, d1, d1
+ vmlal.u16 q12, d20, d20
+ vmlal.u16 q12, d9, d9
+ vmlal.u16 q7, d17, d17
+ vmlal.u16 q7, d19, d19
+ vmlal.u16 q13, d21, d21
+ vmlal.u16 q13, d23, d23
+
+ subs r5, r5, #8
+ vst1.16 {q2}, [r1, :128]!
+ vst1.16 {q3}, [r11, :128]!
+ vst1.32 {q6, q7}, [r0, :128]!
+ vst1.32 {q12, q13}, [r10, :128]!
+
+ ble 9f
+ tst r7, #2 // LR_HAVE_RIGHT
+ vmov q0, q1
+ vmov q4, q5
+ vld1.16 {q1}, [r3]!
+ vld1.16 {q5}, [r12]!
+ bne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs r6, r6, #2
+ ble 0f
+ // Jump to the next row and loop horizontally
+ add r0, r0, r9, lsl #1
+ add r10, r10, r9, lsl #1
+ add r1, r1, r9
+ add r11, r11, r9
+ add r3, r3, r4
+ add r12, r12, r4
+ mov r5, r8
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+sgr_funcs 16
diff --git a/third_party/dav1d/src/arm/32/looprestoration_common.S b/third_party/dav1d/src/arm/32/looprestoration_common.S
new file mode 100644
index 0000000000..b080bb5115
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration_common.S
@@ -0,0 +1,453 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
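+//
+// This completes the 3x3 box sums by adding three vertically adjacent rows
+// of the horizontal sums, in place in the same sumsq/sum buffers, starting
+// one row above the first input row; missing top/bottom rows are duplicated
+// when LR_HAVE_TOP/LR_HAVE_BOTTOM are not set. Roughly, per element:
+//   sumsq_out[y][x] = sumsq[y - 1][x] + sumsq[y][x] + sumsq[y + 1][x];
+//   sum_out[y][x]   = sum[y - 1][x]   + sum[y][x]   + sum[y + 1][x];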
+function sgr_box3_v_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #2 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 1f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Sum all h+2 lines with the main loop
+ add lr, lr, #2
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q8-q13 and q0-q2 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q8, q9}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q1}, [r6, :128], r8
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q10, q8
+ vmov q11, q9
+ vmov q1, q0
+ vmov q12, q8
+ vmov q13, q9
+ vmov q2, q0
+
+3:
+ subs r3, r3, #1
+.macro add3
+ vadd.i32 q8, q8, q10
+ vadd.i32 q9, q9, q11
+ vadd.i16 q0, q0, q1
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i16 q0, q0, q2
+ vst1.32 {q8, q9}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ vmov q10, q12
+ vmov q11, q13
+ vmov q1, q2
+ ble 4f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ b 3b
+
+4:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ vmov q8, q10
+ vmov q9, q11
+ vmov q0, q1
+ add3
+
+5: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ pop {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+ push {r4-r9,lr}
+ vpush {q5-q7}
+ ldr r4, [sp, #76]
+ add r12, r3, #2 // Number of output rows to move back
+ mov lr, r3 // Number of input rows to move back
+ add r2, r2, #8 // Actual summed width
+ mov r7, #(4*SUM_STRIDE) // sumsq stride
+ mov r8, #(2*SUM_STRIDE) // sum stride
+ sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub r1, r1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst r4, #4 // LR_HAVE_TOP
+ beq 0f
+ // If have top, read from row -2.
+ sub r5, r0, #(4*SUM_STRIDE)
+ sub r6, r1, #(2*SUM_STRIDE)
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add r5, r0, #(4*SUM_STRIDE)
+ add r6, r1, #(2*SUM_STRIDE)
+1:
+
+ tst r4, #8 // LR_HAVE_BOTTOM
+ beq 0f
+ // LR_HAVE_BOTTOM
+ add r3, r3, #2 // Handle h+2 lines with the main loop
+ add lr, lr, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub r3, r3, #1 // Handle h-1 lines with the main loop
+1:
+ mov r9, r3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into q6-q15 and q0-q3,q5 taking top
+ // padding into consideration.
+ tst r4, #4 // LR_HAVE_TOP
+ vld1.32 {q6, q7}, [r5, :128], r7
+ vld1.16 {q0}, [r6, :128], r8
+ beq 2f
+ // LR_HAVE_TOP
+ vld1.32 {q10, q11}, [r5, :128], r7
+ vld1.16 {q2}, [r6, :128], r8
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ b 3f
+2: // !LR_HAVE_TOP
+ vmov q8, q6
+ vmov q9, q7
+ vmov q1, q0
+ vmov q10, q6
+ vmov q11, q7
+ vmov q2, q0
+ vmov q12, q6
+ vmov q13, q7
+ vmov q3, q0
+
+3:
+ cmp r3, #0
+ beq 4f
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+
+3:
+ // Start of vertical loop
+ subs r3, r3, #2
+.macro add5
+ vadd.i32 q6, q6, q8
+ vadd.i32 q7, q7, q9
+ vadd.i16 q0, q0, q1
+ vadd.i32 q6, q6, q10
+ vadd.i32 q7, q7, q11
+ vadd.i16 q0, q0, q2
+ vadd.i32 q6, q6, q12
+ vadd.i32 q7, q7, q13
+ vadd.i16 q0, q0, q3
+ vadd.i32 q6, q6, q14
+ vadd.i32 q7, q7, q15
+ vadd.i16 q0, q0, q5
+ vst1.32 {q6, q7}, [r0, :128], r7
+ vst1.16 {q0}, [r1, :128], r8
+.endm
+ add5
+.macro shift2
+ vmov q6, q10
+ vmov q7, q11
+ vmov q0, q2
+ vmov q8, q12
+ vmov q9, q13
+ vmov q1, q3
+ vmov q10, q14
+ vmov q11, q15
+ vmov q2, q5
+.endm
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ ble 5f
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ vld1.32 {q14, q15}, [r5, :128], r7
+ vld1.16 {q5}, [r6, :128], r8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ add5
+ b 6f
+
+5:
+ tst r4, #8 // LR_HAVE_BOTTOM
+ bne 6f
+ // !LR_HAVE_BOTTOM
+ cmp r3, #0
+ bne 5f
+ // The intended three edge rows left; output the one at h-2 and
+ // the past edge one at h.
+ vld1.32 {q12, q13}, [r5, :128], r7
+ vld1.16 {q3}, [r6, :128], r8
+ // Pad the past-edge row from the last content row.
+ vmov q14, q12
+ vmov q15, q13
+ vmov q5, q3
+ add5
+ shift2
+ add r0, r0, r7
+ add r1, r1, r8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // r3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ vmov q12, q10
+ vmov q13, q11
+ vmov q3, q2
+ vmov q14, q10
+ vmov q15, q11
+ vmov q5, q2
+ add5
+ add r0, r0, r7
+ add r1, r1, r8
+ b 6f
+
+6: // End of one vertical slice.
+ subs r2, r2, #8
+ ble 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ mls r5, r7, lr, r5
+ mls r6, r8, lr, r6
+ // Output pointers
+ mls r0, r7, r12, r0
+ mls r1, r8, r12, r1
+ add r0, r0, #32
+ add r1, r1, #16
+ add r5, r5, #32
+ add r6, r6, #16
+ mov r3, r9
+ b 1b
+
+0:
+ vpop {q5-q7}
+ pop {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
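+//
+// Approximate per-pixel sketch of the shared tail (sgr_calc_ab_neon) below,
+// reconstructed from its inline comments; the exact rounding is only
+// approximated here:
+//   a >>= 2*bitdepth_min_8,  b >>= bitdepth_min_8       (rounding shifts)
+//   p = max(a*n - b*b, 0)                  n = 9 (ab1) or 25 (ab2)
+//   z = min(round((p*s) >> 20), 255)       s = strength
+//   x = sgr_x_by_x[z]                      via the vtbl/vcgt sequence
+//   a_out = (x * b * one_by_x + (1 << 11)) >> 12    one_by_x = 455 (ab1) / 164 (ab2)
+//   b_out = 256 - x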
+function sgr_calc_ab1_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84]
+ add r3, r3, #2 // h += 2
+ clz r6, r5
+ vmov.i32 q15, #9 // n
+ movw r5, #455
+ mov lr, #SUM_STRIDE
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ push {r4-r7,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #84]
+ add r3, r3, #3 // h += 3
+ clz r6, r5
+ asr r3, r3, #1 // h /= 2
+ vmov.i32 q15, #25 // n
+ mov r5, #164
+ mov lr, #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+ movrel r12, X(sgr_x_by_x)
+ sub r6, r6, #24 // -bitdepth_min_8
+ vld1.8 {q8, q9}, [r12, :128]!
+ add r7, r6, r6 // -2*bitdepth_min_8
+ vmov.i8 q11, #5
+ vmov.i8 d10, #55 // idx of last 5
+ vld1.8 {q10}, [r12, :128]
+ vmov.i8 d11, #72 // idx of last 4
+ vmov.i8 d12, #101 // idx of last 3
+ vmov.i8 d13, #169 // idx of last 2
+ vmov.i8 d14, #254 // idx of last 1
+ vmov.i8 d15, #32 // elements consumed in first vtbl
+ add r2, r2, #2 // w += 2
+ add r12, r2, #7
+ bic r12, r12, #7 // aligned w
+ sub r12, lr, r12 // increment between rows
+ vdup.32 q12, r4
+ sub r0, r0, #(4*(SUM_STRIDE))
+ sub r1, r1, #(2*(SUM_STRIDE))
+ mov r4, r2 // backup of w
+ vsub.i8 q8, q8, q11
+ vsub.i8 q9, q9, q11
+ vsub.i8 q10, q10, q11
+1:
+ vld1.32 {q0, q1}, [r0, :128] // a
+ vld1.16 {q2}, [r1, :128] // b
+ vdup.32 q13, r7 // -2*bitdepth_min_8
+ vdup.16 q14, r6 // -bitdepth_min_8
+ subs r2, r2, #8
+ vrshl.s32 q0, q0, q13
+ vrshl.s32 q1, q1, q13
+ vrshl.s16 q4, q2, q14
+ vmul.i32 q0, q0, q15 // a * n
+ vmul.i32 q1, q1, q15 // a * n
+ vmull.u16 q3, d8, d8 // b * b
+ vmull.u16 q4, d9, d9 // b * b
+ vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0)
+ vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0)
+ vmul.i32 q0, q0, q12 // p * s
+ vmul.i32 q1, q1, q12 // p * s
+ vqshrn.u32 d0, q0, #16
+ vqshrn.u32 d1, q1, #16
+ vqrshrn.u16 d0, q0, #4 // imin(z, 255)
+
+ vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5
+ vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4
+ vtbl.8 d1, {q8, q9}, d0
+ vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3
+ vsub.i8 d9, d0, d15 // indices for vtbx
+ vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2
+ vadd.i8 d2, d2, d3
+ vtbx.8 d1, {q10}, d9
+ vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1
+ vadd.i8 d6, d6, d7
+ vadd.i8 d8, d8, d22
+ vadd.i8 d2, d2, d6
+ vadd.i8 d1, d1, d8
+ vadd.i8 d1, d1, d2
+ vmovl.u8 q0, d1 // x
+
+ vmov.i16 q13, #256
+ vdup.32 q14, r5 // one_by_x
+
+ vmull.u16 q1, d0, d4 // x * BB[i]
+ vmull.u16 q2, d1, d5 // x * BB[i]
+ vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x
+ vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x
+ vrshr.s32 q1, q1, #12 // AA[i]
+ vrshr.s32 q2, q2, #12 // AA[i]
+ vsub.i16 q0, q13, q0 // 256 - x
+
+ vst1.32 {q1, q2}, [r0, :128]!
+ vst1.16 {q0}, [r1, :128]!
+ bgt 1b
+
+ subs r3, r3, #1
+ ble 0f
+ add r0, r0, r12, lsl #2
+ add r1, r1, r12, lsl #1
+ mov r2, r4
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r7,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/looprestoration_tmpl.S b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S
new file mode 100644
index 0000000000..8a9940bb3a
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration_tmpl.S
@@ -0,0 +1,600 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
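+//
+// Loose scalar sketch of the 3x3 weighting below, taken from the neighbour
+// comments in the loop body (B = the 16-bit plane, A = the 32-bit plane):
+//   a = 4*(B[ 0, 0] + B[-1, 0] + B[+1, 0] + B[ 0,-1] + B[ 0,+1])
+//     + 3*(B[-1,-1] + B[+1,-1] + B[-1,+1] + B[+1,+1])
+//   b = the same weighting applied to A
+//   tmp[x] = (b + a*src[x] + (1 << 8)) >> 9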
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ sub r7, r3, #(4*SUM_STRIDE)
+ add r8, r3, #(4*SUM_STRIDE)
+ sub r9, r4, #(2*SUM_STRIDE)
+ add r10, r4, #(2*SUM_STRIDE)
+ mov r11, #SUM_STRIDE
+ mov r12, #FILTER_OUT_STRIDE
+ add lr, r5, #3
+ bic lr, lr, #3 // Aligned width
+.if \bpc == 8
+ sub r2, r2, lr
+.else
+ sub r2, r2, lr, lsl #1
+.endif
+ sub r12, r12, lr
+ sub r11, r11, lr
+ sub r11, r11, #4 // We read 4 extra elements from both a and b
+ mov lr, r5
+ vmov.i16 q14, #3
+ vmov.i32 q15, #3
+1:
+ vld1.16 {q0}, [r9, :128]!
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r10, :128]!
+ vld1.32 {q8, q9}, [r7, :128]!
+ vld1.32 {q10, q11}, [r3, :128]!
+ vld1.32 {q12, q13}, [r8, :128]!
+
+2:
+ subs r5, r5, #4
+ vext.8 d6, d0, d1, #2 // -stride
+ vext.8 d7, d2, d3, #2 // 0
+ vext.8 d8, d4, d5, #2 // +stride
+ vext.8 d9, d0, d1, #4 // +1-stride
+ vext.8 d10, d2, d3, #4 // +1
+ vext.8 d11, d4, d5, #4 // +1+stride
+ vadd.i16 d2, d2, d6 // -1, -stride
+ vadd.i16 d7, d7, d8 // 0, +stride
+ vadd.i16 d0, d0, d9 // -1-stride, +1-stride
+ vadd.i16 d2, d2, d7
+ vadd.i16 d4, d4, d11 // -1+stride, +1+stride
+ vadd.i16 d2, d2, d10 // +1
+ vadd.i16 d0, d0, d4
+
+ vext.8 q3, q8, q9, #4 // -stride
+ vshl.i16 d2, d2, #2
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q10, q11, #4 // 0
+ vext.8 q6, q10, q11, #8 // +1
+ vmla.i16 d2, d0, d28 // * 3 -> a
+ vadd.i32 q3, q3, q10 // -stride, -1
+ vadd.i32 q8, q8, q4 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q6 // 0, +1
+ vadd.i32 q8, q8, q12 // -1+stride
+ vadd.i32 q3, q3, q5
+ vext.8 q7, q12, q13, #4 // +stride
+ vext.8 q10, q12, q13, #8 // +1+stride
+.if \bpc == 8
+ vld1.32 {d24[0]}, [r1, :32]! // src
+.else
+ vld1.16 {d24}, [r1, :64]! // src
+.endif
+ vadd.i32 q3, q3, q7 // +stride
+ vadd.i32 q8, q8, q10 // +1+stride
+ vshl.i32 q3, q3, #2
+ vmla.i32 q3, q8, q15 // * 3 -> b
+.if \bpc == 8
+ vmovl.u8 q12, d24 // src
+.endif
+ vmov d0, d1
+ vmlal.u16 q3, d2, d24 // b + a * src
+ vmov d2, d3
+ vrshrn.i32 d6, q3, #9
+ vmov d4, d5
+ vst1.16 {d6}, [r0]!
+
+ ble 3f
+ vmov q8, q9
+ vmov q10, q11
+ vmov q12, q13
+ vld1.16 {d1}, [r9, :64]!
+ vld1.16 {d3}, [r4, :64]!
+ vld1.16 {d5}, [r10, :64]!
+ vld1.32 {q9}, [r7, :128]!
+ vld1.32 {q11}, [r3, :128]!
+ vld1.32 {q13}, [r8, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r12, lsl #1
+ add r1, r1, r2
+ add r3, r3, r11, lsl #2
+ add r7, r7, r11, lsl #2
+ add r8, r8, r11, lsl #2
+ add r4, r4, r11, lsl #1
+ add r9, r9, r11, lsl #1
+ add r10, r10, r11, lsl #1
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
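+//
+// Loose scalar sketch of the 5x5 weighting below, from the inline neighbour
+// comments. Full rows (every second output line):
+//   a = 6*(B[0,-1] + B[0,+1]) + 5*(B[-1,-1] + B[+1,-1] + B[-1,+1] + B[+1,+1])
+//   b = the same weighting applied to the 32-bit A plane
+//   tmp[x] = (b + a*src[x] + (1 << 8)) >> 9
+// In-between rows (the loop at label 4):
+//   a = 6*B[0,0] + 5*(B[-1,0] + B[+1,0]),  likewise for b
+//   tmp[x] = (b + a*src[x] + (1 << 7)) >> 8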
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ add r7, r3, #(4*(SUM_STRIDE))
+ sub r3, r3, #(4*(SUM_STRIDE))
+ add r8, r4, #(2*(SUM_STRIDE))
+ sub r4, r4, #(2*(SUM_STRIDE))
+ mov r9, #(2*SUM_STRIDE)
+ mov r10, #FILTER_OUT_STRIDE
+ add r11, r5, #7
+ bic r11, r11, #7 // Aligned width
+.if \bpc == 8
+ sub r2, r2, r11
+.else
+ sub r2, r2, r11, lsl #1
+.endif
+ sub r10, r10, r11
+ sub r9, r9, r11
+ sub r9, r9, #4 // We read 4 extra elements from a
+ sub r12, r9, #4 // We read 8 extra elements from b
+ mov lr, r5
+
+1:
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.16 {q2, q3}, [r8, :128]!
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.32 {q11, q12}, [r7, :128]!
+ vld1.32 {q10}, [r3, :128]!
+ vld1.32 {q13}, [r7, :128]!
+
+2:
+ vmov.i16 q14, #5
+ vmov.i16 q15, #6
+ subs r5, r5, #8
+ vext.8 q4, q0, q1, #4 // +1-stride
+ vext.8 q5, q2, q3, #4 // +1+stride
+ vext.8 q6, q0, q1, #2 // -stride
+ vext.8 q7, q2, q3, #2 // +stride
+ vadd.i16 q0, q0, q4 // -1-stride, +1-stride
+ vadd.i16 q5, q2, q5 // -1+stride, +1+stride
+ vadd.i16 q2, q6, q7 // -stride, +stride
+ vadd.i16 q0, q0, q5
+
+ vext.8 q4, q8, q9, #8 // +1-stride
+ vext.8 q5, q9, q10, #8
+ vext.8 q6, q11, q12, #8 // +1+stride
+ vext.8 q7, q12, q13, #8
+ vmul.i16 q0, q0, q14 // * 5
+ vmla.i16 q0, q2, q15 // * 6
+ vadd.i32 q4, q4, q8 // -1-stride, +1-stride
+ vadd.i32 q5, q5, q9
+ vadd.i32 q6, q6, q11 // -1+stride, +1+stride
+ vadd.i32 q7, q7, q12
+ vadd.i32 q4, q4, q6
+ vadd.i32 q5, q5, q7
+ vext.8 q6, q8, q9, #4 // -stride
+ vext.8 q7, q9, q10, #4
+ vext.8 q8, q11, q12, #4 // +stride
+ vext.8 q11, q12, q13, #4
+
+.if \bpc == 8
+ vld1.8 {d4}, [r1, :64]!
+.else
+ vld1.8 {q2}, [r1, :128]!
+.endif
+
+ vmov.i32 q14, #5
+ vmov.i32 q15, #6
+
+ vadd.i32 q6, q6, q8 // -stride, +stride
+ vadd.i32 q7, q7, q11
+ vmul.i32 q4, q4, q14 // * 5
+ vmla.i32 q4, q6, q15 // * 6
+ vmul.i32 q5, q5, q14 // * 5
+ vmla.i32 q5, q7, q15 // * 6
+
+.if \bpc == 8
+ vmovl.u8 q2, d4
+.endif
+ vmlal.u16 q4, d0, d4 // b + a * src
+ vmlal.u16 q5, d1, d5 // b + a * src
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #9
+ vrshrn.i32 d9, q5, #9
+ vmov q2, q3
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 3f
+ vmov q8, q10
+ vmov q11, q13
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q3}, [r8, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ vld1.32 {q12, q13}, [r7, :128]!
+ b 2b
+
+3:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ add r3, r3, r9, lsl #2
+ add r7, r7, r9, lsl #2
+ add r4, r4, r12, lsl #1
+ add r8, r8, r12, lsl #1
+
+ vld1.32 {q8, q9}, [r3, :128]!
+ vld1.16 {q0, q1}, [r4, :128]!
+ vld1.32 {q10}, [r3, :128]!
+
+ vmov.i16 q12, #5
+ vmov.i16 q13, #6
+
+4:
+ subs r5, r5, #8
+ vext.8 q3, q0, q1, #4 // +1
+ vext.8 q2, q0, q1, #2 // 0
+ vadd.i16 q0, q0, q3 // -1, +1
+
+ vext.8 q4, q8, q9, #4 // 0
+ vext.8 q5, q9, q10, #4
+ vext.8 q6, q8, q9, #8 // +1
+ vext.8 q7, q9, q10, #8
+ vmul.i16 q2, q2, q13 // * 6
+ vmla.i16 q2, q0, q12 // * 5 -> a
+.if \bpc == 8
+ vld1.8 {d22}, [r1, :64]!
+.else
+ vld1.16 {q11}, [r1, :128]!
+.endif
+ vadd.i32 q8, q8, q6 // -1, +1
+ vadd.i32 q9, q9, q7
+.if \bpc == 8
+ vmovl.u8 q11, d22
+.endif
+ vmul.i32 q4, q4, q15 // * 6
+ vmla.i32 q4, q8, q14 // * 5 -> b
+ vmul.i32 q5, q5, q15 // * 6
+ vmla.i32 q5, q9, q14 // * 5 -> b
+
+ vmlal.u16 q4, d4, d22 // b + a * src
+ vmlal.u16 q5, d5, d23
+ vmov q0, q1
+ vrshrn.i32 d8, q4, #8
+ vrshrn.i32 d9, q5, #8
+ vmov q8, q10
+ vst1.16 {q4}, [r0, :128]!
+
+ ble 5f
+ vld1.16 {q1}, [r4, :128]!
+ vld1.32 {q9, q10}, [r3, :128]!
+ b 4b
+
+5:
+ subs r6, r6, #1
+ ble 0f
+ mov r5, lr
+ sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started
+ sub r4, r4, r11, lsl #1
+ add r0, r0, r10, lsl #1
+ add r1, r1, r2
+ sub r3, r3, #16
+ sub r4, r4, #16
+ b 1b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt, const int bitdepth_max);
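+//
+// Per-pixel sketch, following the inline comments below (the 16 bpc variant
+// additionally clamps to bitdepth_max):
+//   u      = src[x] << 4
+//   v      = (u << 7) + wt * (t1[x] - u)
+//   dst[x] = clip((v + (1 << 10)) >> 11)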
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28]
+ ldrd r6, r7, [sp, #36]
+.if \bpc == 16
+ ldr r8, [sp, #44]
+.endif
+ vdup.16 d31, r7
+ cmp r6, #2
+.if \bpc == 16
+ vdup.16 q14, r8
+.endif
+ add r9, r0, r1
+ add r12, r2, r3
+ add lr, r4, #2*FILTER_OUT_STRIDE
+ mov r7, #(4*FILTER_OUT_STRIDE)
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r8, r5, #7
+ bic r8, r8, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r8
+ sub r3, r3, r8
+.else
+ sub r1, r1, r8, lsl #1
+ sub r3, r3, r8, lsl #1
+.endif
+ sub r7, r7, r8, lsl #1
+ mov r8, r5
+ blt 2f
+1:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r12, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r12, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [lr, :128]!
+ subs r5, r5, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vshll.u16 q10, d16, #7 // u << 7
+ vshll.u16 q11, d17, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+ vmlal.s16 q10, d18, d31 // v
+ vmlal.s16 q11, d19, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vrshrn.i32 d20, q10, #11
+ vrshrn.i32 d21, q11, #11
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d20, q10
+ vst1.8 {d4}, [r0, :64]!
+ vst1.8 {d20}, [r9, :64]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vqrshrun.s32 d20, q10, #11
+ vqrshrun.s32 d21, q11, #11
+ vmin.u16 q2, q2, q14
+ vmin.u16 q10, q10, q14
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q10}, [r9, :128]!
+.endif
+ bgt 1b
+
+ sub r6, r6, #2
+ cmp r6, #1
+ blt 0f
+ mov r5, r8
+ add r0, r0, r1
+ add r9, r9, r1
+ add r2, r2, r3
+ add r12, r12, r3
+ add r4, r4, r7
+ add lr, lr, r7
+ beq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ subs r5, r5, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vshll.u16 q2, d0, #7 // u << 7
+ vshll.u16 q3, d1, #7 // u << 7
+ vmlal.s16 q2, d2, d31 // v
+ vmlal.s16 q3, d3, d31 // v
+.if \bpc == 8
+ vrshrn.i32 d4, q2, #11
+ vrshrn.i32 d5, q3, #11
+ vqmovun.s16 d2, q2
+ vst1.8 {d2}, [r0, :64]!
+.else
+ vqrshrun.s32 d4, q2, #11
+ vqrshrun.s32 d5, q3, #11
+ vmin.u16 q2, q2, q14
+ vst1.16 {q2}, [r0, :128]!
+.endif
+ bgt 2b
+0:
+ pop {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
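+//
+// Per-pixel sketch, following the inline comments below:
+//   u      = src[x] << 4
+//   v      = (u << 7) + wt[0]*(t1[x] - u) + wt[1]*(t2[x] - u)
+//   dst[x] = clip((v + (1 << 10)) >> 11)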
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.if \bpc == 8
+ ldr r8, [sp, #52]
+.else
+ ldrd r8, r9, [sp, #52]
+.endif
+ cmp r7, #2
+ add r10, r0, r1
+ add r11, r2, r3
+ add r12, r4, #2*FILTER_OUT_STRIDE
+ add lr, r5, #2*FILTER_OUT_STRIDE
+ vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1]
+.if \bpc == 16
+ vdup.16 q14, r9
+.endif
+ mov r8, #4*FILTER_OUT_STRIDE
+ lsl r1, r1, #1
+ lsl r3, r3, #1
+ add r9, r6, #7
+ bic r9, r9, #7 // Aligned width
+.if \bpc == 8
+ sub r1, r1, r9
+ sub r3, r3, r9
+.else
+ sub r1, r1, r9, lsl #1
+ sub r3, r3, r9, lsl #1
+.endif
+ sub r8, r8, r9, lsl #1
+ mov r9, r6
+ blt 2f
+1:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+ vld1.8 {d16}, [r11, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+ vld1.16 {q8}, [r11, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q9}, [r12, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ vld1.16 {q10}, [lr, :128]!
+ subs r6, r6, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+ vshll.u8 q8, d16, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+ vshl.i16 q8, q8, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vsub.i16 q9, q9, q8 // t1 - u
+ vsub.i16 q10, q10, q8 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vshll.u16 q11, d16, #7 // u << 7
+ vshll.u16 q8, d17, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vrshrn.i32 d22, q11, #11
+ vrshrn.i32 d23, q8, #11
+ vqmovun.s16 d6, q3
+ vqmovun.s16 d22, q11
+ vst1.8 {d6}, [r0, :64]!
+ vst1.8 {d22}, [r10, :64]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vqrshrun.s32 d22, q11, #11
+ vqrshrun.s32 d23, q8, #11
+ vmin.u16 q3, q3, q14
+ vmin.u16 q11, q11, q14
+ vst1.16 {q3}, [r0, :128]!
+ vst1.16 {q11}, [r10, :128]!
+.endif
+ bgt 1b
+
+ subs r7, r7, #2
+ cmp r7, #1
+ blt 0f
+ mov r6, r9
+ add r0, r0, r1
+ add r10, r10, r1
+ add r2, r2, r3
+ add r11, r11, r3
+ add r4, r4, r8
+ add r12, r12, r8
+ add r5, r5, r8
+ add lr, lr, r8
+ beq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ vld1.8 {d0}, [r2, :64]!
+.else
+ vld1.16 {q0}, [r2, :128]!
+.endif
+ vld1.16 {q1}, [r4, :128]!
+ vld1.16 {q2}, [r5, :128]!
+ subs r6, r6, #8
+.if \bpc == 8
+ vshll.u8 q0, d0, #4 // u
+.else
+ vshl.i16 q0, q0, #4 // u
+.endif
+ vsub.i16 q1, q1, q0 // t1 - u
+ vsub.i16 q2, q2, q0 // t2 - u
+ vshll.u16 q3, d0, #7 // u << 7
+ vshll.u16 q0, d1, #7 // u << 7
+ vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u)
+ vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u)
+ vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u)
+.if \bpc == 8
+ vrshrn.i32 d6, q3, #11
+ vrshrn.i32 d7, q0, #11
+ vqmovun.s16 d6, q3
+ vst1.8 {d6}, [r0, :64]!
+.else
+ vqrshrun.s32 d6, q3, #11
+ vqrshrun.s32 d7, q0, #11
+ vmin.u16 q3, q3, q14
+ vst1.16 {q3}, [r0, :128]!
+.endif
+ bgt 2b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/32/mc.S b/third_party/dav1d/src/arm/32/mc.S
new file mode 100644
index 0000000000..1b60a7bdb3
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/mc.S
@@ -0,0 +1,3340 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vadd.i16 \t0, \t0, \t2
+ vadd.i16 \t1, \t1, \t3
+ vqrshrun.s16 \dst0, \t0, #5
+ vqrshrun.s16 \dst1, \t1, #5
+.endm
+
+.macro w_avg dst0, dst1, t0, t1, t2, t3
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vsub.i16 \t0, \t2, \t0
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q15
+ vqdmulh.s16 \t1, \t1, q15
+ vadd.i16 \t0, \t2, \t0
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
+
+.macro mask dst0, dst1, t0, t1, t2, t3
+ vld1.8 {q14}, [lr, :128]!
+ vld1.16 {\t0,\t1}, [r2, :128]!
+ vmul.i8 q14, q14, q15
+ vld1.16 {\t2,\t3}, [r3, :128]!
+ vshll.i8 q13, d28, #8
+ vshll.i8 q14, d29, #8
+ vsub.i16 \t0, \t2, \t0
+ vsub.i16 \t1, \t3, \t1
+ vqdmulh.s16 \t0, \t0, q13
+ vqdmulh.s16 \t1, \t1, q14
+ vadd.i16 \t0, \t2, \t0
+ vadd.i16 \t1, \t3, \t1
+ vqrshrun.s16 \dst0, \t0, #4
+ vqrshrun.s16 \dst1, \t1, #4
+.endm
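+
+// Roughly, per pixel (the vqdmulh trick hides the exact rounding, so the
+// w_avg/mask lines are only approximations; "clip" is the final unsigned
+// saturating narrow):
+//   avg:   dst = clip((t1 + t2 + 16) >> 5)
+//   w_avg: dst ~= clip((t1*w + t2*(16 - w) + 128) >> 8)     w = weight
+//   mask:  dst ~= clip((t1*m + t2*(64 - m) + 512) >> 10)    m = *mask++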
+
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+ push {r4-r6,lr}
+ ldrd r4, r5, [sp, #16]
+ clz r4, r4
+.ifnc \type, avg
+ ldr lr, [sp, #24]
+.endif
+.ifc \type, w_avg
+ vdup.s16 q15, lr
+ vneg.s16 q15, q15
+ vshl.i16 q15, q15, #11
+.endif
+.ifc \type, mask
+ vmov.i8 q15, #256-2
+.endif
+ adr r12, L(\type\()_tbl)
+ sub r4, r4, #24
+ ldr r4, [r12, r4, lsl #2]
+ \type d16, d17, q0, q1, q2, q3
+ add r12, r12, r4
+ bx r12
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(\type\()_tbl) + CONFIG_THUMB
+
+4:
+ add r6, r0, r1
+ lsl r1, r1, #1
+ cmp r5, #4
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ beq 0f
+ \type d18, d19, q0, q1, q2, q3
+ cmp r5, #8
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ beq 0f
+ \type d16, d17, q0, q1, q2, q3
+ vst1.32 {d16[0]}, [r0, :32], r1
+ vst1.32 {d16[1]}, [r6, :32], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.32 {d17[0]}, [r0, :32], r1
+ vst1.32 {d17[1]}, [r6, :32], r1
+ vst1.32 {d18[0]}, [r0, :32], r1
+ vst1.32 {d18[1]}, [r6, :32], r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d19[1]}, [r6, :32], r1
+ pop {r4-r6,pc}
+80:
+ add r6, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.8 {d16}, [r0, :64], r1
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {d17}, [r6, :64], r1
+ vst1.8 {d18}, [r0, :64], r1
+ subs r5, r5, #4
+ vst1.8 {d19}, [r6, :64], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 8b
+160:
+ add r6, r0, r1
+ lsl r1, r1, #1
+16:
+ \type d18, d19, q0, q1, q2, q3
+ vst1.8 {q8}, [r0, :128], r1
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q9}, [r6, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q10}, [r0, :128], r1
+ subs r5, r5, #4
+ vst1.8 {q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 16b
+320:
+ add r6, r0, r1
+ lsl r1, r1, #1
+32:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 32b
+640:
+ add r6, r0, #32
+64:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r6, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128], r1
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #2
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 64b
+1280:
+ sub r1, r1, #32
+ add r6, r0, #64
+128:
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ \type d22, d23, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r0, :128]!
+ \type d16, d17, q0, q1, q2, q3
+ vst1.8 {q10, q11}, [r0, :128], r1
+ \type d18, d19, q0, q1, q2, q3
+ \type d20, d21, q0, q1, q2, q3
+ vst1.8 {q8, q9}, [r6, :128]!
+ \type d22, d23, q0, q1, q2, q3
+ subs r5, r5, #1
+ vst1.8 {q10, q11}, [r6, :128], r1
+ ble 0f
+ \type d16, d17, q0, q1, q2, q3
+ b 128b
+
+0:
+ pop {r4-r6,pc}
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
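+// Rough per-pixel sketch of the w_mask variants generated below, pieced
+// together from the inline comments in the loop bodies (rounding only
+// approximate; "clip" is the final unsigned saturating narrow):
+//   64 - m = (6903 - abs(t1 - t2)) >> 8, saturating at 0
+//   dst    = clip(((((t2 - t1) * ((64 - m) << 9)) >> 15) + t1 + 8) >> 4)
+// The stored mask is m itself for 444; for 422/420 it is built from
+// horizontally / 2x2 summed (64 - m) terms and the sign argument, as the
+// inline comments spell out.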
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+ push {r4-r9,lr}
+ ldrd r4, r5, [sp, #28]
+ ldrd r6, r7, [sp, #36]
+ clz r8, r4
+ adr r9, L(w_mask_\type\()_tbl)
+ sub r8, r8, #24
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ movw r12, #6903
+ vdup.16 q14, r12
+.if \type == 444
+ vmov.i8 q15, #64
+.elseif \type == 422
+ vdup.8 d0, r7 // d0[] <- sign
+ vmov.i8 d30, #129
+ vsub.i8 d30, d30, d0 // 129 - sign
+.elseif \type == 420
+ vdup.16 q0, r7 // d0[] <- sign
+ vmov.i16 q15, #256
+ vsub.i16 q15, q15, q0 // 256 - sign
+.endif
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r9
+
+ .align 2
+L(w_mask_\type\()_tbl):
+ .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+
+4:
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once)
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once)
+ subs r5, r5, #4
+ vsub.i16 q8, q2, q0 // tmp2-tmp1
+ vsub.i16 q9, q3, q1
+ vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x]))
+ vabd.s16 q11, q1, q3
+ vqsub.u16 q10, q14, q10 // 6903 - abs ()
+ vqsub.u16 q11, q14, q11
+ vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8
+ vshr.s16 q11, q11, #8
+ vshl.s16 q12, q10, #9 // (64-m)<<9
+ vshl.s16 q13, q11, #9
+ vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15
+ vqdmulh.s16 q13, q13, q9
+ vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1
+ vadd.i16 q13, q13, q1
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - m
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // m
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vmovn.s16 d6, q10
+ vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.8 {d6}, [r6, :64]!
+.elseif \type == 420
+ vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.s16 d21, d22, d23
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d20[0]}, [r6, :32]!
+.endif
+ vst1.32 {d24[0]}, [r0, :32], r1
+ vst1.32 {d24[1]}, [r12, :32], r1
+ vst1.32 {d25[0]}, [r0, :32], r1
+ vst1.32 {d25[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r9,pc}
+8:
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1, tmp2y2
+ subs r5, r5, #2
+ vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1
+ vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2
+ vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1)
+ vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2)
+ vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
+ vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2)
+ vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
+ vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8
+ vshl.s16 q12, q10, #9 // (64 - my1) << 9
+ vshl.s16 q13, q11, #9 // (64 - my2) << 9
+ vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+ vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+ vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+ vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - m
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // m
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
+ vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2)
+ vmovn.s16 d20, q10
+ vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
+ vst1.8 {d20}, [r6, :64]!
+.elseif \type == 420
+ vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d20[0]}, [r6, :32]!
+.endif
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d25}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r9,pc}
+1280:
+640:
+320:
+160:
+ sub r1, r1, r4
+.if \type == 444
+ add lr, r6, r4
+.elseif \type == 422
+ add lr, r6, r4, lsr #1
+.endif
+ add r9, r3, r4, lsl #1
+ add r7, r2, r4, lsl #1
+161:
+ mov r8, r4
+16:
+ vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1
+ vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1
+ vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2
+ subs r8, r8, #16
+ vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1
+ vsub.i16 q3, q3, q1
+ vabs.s16 q10, q2 // abs(tmp2y1 - tmp1y1)
+ vabs.s16 q11, q3
+ vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1)
+ vqsub.u16 q11, q14, q11
+ vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
+ vshr.s16 q11, q11, #8
+ vshl.s16 q12, q10, #9 // (64 - my1) << 9
+ vshl.s16 q13, q11, #9
+ vqdmulh.s16 q12, q12, q2 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
+ vqdmulh.s16 q13, q13, q3
+ vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
+ vadd.i16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2y2
+.if \type == 444
+ vmovn.u16 d20, q10 // 64 - my1
+ vmovn.u16 d21, q11
+ vsub.i8 q10, q15, q10 // my1
+ vst1.8 {d20, d21}, [r6, :128]!
+.elseif \type == 422
+ vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vmovn.s16 d20, q10
+ vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
+ vst1.8 {d20}, [r6, :64]!
+.endif
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+ vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2
+ vsub.i16 q1, q1, q9
+ vst1.16 {d24, d25}, [r0, :128]! // store dsty1
+ vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2)
+ vabs.s16 q3, q1
+ vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2)
+ vqsub.u16 q3, q14, q3
+ vshr.s16 q2, q2, #8 // (6903 - abs(tmp2y2 - tmp1y2)) >> 8
+ vshr.s16 q3, q3, #8
+ vshl.s16 q12, q2, #9 // (64 - my2) << 9
+ vshl.s16 q13, q3, #9
+.if \type == 444
+ vmovn.u16 d4, q2 // 64 - my2
+ vmovn.u16 d5, q3
+ vsub.i8 q2, q15, q2 // my2
+ vst1.8 {d4, d5}, [lr, :128]!
+.elseif \type == 422
+ vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition)
+ vpadd.s16 d5, d6, d7
+ vmovn.s16 d4, q2
+ vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
+ vst1.8 {d4}, [lr, :64]!
+.elseif \type == 420
+ vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.s16 q11, q11, q3
+ vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
+ vpadd.s16 d21, d22, d23
+ vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.8 {d20}, [r6, :64]!
+.endif
+ vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
+ vqdmulh.s16 q13, q13, q1
+ vadd.i16 q12, q12, q8 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
+ vadd.i16 q13, q13, q9
+ vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
+ vqrshrun.s16 d25, q13, #4
+ vst1.16 {d24, d25}, [r12, :128]! // store dsty2
+ bgt 16b
+ subs r5, r5, #2
+ add r2, r2, r4, lsl #1
+ add r3, r3, r4, lsl #1
+ add r7, r7, r4, lsl #1
+ add r9, r9, r4, lsl #1
+.if \type == 444
+ add r6, r6, r4
+ add lr, lr, r4
+.elseif \type == 422
+ add r6, r6, r4, lsr #1
+ add lr, lr, r4, lsr #1
+.endif
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 161b
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
+function blend_8bpc_neon, export=1
+ push {r4-r5,lr}
+ ldrd r4, r5, [sp, #12]
+ clz lr, r3
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
+
+ .align 2
+L(blend_tbl):
+ .word 320f - L(blend_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_tbl) + CONFIG_THUMB
+
+40:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld1.u8 {d2}, [r5, :64]!
+ vld1.u8 {d1}, [r2, :64]!
+ vld1.32 {d0[]}, [r0, :32]
+ subs r4, r4, #2
+ vld1.32 {d0[1]}, [r12, :32]
+ vsub.i8 d3, d22, d2
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d3
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ vmov.i8 d16, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld1.u8 {q1}, [r5, :128]!
+ vld1.u8 {q2}, [r2, :128]!
+ vld1.u8 {d0}, [r0, :64]
+ vsub.i8 d17, d16, d2
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vsub.i8 d18, d16, d3
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d18
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0, :64], r1
+ vst1.u8 {d23}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ vmov.i8 q12, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld1.u8 {q1, q2}, [r5, :128]!
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vsub.i8 q15, q12, q1
+ vld1.u8 {q13}, [r12, :128]
+ vmull.u8 q3, d16, d2
+ vmlal.u8 q3, d0, d30
+ vmull.u8 q14, d17, d3
+ vmlal.u8 q14, d1, d31
+ vsub.i8 q15, q12, q2
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q14, #6
+ vmull.u8 q3, d18, d4
+ vmlal.u8 q3, d26, d30
+ vmull.u8 q14, d19, d5
+ vmlal.u8 q14, d27, d31
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q14, #6
+ vst1.u8 {q10}, [r0, :128], r1
+ vst1.u8 {q11}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+ vmov.i8 q10, #64
+32:
+ vld1.u8 {q2, q3}, [r5, :128]!
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {q0, q1}, [r0, :128]
+ subs r4, r4, #1
+ vsub.i8 q11, q10, q2
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vsub.i8 q11, q10, q3
+ vrshrn.i16 d24, q15, #6
+ vrshrn.i16 d25, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d22
+ vmull.u8 q14, d19, d7
+ vmlal.u8 q14, d3, d23
+ vrshrn.i16 d26, q15, #6
+ vrshrn.i16 d27, q14, #6
+ vst1.u8 {q12, q13}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_h_8bpc_neon, export=1
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
+ movrel r5, X(obmc_masks)
+ add r5, r5, r4
+ sub r4, r4, r4, lsr #2
+ clz lr, r3
+ adr r12, L(blend_h_tbl)
+ sub lr, lr, #24
+ ldr lr, [r12, lr, lsl #2]
+ add r12, r12, lr
+ bx r12
+
+ .align 2
+L(blend_h_tbl):
+ .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 640f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 320f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_h_tbl) + CONFIG_THUMB
+
+20:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+2:
+ vld1.16 {d2[], d3[]}, [r5, :16]!
+ vld1.32 {d1[]}, [r2, :32]!
+ subs r4, r4, #2
+ vld1.16 {d0[]}, [r0, :16]
+ vzip.8 d2, d3
+ vsub.i8 d4, d22, d2
+ vld1.16 {d0[1]}, [r12, :16]
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d4
+ vrshrn.i16 d20, q8, #6
+ vst1.16 {d20[0]}, [r0, :16], r1
+ vst1.16 {d20[1]}, [r12, :16], r1
+ bgt 2b
+ pop {r4-r5,pc}
+40:
+ vmov.i8 d22, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld2.u8 {d2[], d3[]}, [r5, :16]!
+ vld1.u8 {d1}, [r2, :64]!
+ subs r4, r4, #2
+ vext.u8 d2, d2, d3, #4
+ vld1.32 {d0[]}, [r0, :32]
+ vsub.i8 d6, d22, d2
+ vld1.32 {d0[1]}, [r12, :32]
+ vmull.u8 q8, d1, d2
+ vmlal.u8 q8, d0, d6
+ vrshrn.i16 d20, q8, #6
+ vst1.32 {d20[0]}, [r0, :32], r1
+ vst1.32 {d20[1]}, [r12, :32], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ vmov.i8 q8, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld2.u8 {d2[], d3[]}, [r5, :16]!
+ vld1.u8 {d4, d5}, [r2, :128]!
+ vld1.u8 {d0}, [r0, :64]
+ vsub.i8 q9, q8, q1
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d18
+ vmull.u8 q10, d3, d5
+ vmlal.u8 q10, d1, d19
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.u8 {d22}, [r0, :64], r1
+ vst1.u8 {d23}, [r12, :64], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ vmov.i8 q12, #64
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld2.u8 {d28[], d29[]}, [r5, :16]!
+ vld1.u8 {d2, d3, d4, d5}, [r2, :128]!
+ vsub.i8 q15, q12, q14
+ vld1.u8 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vld1.u8 {q13}, [r12, :128]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d0, d30
+ vmull.u8 q8, d3, d28
+ vmlal.u8 q8, d1, d30
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d4, d29
+ vmlal.u8 q3, d26, d31
+ vmull.u8 q8, d5, d29
+ vmlal.u8 q8, d27, d31
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {q9}, [r0, :128], r1
+ vst1.u8 {q10}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+640:
+1280:
+ vmov.i8 d20, #64
+ sub r1, r1, r3
+321:
+ vld1.u8 {d6[]}, [r5]!
+ vsub.i8 d7, d20, d6
+ mov r12, r3
+32:
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {q0, q1}, [r0, :128]
+ vmull.u8 q15, d16, d6
+ vmlal.u8 q15, d0, d7
+ vmull.u8 q14, d17, d6
+ vmlal.u8 q14, d1, d7
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d7
+ vmull.u8 q14, d19, d6
+ vmlal.u8 q14, d3, d7
+ vrshrn.i16 d2, q15, #6
+ vrshrn.i16 d3, q14, #6
+ subs r12, r12, #32
+ vst1.u8 {q0, q1}, [r0, :128]!
+ bgt 32b
+ add r0, r0, r1
+ subs r4, r4, #1
+ bgt 321b
+ pop {r4-r5,pc}
+endfunc
+
+function blend_v_8bpc_neon, export=1
+ push {r4,lr}
+ ldr r4, [sp, #8]
+ movrel lr, X(obmc_masks)
+ add lr, lr, r3
+ clz r12, r3
+ adr r3, L(blend_v_tbl)
+ sub r12, r12, #26
+ ldr r12, [r3, r12, lsl #2]
+ add r3, r3, r12
+ bx r3
+
+ .align 2
+L(blend_v_tbl):
+ .word 320f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_v_tbl) + CONFIG_THUMB
+
+20:
+ vmov.i8 d22, #64
+ vld1.8 {d2[]}, [lr]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d3, d22, d2
+2:
+ vld1.16 {d1[0]}, [r2, :16]!
+ vld1.8 {d0[]}, [r0]
+ subs r4, r4, #2
+ vld1.8 {d1[1]}, [r2]
+ vld1.8 {d0[1]}, [r12]
+ vmull.u8 q2, d1, d2
+ vmlal.u8 q2, d0, d3
+ vrshrn.i16 d6, q2, #6
+ add r2, r2, #2
+ vst1.8 {d6[0]}, [r0], r1
+ vst1.8 {d6[1]}, [r12], r1
+ bgt 2b
+ pop {r4,pc}
+40:
+ vmov.i8 d22, #64
+ vld1.32 {d4[]}, [lr, :32]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d5, d22, d4
+ sub r1, r1, #2
+4:
+ vld1.u8 {d2}, [r2, :64]!
+ vld1.32 {d0[]}, [r0, :32]
+ vld1.32 {d0[1]}, [r12, :32]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d5
+ vrshrn.i16 d20, q3, #6
+ vst1.16 {d20[0]}, [r0, :16]!
+ vst1.16 {d20[2]}, [r12, :16]!
+ vst1.8 {d20[2]}, [r0], r1
+ vst1.8 {d20[6]}, [r12], r1
+ bgt 4b
+ pop {r4,pc}
+80:
+ vmov.i8 d16, #64
+ vld1.u8 {d2}, [lr, :64]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 d17, d16, d2
+ sub r1, r1, #4
+8:
+ vld1.u8 {d4, d5}, [r2, :128]!
+ vld1.u8 {d0}, [r0, :64]
+ vld1.u8 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vmull.u8 q3, d2, d4
+ vmlal.u8 q3, d0, d17
+ vmull.u8 q10, d2, d5
+ vmlal.u8 q10, d1, d17
+ vrshrn.i16 d22, q3, #6
+ vrshrn.i16 d23, q10, #6
+ vst1.32 {d22[0]}, [r0, :32]!
+ vst1.32 {d23[0]}, [r12, :32]!
+ vst1.16 {d22[2]}, [r0, :16], r1
+ vst1.16 {d23[2]}, [r12, :16], r1
+ bgt 8b
+ pop {r4,pc}
+160:
+ vmov.i8 q12, #64
+ vld1.u8 {q14}, [lr, :128]
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vsub.i8 q11, q12, q14
+ sub r1, r1, #8
+16:
+ vld1.u8 {q1, q2}, [r2, :128]!
+ vld1.u8 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vld1.u8 {q13}, [r12, :128]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d0, d22
+ vmull.u8 q8, d3, d29
+ vmlal.u8 q8, d1, d23
+ vrshrn.i16 d18, q3, #6
+ vrshrn.i16 d19, q8, #6
+ vmull.u8 q3, d4, d28
+ vmlal.u8 q3, d26, d22
+ vmull.u8 q8, d5, d29
+ vmlal.u8 q8, d27, d23
+ vrshrn.i16 d20, q3, #6
+ vrshrn.i16 d21, q8, #6
+ vst1.u8 {d18}, [r0, :64]!
+ vst1.u8 {d20}, [r12, :64]!
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d21[0]}, [r12, :32], r1
+ bgt 16b
+ pop {r4,pc}
+320:
+ vmov.i8 q10, #64
+ vld1.u8 {q2, q3}, [lr, :128]
+ vsub.i8 q11, q10, q2
+ vsub.i8 d24, d20, d6
+32:
+ vld1.u8 {q8, q9}, [r2, :128]!
+ vld1.u8 {d0, d1, d2}, [r0, :64]
+ subs r4, r4, #1
+ vmull.u8 q15, d16, d4
+ vmlal.u8 q15, d0, d22
+ vmull.u8 q14, d17, d5
+ vmlal.u8 q14, d1, d23
+ vrshrn.i16 d0, q15, #6
+ vrshrn.i16 d1, q14, #6
+ vmull.u8 q15, d18, d6
+ vmlal.u8 q15, d2, d24
+ vrshrn.i16 d2, q15, #6
+ vst1.u8 {d0, d1, d2}, [r0, :64], r1
+ bgt 32b
+ pop {r4,pc}
+endfunc
+
+
+// This has got the same signature as the put_8tap functions,
+// assumes that the caller has loaded the h argument into r5,
+// and assumes that r8 is set to (clz(w)-24).
+function put_neon
+ adr r9, L(put_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(put_tbl):
+ .word 1280f - L(put_tbl) + CONFIG_THUMB
+ .word 640f - L(put_tbl) + CONFIG_THUMB
+ .word 32f - L(put_tbl) + CONFIG_THUMB
+ .word 160f - L(put_tbl) + CONFIG_THUMB
+ .word 8f - L(put_tbl) + CONFIG_THUMB
+ .word 4f - L(put_tbl) + CONFIG_THUMB
+ .word 2f - L(put_tbl) + CONFIG_THUMB
+
+2:
+ vld1.16 {d0[]}, [r2], r3
+ vld1.16 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.16 {d0[0]}, [r0, :16], r1
+ vst1.16 {d1[0]}, [r0, :16], r1
+ bgt 2b
+ pop {r4-r11,pc}
+4:
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d1[0]}, [r0, :32], r1
+ bgt 4b
+ pop {r4-r11,pc}
+8:
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ subs r5, r5, #2
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d1}, [r0, :64], r1
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+ add r8, r0, r1
+ lsl r1, r1, #1
+ add r9, r2, r3
+ lsl r3, r3, #1
+16:
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [r9], r3
+ subs r5, r5, #2
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r8, :128], r1
+ bgt 16b
+ pop {r4-r11,pc}
+32:
+ vld1.8 {q0, q1}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q0, q1}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r1, r1, #32
+ sub r3, r3, #32
+64:
+ vld1.8 {q0, q1}, [r2]!
+ vst1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q2, q3}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r1, r1, #96
+ sub r3, r3, #96
+128:
+ vld1.8 {q8, q9}, [r2]!
+ vst1.8 {q8, q9}, [r0, :128]!
+ vld1.8 {q10, q11}, [r2]!
+ vst1.8 {q10, q11}, [r0, :128]!
+ vld1.8 {q12, q13}, [r2]!
+ vst1.8 {q12, q13}, [r0, :128]!
+ vld1.8 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.8 {q14, q15}, [r0, :128], r1
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+
+// This has got the same signature as the put_8tap functions,
+// assumes that the caller has loaded the h argument into r4,
+// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
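+// Every width-specialized path below does the same per-pixel work,
+//   tmp[x] = src[x] << 4,
+// differing only in load/store widths and stride handling.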
+function prep_neon
+ adr r9, L(prep_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(prep_tbl):
+ .word 1280f - L(prep_tbl) + CONFIG_THUMB
+ .word 640f - L(prep_tbl) + CONFIG_THUMB
+ .word 320f - L(prep_tbl) + CONFIG_THUMB
+ .word 160f - L(prep_tbl) + CONFIG_THUMB
+ .word 8f - L(prep_tbl) + CONFIG_THUMB
+ .word 4f - L(prep_tbl) + CONFIG_THUMB
+
+4:
+ vld1.32 {d0[]}, [r1], r2
+ vld1.32 {d2[]}, [r1], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d0, #4
+ vshll.u8 q1, d2, #4
+ vst1.16 {d1, d2}, [r0, :64]!
+ bgt 4b
+ pop {r4-r11,pc}
+8:
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d0, #4
+ vshll.u8 q1, d2, #4
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+ add r9, r1, r2
+ lsl r2, r2, #1
+ add r8, r0, r7
+ lsl r7, r7, #1
+16:
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r9], r2
+ subs r4, r4, #2
+ vshll.u8 q0, d4, #4
+ vshll.u8 q1, d5, #4
+ vshll.u8 q2, d6, #4
+ vshll.u8 q3, d7, #4
+ vst1.16 {q0, q1}, [r0, :128], r7
+ vst1.16 {q2, q3}, [r8, :128], r7
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ add r8, r0, r3
+32:
+ vld1.8 {q0, q1}, [r1], r2
+ subs r4, r4, #2
+ vshll.u8 q8, d0, #4
+ vshll.u8 q9, d1, #4
+ vld1.8 {q2, q3}, [r1], r2
+ vshll.u8 q10, d2, #4
+ vshll.u8 q11, d3, #4
+ vshll.u8 q12, d4, #4
+ vst1.16 {q8, q9}, [r0, :128], r7
+ vshll.u8 q13, d5, #4
+ vst1.16 {q10, q11}, [r8, :128], r7
+ vshll.u8 q14, d6, #4
+ vst1.16 {q12, q13}, [r0, :128], r7
+ vshll.u8 q15, d7, #4
+ vst1.16 {q14, q15}, [r8, :128], r7
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r2, r2, #32
+ add r8, r0, #32
+ mov r6, #64
+64:
+ vld1.8 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshll.u8 q8, d0, #4
+ vshll.u8 q9, d1, #4
+ vld1.8 {q2, q3}, [r1], r2
+ vshll.u8 q10, d2, #4
+ vshll.u8 q11, d3, #4
+ vshll.u8 q12, d4, #4
+ vst1.16 {q8, q9}, [r0, :128], r6
+ vshll.u8 q13, d5, #4
+ vshll.u8 q14, d6, #4
+ vst1.16 {q10, q11}, [r8, :128], r6
+ vshll.u8 q15, d7, #4
+ vst1.16 {q12, q13}, [r0, :128], r6
+ vst1.16 {q14, q15}, [r8, :128], r6
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r2, r2, #96
+ add r8, r0, #32
+ mov r6, #64
+128:
+ vld1.8 {q0, q1}, [r1]!
+ vld1.8 {q2, q3}, [r1]!
+ vshll.u8 q10, d0, #4
+ vshll.u8 q11, d1, #4
+ vshll.u8 q12, d2, #4
+ vshll.u8 q13, d3, #4
+ vshll.u8 q14, d4, #4
+ vshll.u8 q15, d5, #4
+ vld1.8 {q8, q9}, [r1]!
+ vst1.16 {q10, q11}, [r0, :128], r6
+ vst1.16 {q12, q13}, [r8, :128], r6
+ vshll.u8 q0, d6, #4
+ vshll.u8 q1, d7, #4
+ vshll.u8 q2, d16, #4
+ vshll.u8 q3, d17, #4
+ vshll.u8 q8, d18, #4
+ vshll.u8 q9, d19, #4
+ vld1.8 {q10, q11}, [r1], r2
+ vst1.16 {q14, q15}, [r0, :128], r6
+ vst1.16 {q0, q1}, [r8, :128], r6
+ vshll.u8 q12, d20, #4
+ vshll.u8 q13, d21, #4
+ vshll.u8 q14, d22, #4
+ vshll.u8 q15, d23, #4
+ subs r4, r4, #1
+ vst1.16 {q2, q3}, [r0, :128], r6
+ vst1.16 {q8, q9}, [r8, :128], r6
+ vst1.16 {q12, q13}, [r0, :128], r6
+ vst1.16 {q14, q15}, [r8, :128], r6
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ vld1.\wd {\d0[]}, [\s0], \strd
+ vld1.\wd {\d1[]}, [\s1], \strd
+.ifnb \d2
+ vld1.\wd {\d2[]}, [\s0], \strd
+ vld1.\wd {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.\wd {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.\wd {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.\wd {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ vld1.8 {\d0}, [\s0], \strd
+ vld1.8 {\d1}, [\s1], \strd
+.ifnb \d2
+ vld1.8 {\d2}, [\s0], \strd
+ vld1.8 {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.8 {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.8 {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.8 {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1_16 r0, r1, r2, r3, r4
+ vext.8 \r0, \r0, \r1, #6
+ vext.8 \r1, \r1, \r2, #6
+.ifnb \r3
+ vext.8 \r2, \r2, \r3, #6
+ vext.8 \r3, \r3, \r4, #6
+.endif
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+ vext.8 \r0, \r0, \r1, #4
+ vext.8 \r1, \r1, \r2, #4
+.ifnb \r3
+ vext.8 \r2, \r2, \r3, #4
+ vext.8 \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6
+ vmovl.u8 \q0, \d0
+ vmovl.u8 \q1, \d1
+.ifnb \q2
+ vmovl.u8 \q2, \d2
+ vmovl.u8 \q3, \d3
+.endif
+.ifnb \q4
+ vmovl.u8 \q4, \d4
+.endif
+.ifnb \q5
+ vmovl.u8 \q5, \d5
+.endif
+.ifnb \q6
+ vmovl.u8 \q6, \d6
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3
+ vmul.s16 \d, \s0, d0[0]
+ vmla.s16 \d, \s1, d0[1]
+ vmla.s16 \d, \s2, d0[2]
+ vmla.s16 \d, \s3, d0[3]
+.endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ vmul.s16 \d0, \s0, d0[0]
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ vmul.s16 \d0, \s0, d0[0]
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+ vmul.s16 \d1, \s1, d0[0]
+ vmla.s16 \d1, \s2, d0[1]
+ vmla.s16 \d1, \s3, d0[2]
+ vmla.s16 \d1, \s4, d0[3]
+ vmla.s16 \d1, \s5, d1[0]
+ vmla.s16 \d1, \s6, d1[1]
+ vmla.s16 \d1, \s7, d1[2]
+ vmla.s16 \d1, \s8, d1[3]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ vmul.s16 \d0, \s0, d0[0]
+ vmla.s16 \d0, \s1, d0[1]
+ vmla.s16 \d0, \s2, d0[2]
+ vmla.s16 \d0, \s3, d0[3]
+ vmla.s16 \d0, \s4, d1[0]
+ vmla.s16 \d0, \s5, d1[1]
+ vmla.s16 \d0, \s6, d1[2]
+ vmla.s16 \d0, \s7, d1[3]
+ vmul.s16 \d1, \s2, d0[0]
+ vmla.s16 \d1, \s3, d0[1]
+ vmla.s16 \d1, \s4, d0[2]
+ vmla.s16 \d1, \s5, d0[3]
+ vmla.s16 \d1, \s6, d1[0]
+ vmla.s16 \d1, \s7, d1[1]
+ vmla.s16 \d1, \s8, d1[2]
+ vmla.s16 \d1, \s9, d1[3]
+.endm
+.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
+ vqrshrun.s16 \d0, \q0, #\shift
+.ifnb \q1
+ vqrshrun.s16 \d1, \q1, #\shift
+.endif
+.ifnb \q2
+ vqrshrun.s16 \d2, \q2, #\shift
+ vqrshrun.s16 \d3, \q3, #\shift
+.endif
+.endm
+.macro vrshr_s16 shift, r0, r1, r2, r3
+ vrshr.s16 \r0, \r0, #\shift
+.ifnb \r1
+ vrshr.s16 \r1, \r1, #\shift
+.endif
+.ifnb \r2
+ vrshr.s16 \r2, \r2, #\shift
+ vrshr.s16 \r3, \r3, #\shift
+.endif
+.endm
+.macro st_16 strd, reg, lanes
+ vst1.16 {\reg[0]}, [r0, :16], \strd
+ vst1.16 {\reg[1]}, [r8, :16], \strd
+.if \lanes > 2
+ vst1.16 {\reg[2]}, [r0, :16], \strd
+ vst1.16 {\reg[3]}, [r8, :16], \strd
+.endif
+.endm
+.macro st_32 strd, r0, r1
+ vst1.32 {\r0[0]}, [r0, :32], \strd
+ vst1.32 {\r0[1]}, [r8, :32], \strd
+.ifnb \r1
+ vst1.32 {\r1[0]}, [r0, :32], \strd
+ vst1.32 {\r1[1]}, [r8, :32], \strd
+.endif
+.endm
+.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+ vst1.8 {\r0}, [r0, \align], \strd
+ vst1.8 {\r1}, [r8, \align], \strd
+.ifnb \r2
+ vst1.8 {\r2}, [r0, \align], \strd
+ vst1.8 {\r3}, [r8, \align], \strd
+.endif
+.ifnb \r4
+ vst1.8 {\r4}, [r0, \align], \strd
+ vst1.8 {\r5}, [r8, \align], \strd
+ vst1.8 {\r6}, [r0, \align], \strd
+ vst1.8 {\r7}, [r8, \align], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3
+.ifc \type, put
+ vqrshrun_s16 6, \q0, \d0, \q1, \d2
+ st_32 \strd, \d0, \d2
+.else
+ vrshr_s16 2, \q0, \q1
+ st_reg \strd, :64, \d0, \d1, \d2, \d3
+.endif
+.endm
+.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3
+.ifc \type, put
+ vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ st_reg \strd, :64, \d0, \d1, \d2, \d3
+.else
+ vrshr_s16 2, \q0, \q1, \q2, \q3
+ st_reg \strd, :128,\q0, \q1, \q2, \q3
+.endif
+.endm
+.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3
+.ifc \type, put
+ vqrshrun.s16 \d0, \q0, #6
+ vqrshrun.s16 \d1, \q1, #6
+ vqrshrun.s16 \d4, \q2, #6
+ vqrshrun.s16 \d5, \q3, #6
+ st_reg \strd, :128, \q0, \q2
+.else
+ vrshr_s16 2, \q0, \q1, \q2, \q3
+ vst1.16 {\q0, \q1}, [r0, :128], \strd
+ vst1.16 {\q2, \q3}, [r8, :128], \strd
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ movw r8, \type_h
+ movw r9, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
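+
+// These constants pack two row offsets into X(mc_subpel_filters), with 15
+// rows per filter set: bits 0-6 give the set used when the filtered
+// dimension is <= 4 and bits 7-13 the 8-tap set. The 0x4081 multiply in
+// the 8tap functions below replicates the subpel fraction into the same
+// fields, so testing bits 14+ checks for a nonzero fraction (this reading
+// is inferred from the "mx, 8tap_h, 4tap_h" comment below).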
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, r10
+ mul \my, \my, r10
+ add \mx, \mx, r8 // mx, 8tap_h, 4tap_h
+ add \my, \my, r9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+
+ clz r8, \w
+ tst \mx, #(0x7f << 14)
+ sub r8, r8, #24
+ movrel r10, X(mc_subpel_filters), -8
+ bne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ bne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx r9, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ it gt
+ movgt \mx, r9
+ tst \my, #(0x7f << 14)
+ add \mx, r10, \mx, lsl #3
+ bne L(\type\()_8tap_hv)
+
+ adr r9, L(\type\()_8tap_h_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_h_tbl):
+ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+2:
+ vld1.8 {d4}, [\src], \s_strd
+ vld1.8 {d6}, [\sr2], \s_strd
+ vmovl.u8 q2, d4
+ vmovl.u8 q3, d6
+ vext.8 d5, d4, d5, #2
+ vext.8 d7, d6, d7, #2
+ subs \h, \h, #2
+ vtrn.32 d4, d6
+ vtrn.32 d5, d7
+ vmul.s16 d2, d4, d0[0]
+ vmla.s16 d2, d5, d0[1]
+ vmla.s16 d2, d6, d0[2]
+ vmla.s16 d2, d7, d0[3]
+ vrshr.s16 d2, d2, #2
+ vqrshrun.s16 d2, q1, #4
+ vst1.16 {d2[0]}, [\dst, :16], \d_strd
+ vst1.16 {d2[1]}, [\ds2, :16], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+4:
+ vld1.8 {d16}, [\src], \s_strd
+ vld1.8 {d24}, [\sr2], \s_strd
+ vmovl.u8 q8, d16
+ vmovl.u8 q12, d24
+ vext.8 d18, d16, d17, #2
+ vext.8 d20, d16, d17, #4
+ vext.8 d22, d16, d17, #6
+ vext.8 d26, d24, d25, #2
+ vext.8 d28, d24, d25, #4
+ vext.8 d30, d24, d25, #6
+ subs \h, \h, #2
+ vmul.s16 d4, d16, d0[0]
+ vmla.s16 d4, d18, d0[1]
+ vmla.s16 d4, d20, d0[2]
+ vmla.s16 d4, d22, d0[3]
+ vmul.s16 d5, d24, d0[0]
+ vmla.s16 d5, d26, d0[1]
+ vmla.s16 d5, d28, d0[2]
+ vmla.s16 d5, d30, d0[3]
+ vrshr.s16 q2, q2, #2
+.ifc \type, put
+ vqrshrun.s16 d4, q2, #4
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+.endif
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+8:
+ vld1.8 {q8}, [\src], \s_strd
+ vld1.8 {q12}, [\sr2], \s_strd
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+ vmovl.u8 q13, d25
+ vmovl.u8 q12, d24
+
+ vmul.s16 q10, q8, d0[0]
+ vmul.s16 q14, q12, d0[0]
+.irpc i, 1234567
+ vext.8 q11, q8, q9, #(2*\i)
+ vext.8 q15, q12, q13, #(2*\i)
+.if \i < 4
+ vmla.s16 q10, q11, d0[\i]
+ vmla.s16 q14, q15, d0[\i]
+.else
+ vmla.s16 q10, q11, d1[\i-4]
+ vmla.s16 q14, q15, d1[\i-4]
+.endif
+.endr
+ subs \h, \h, #2
+ vrshr.s16 q10, q10, #2
+ vrshr.s16 q14, q14, #2
+.ifc \type, put
+ vqrshrun.s16 d20, q10, #4
+ vqrshrun.s16 d28, q14, #4
+ vst1.8 {d20}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q10}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ bgt 8b
+ pop {r4-r11,pc}
+
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ // This could be done without touching q4-q6, by using only
+ // one temporary for vext in the loop. That's slower on A7 and A53
+ // (but, surprisingly, marginally faster on A8 and A73).
+ vpush {q4-q6}
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ sub \s_strd, \s_strd, \w
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w
+.endif
+161:
+ vld1.8 {d16, d17, d18}, [\src]!
+ vld1.8 {d24, d25, d26}, [\sr2]!
+ mov \mx, \w
+ vmovl.u8 q10, d18
+ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+ vmovl.u8 q14, d26
+ vmovl.u8 q13, d25
+ vmovl.u8 q12, d24
+
+16:
+ vmul.s16 q1, q8, d0[0]
+ vmul.s16 q2, q9, d0[0]
+ vmul.s16 q3, q12, d0[0]
+ vmul.s16 q4, q13, d0[0]
+.irpc i, 1234567
+ vext.8 q5, q8, q9, #(2*\i)
+ vext.8 q6, q9, q10, #(2*\i)
+ vext.8 q11, q12, q13, #(2*\i)
+ vext.8 q15, q13, q14, #(2*\i)
+.if \i < 4
+ vmla.s16 q1, q5, d0[\i]
+ vmla.s16 q2, q6, d0[\i]
+ vmla.s16 q3, q11, d0[\i]
+ vmla.s16 q4, q15, d0[\i]
+.else
+ vmla.s16 q1, q5, d1[\i-4]
+ vmla.s16 q2, q6, d1[\i-4]
+ vmla.s16 q3, q11, d1[\i-4]
+ vmla.s16 q4, q15, d1[\i-4]
+.endif
+.endr
+ vrshr.s16 q1, q1, #2
+ vrshr.s16 q2, q2, #2
+ vrshr.s16 q3, q3, #2
+ vrshr.s16 q4, q4, #2
+ subs \mx, \mx, #16
+.ifc \type, put
+ vqrshrun.s16 d2, q1, #4
+ vqrshrun.s16 d3, q2, #4
+ vqrshrun.s16 d4, q3, #4
+ vqrshrun.s16 d5, q4, #4
+ vst1.8 {q1}, [\dst, :128]!
+ vst1.8 {q2}, [\ds2, :128]!
+.else
+ vst1.16 {q1, q2}, [\dst, :128]!
+ vst1.16 {q3, q4}, [\ds2, :128]!
+.endif
+ ble 9f
+
+ vmov q8, q10
+ vmov q12, q14
+ vld1.8 {d18, d19}, [\src]!
+ vld1.8 {d26, d27}, [\sr2]!
+ vmovl.u8 q10, d19
+ vmovl.u8 q9, d18
+ vmovl.u8 q14, d27
+ vmovl.u8 q13, d26
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ vpop {q4-q6}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx r9, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r9
+ add \my, r10, \my, lsl #3
+
+ adr r9, L(\type\()_8tap_v_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_v_tbl):
+ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ bgt 28f
+
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ // 2x2 v
+ load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_16 d1, d2, d3, d4, d5
+ bgt 24f
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
+ mul_mla_4 d6, d16, d18, d20, d22
+ vqrshrun_s16 6, q3, d6
+ st_16 \d_strd, d6, 2
+ pop {r4-r11,pc}
+
+24: // 2x4 v
+ load_16 \sr2, \src, \s_strd, d6, d7
+ interleave_1_16 d5, d6, d7
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6
+ vmov d17, d20
+ vmov d19, d22
+ vmov d21, d24
+ vmov d23, d26
+ mul_mla_4 q3, q8, q9, q10, q11
+ vqrshrun_s16 6, q3, d6
+ st_16 \d_strd, d6, 4
+ pop {r4-r11,pc}
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ vpush {q4-q7}
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14
+ interleave_1_16 d2, d4, d6, d8, d10
+ interleave_1_16 d10, d12, d14
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12
+ vmov d3, d6
+ vmov d5, d8
+ vmov d7, d10
+ vmov d9, d12
+216:
+ subs \h, \h, #4
+ load_16 \sr2, \src, \s_strd, d16, d18, d20, d22
+ interleave_1_16 d14, d16, d18, d20, d22
+ vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20
+ vmov d11, d14
+ vmov d13, d16
+ vmov d15, d18
+ vmov d17, d20
+ mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8
+ vqrshrun_s16 6, q1, d2
+ st_16 \d_strd, d2, 4
+ ble 0f
+ cmp \h, #2
+ vmov q1, q5
+ vmov q2, q6
+ vmov q3, q7
+ vmov q4, q8
+ vmov q5, q9
+ vmov q6, q10
+ vmov d14, d22
+ beq 26f
+ b 216b
+26:
+ load_16 \sr2, \src, \s_strd, d16, d18
+ interleave_1_16 d14, d16, d18
+ vmovl_u8 q7, d14, q8, d16
+ vmov d11, d14
+ vmov d13, d16
+ mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16
+ vqrshrun_s16 6, q1, d2
+ st_16 \d_strd, d2, 2
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+.endif
+
+40:
+ bgt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_32 d1, d2, d3, d4, d5
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
+ mul_mla_4 q3, q8, q9, q10, q11
+ shift_store_4 \type, \d_strd, q3, d6, d7
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d6, d7
+ interleave_1_32 d5, d6, d7
+ vmovl_u8 q12, d5, q13, d6
+ mul_mla_4 q3, q10, q11, q12, q13
+ shift_store_4 \type, \d_strd, q3, d6, d7
+0:
+ pop {r4-r11,pc}
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ vpush {q4}
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
+ interleave_1_32 d2, d4, d6
+ interleave_1_32 d6, d8, d16, d18, d20
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18
+
+48:
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d22, d24, d26, d28
+ interleave_1_32 d20, d22, d24, d26, d28
+ vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26
+ mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13
+ shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d30, d2
+ subs \h, \h, #2
+ interleave_1_32 d28, d30, d2
+ vmovl_u8 q14, d28, q15, d30
+ mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15
+ shift_store_4 \type, \d_strd, q8, d16, d17
+ ble 0f
+ load_32 \sr2, \src, \s_strd, d4, d6
+ subs \h, \h, #2
+ interleave_1_32 d2, d4, d6
+ vmovl_u8 q1, d2, q2, d4
+ mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2
+ shift_store_4 \type, \d_strd, q9, d18, d19
+ ble 0f
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d8, d16, d18, d20
+ interleave_1_32 d6, d8, d16, d18, d20
+ vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18
+ mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9
+ shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27
+ bgt 48b
+0:
+ vpop {q4}
+ pop {r4-r11,pc}
+
+80:
+ bgt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5
+ mul_mla_4 q1, q8, q9, q10, q11
+ mul_mla_4 q2, q9, q10, q11, q12
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+ ble 0f
+ load_reg \sr2, \src, \s_strd, d6, d7
+ vmovl_u8 q13, d6, q14, d7
+ mul_mla_4 q1, q10, q11, q12, q13
+ mul_mla_4 q2, q11, q12, q13, q14
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+0:
+ pop {r4-r11,pc}
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ vpush {q4}
+ vld1.8 {d0}, [\my, :64]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
+ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20
+
+88:
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d22, d24
+ vmovl_u8 q11, d22, q12, d24
+ mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12
+ shift_store_8 \type, \d_strd, q1, d2, q2, d4
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d26, d28
+ vmovl_u8 q13, d26, q14, d28
+ mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14
+ shift_store_8 \type, \d_strd, q3, d6, q4, d8
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d30, d2
+ vmovl_u8 q15, d30, q1, d2
+ mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1
+ shift_store_8 \type, \d_strd, q8, d16, q9, d18
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d4, d6
+ vmovl_u8 q2, d4, q3, d6
+ mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3
+ shift_store_8 \type, \d_strd, q10, d20, q11, d22
+ ble 9f
+ subs \h, \h, #4
+ load_reg \sr2, \src, \s_strd, d8, d16, d18, d20
+ vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20
+ mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8
+ mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10
+ shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30
+ bgt 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ vpop {q4}
+ pop {r4-r11,pc}
+
+160:
+ bgt 1680b
+
+ // 16x2, 16x4 v
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ cmp \h, #2
+ load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15
+ vmovl.u8 q1, d22
+ vmovl.u8 q2, d24
+ vmovl.u8 q3, d26
+ vmovl.u8 q8, d28
+ vmovl.u8 q9, d30
+ vmovl.u8 q11, d23
+ vmovl.u8 q12, d25
+ vmovl.u8 q13, d27
+ vmovl.u8 q14, d29
+ vmovl.u8 q15, d31
+ mul_mla_4 q1, q1, q2, q3, q8
+ mul_mla_4 q10, q2, q3, q8, q9
+ mul_mla_4 q2, q11, q12, q13, q14
+ mul_mla_4 q11, q12, q13, q14, q15
+ shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11
+ ble 0f
+ load_reg \sr2, \src, \s_strd, q10, q11
+ vmovl.u8 q1, d20
+ vmovl.u8 q10, d21
+ vmovl.u8 q12, d22
+ vmovl.u8 q11, d23
+ mul_mla_4 q2, q3, q8, q9, q1
+ mul_mla_4 q3, q13, q14, q15, q10
+ mul_mla_4 q13, q8, q9, q1, q12
+ mul_mla_4 q14, q14, q15, q10, q11
+ shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx r9, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r9
+ add \my, r10, \my, lsl #3
+
+ adr r9, L(\type\()_8tap_hv_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_8tap_hv_tbl):
+ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 280f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+
+ vld1.8 {d26}, [\src], \s_strd
+ vmovl.u8 q13, d26
+ vext.8 q14, q13, q13, #2
+ vmul.s16 d26, d26, d0
+ vmul.s16 d28, d28, d0
+ vpadd.s16 d26, d26, d28
+ vpadd.s16 d26, d26, d26
+ vrshr.s16 d16, d26, #2
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vmov d17, d26
+ vext.8 d16, d16, d26, #4
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d18, d17, d26, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d26, d2[3]
+
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqmovun.s16 d4, q2
+ subs \h, \h, #2
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d26
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.8 {d26}, [\src], \s_strd
+ vmovl.u8 q13, d26
+ vext.8 q14, q13, q13, #2
+ vmul.s16 d26, d26, d0
+ vmul.s16 d28, d28, d0
+ vpadd.s16 d26, d26, d28
+ vpadd.s16 d26, d26, d26
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d16, d16, d16, #4
+ vmov d17, d26
+ vext.8 d16, d16, d26, #4
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d18, d17, d26, #4
+ vmov d19, d26
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d20, d19, d26, #4
+ vmov d21, d26
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d22, d21, d26, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d20, d3[0]
+ vmlal.s16 q2, d21, d3[1]
+ vmlal.s16 q2, d22, d3[2]
+ vmlal.s16 q2, d26, d3[3]
+
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqmovun.s16 d4, q2
+ subs \h, \h, #2
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d19
+ vmov d18, d20
+ vmov d19, d21
+ vmov d20, d22
+ vmov d21, d26
+ b 28b
+
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_2):
+ vld1.8 {d28}, [\sr2], \s_strd
+ vld1.8 {d30}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vext.8 d31, d30, d30, #1
+ vmovl.u8 q13, d28
+ vmovl.u8 q14, d29
+ vmov d27, d28
+ vmovl.u8 q14, d30
+ vmovl.u8 q15, d31
+ vtrn.32 d26, d28
+ vtrn.32 d27, d30
+ vmul.s16 d26, d26, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d28, d0[2]
+ vmla.s16 d26, d30, d0[3]
+ vrshr.s16 d26, d26, #2
+ vext.8 d27, d26, d26, #4
+ bx lr
+.endif
+
+40:
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 480f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ // 4x2, 4x4 hv
+ vld1.8 {d30}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d31, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d31, d0[3]
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_4)
+ vmov d17, d26
+ vmov d18, d27
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d26, d2[3]
+ vmull.s16 q3, d17, d2[0]
+ vmlal.s16 q3, d18, d2[1]
+ vmlal.s16 q3, d26, d2[2]
+ vmlal.s16 q3, d27, d2[3]
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqrshrn.s32 d6, q3, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d6, q3
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[0]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d6}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ vmov d17, d26
+ vmov d18, d27
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.8 {d30}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d31, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d31, d0[3]
+ vrshr.s16 d16, d26, #2
+
+ bl L(\type\()_8tap_filter_4)
+ vmov d17, d26
+ vmov d18, d27
+ bl L(\type\()_8tap_filter_4)
+ vmov d19, d26
+ vmov d20, d27
+ bl L(\type\()_8tap_filter_4)
+ vmov d21, d26
+ vmov d22, d27
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d20, d3[0]
+ vmlal.s16 q2, d21, d3[1]
+ vmlal.s16 q2, d22, d3[2]
+ vmlal.s16 q2, d26, d3[3]
+ vmull.s16 q3, d17, d2[0]
+ vmlal.s16 q3, d18, d2[1]
+ vmlal.s16 q3, d19, d2[2]
+ vmlal.s16 q3, d20, d2[3]
+ vmlal.s16 q3, d21, d3[0]
+ vmlal.s16 q3, d22, d3[1]
+ vmlal.s16 q3, d26, d3[2]
+ vmlal.s16 q3, d27, d3[3]
+ vqrshrn.s32 d4, q2, #\shift_hv
+ vqrshrn.s32 d6, q3, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d6, q3
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[0]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d6}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ vmov d17, d19
+ vmov d18, d20
+ vmov d19, d21
+ vmov d20, d22
+ vmov d21, d26
+ vmov d22, d27
+ b 48b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_4):
+ vld1.8 {d30}, [\sr2], \s_strd
+ vld1.8 {d31}, [\src], \s_strd
+ vmovl.u8 q14, d30
+ vext.8 d27, d28, d29, #2
+ vext.8 d30, d28, d29, #4
+ vext.8 d1, d28, d29, #6
+ vmul.s16 d26, d28, d0[0]
+ vmla.s16 d26, d27, d0[1]
+ vmla.s16 d26, d30, d0[2]
+ vmla.s16 d26, d1, d0[3]
+
+ vmovl.u8 q14, d31
+ vext.8 d30, d28, d29, #2
+ vext.8 d31, d28, d29, #4
+ vext.8 d1, d28, d29, #6
+ vmul.s16 d27, d28, d0[0]
+ vmla.s16 d27, d30, d0[1]
+ vmla.s16 d27, d31, d0[2]
+ vmla.s16 d27, d1, d0[3]
+ vrshr.s16 d26, d26, #2
+ vrshr.s16 d27, d27, #2
+ bx lr
+
+80:
+160:
+320:
+ bgt 880f
+ vpush {q4-q7}
+ add \my, \my, #2
+ vld1.8 {d0}, [\mx, :64]
+ vld1.32 {d2[]}, [\my]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.8 {q14}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vrshr.s16 q3, q10, #2
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q4, q10
+ vmov q5, q11
+
+8:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q12, d6, d2[0]
+ vmull.s16 q13, d7, d2[0]
+ vmull.s16 q14, d8, d2[0]
+ vmull.s16 q15, d9, d2[0]
+ vmlal.s16 q12, d8, d2[1]
+ vmlal.s16 q13, d9, d2[1]
+ vmlal.s16 q14, d10, d2[1]
+ vmlal.s16 q15, d11, d2[1]
+ vmlal.s16 q12, d10, d2[2]
+ vmlal.s16 q13, d11, d2[2]
+ vmlal.s16 q14, d20, d2[2]
+ vmlal.s16 q15, d21, d2[2]
+ vmlal.s16 q12, d20, d2[3]
+ vmlal.s16 q13, d21, d2[3]
+ vmlal.s16 q14, d22, d2[3]
+ vmlal.s16 q15, d23, d2[3]
+ vqrshrn.s32 d24, q12, #\shift_hv
+ vqrshrn.s32 d25, q13, #\shift_hv
+ vqrshrn.s32 d28, q14, #\shift_hv
+ vqrshrn.s32 d29, q15, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d28, q14
+ vst1.8 {d24}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q12}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q3, q5
+ vmov q4, q10
+ vmov q5, q11
+ b 8b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.8 {q14}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vrshr.s16 q3, q10, #2
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q4, q10
+ vmov q5, q11
+ bl L(\type\()_8tap_filter_8)
+ vmov q6, q10
+ vmov q7, q11
+ bl L(\type\()_8tap_filter_8)
+ vmov q8, q10
+ vmov q9, q11
+
+88:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q12, d6, d2[0]
+ vmull.s16 q13, d7, d2[0]
+ vmull.s16 q14, d8, d2[0]
+ vmull.s16 q15, d9, d2[0]
+ vmlal.s16 q12, d8, d2[1]
+ vmlal.s16 q13, d9, d2[1]
+ vmlal.s16 q14, d10, d2[1]
+ vmlal.s16 q15, d11, d2[1]
+ vmlal.s16 q12, d10, d2[2]
+ vmlal.s16 q13, d11, d2[2]
+ vmlal.s16 q14, d12, d2[2]
+ vmlal.s16 q15, d13, d2[2]
+ vmlal.s16 q12, d12, d2[3]
+ vmlal.s16 q13, d13, d2[3]
+ vmlal.s16 q14, d14, d2[3]
+ vmlal.s16 q15, d15, d2[3]
+ vmlal.s16 q12, d14, d3[0]
+ vmlal.s16 q13, d15, d3[0]
+ vmlal.s16 q14, d16, d3[0]
+ vmlal.s16 q15, d17, d3[0]
+ vmlal.s16 q12, d16, d3[1]
+ vmlal.s16 q13, d17, d3[1]
+ vmlal.s16 q14, d18, d3[1]
+ vmlal.s16 q15, d19, d3[1]
+ vmlal.s16 q12, d18, d3[2]
+ vmlal.s16 q13, d19, d3[2]
+ vmlal.s16 q14, d20, d3[2]
+ vmlal.s16 q15, d21, d3[2]
+ vmlal.s16 q12, d20, d3[3]
+ vmlal.s16 q13, d21, d3[3]
+ vmlal.s16 q14, d22, d3[3]
+ vmlal.s16 q15, d23, d3[3]
+ vqrshrn.s32 d24, q12, #\shift_hv
+ vqrshrn.s32 d25, q13, #\shift_hv
+ vqrshrn.s32 d28, q14, #\shift_hv
+ vqrshrn.s32 d29, q15, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d28, q14
+ vst1.8 {d24}, [\dst, :64], \d_strd
+ vst1.8 {d28}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q12}, [\dst, :128], \d_strd
+ vst1.16 {q14}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q3, q5
+ vmov q4, q6
+ vmov q5, q7
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+L(\type\()_8tap_filter_8):
+ vld1.8 {q14}, [\sr2], \s_strd
+ vld1.8 {q15}, [\src], \s_strd
+ vmovl.u8 q12, d28
+ vmovl.u8 q13, d29
+ vmul.s16 q10, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q10, q14, d1[\i-4]
+.endr
+ vmovl.u8 q12, d30
+ vmovl.u8 q13, d31
+ vmul.s16 q11, q12, d0[0]
+.irpc i, 123
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q11, q14, d0[\i]
+.endr
+.irpc i, 4567
+ vext.8 q14, q12, q13, #(2*\i)
+ vmla.s16 q11, q14, d1[\i-4]
+.endr
+ vrshr.s16 q10, q10, #2
+ vrshr.s16 q11, q11, #2
+ bx lr
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ vdup.8 d1, \mx
+ vdup.8 d3, \my
+ rsb r8, \mx, #16
+ rsb r9, \my, #16
+ vdup.8 d0, r8
+ vdup.8 d2, r9
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+ clz r8, \w
+ cmp \mx, #0
+ sub r8, r8, #24
+ bne L(\type\()_bilin_h)
+ cmp \my, #0
+ bne L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cmp \my, #0
+ bne L(\type\()_bilin_hv)
+
+ adr r9, L(\type\()_bilin_h_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_h_tbl):
+ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ vld1.32 {d4[]}, [\src], \s_strd
+ vld1.32 {d6[]}, [\sr2], \s_strd
+ vext.8 d5, d4, d4, #1
+ vext.8 d7, d6, d6, #1
+ vtrn.16 q2, q3
+ subs \h, \h, #2
+ vmull.u8 q3, d4, d0
+ vmlal.u8 q3, d5, d1
+ vqrshrn.u16 d4, q3, #4
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ vld1.8 {d4}, [\src], \s_strd
+ vld1.8 {d6}, [\sr2], \s_strd
+ vext.8 d5, d4, d4, #1
+ vext.8 d7, d6, d6, #1
+ vtrn.32 q2, q3
+ subs \h, \h, #2
+ vmull.u8 q3, d4, d0
+ vmlal.u8 q3, d5, d1
+.ifc \type, put
+ vqrshrn.u16 d4, q3, #4
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d6}, [\dst, :64], \d_strd
+ vst1.16 {d7}, [\ds2, :64], \d_strd
+.endif
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ vld1.8 {q8}, [\src], \s_strd
+ vld1.8 {q10}, [\sr2], \s_strd
+ vext.8 q9, q8, q8, #1
+ vext.8 q11, q10, q10, #1
+ subs \h, \h, #2
+ vmull.u8 q8, d16, d0
+ vmull.u8 q10, d20, d0
+ vmlal.u8 q8, d18, d1
+ vmlal.u8 q10, d22, d1
+.ifc \type, put
+ vqrshrn.u16 d16, q8, #4
+ vqrshrn.u16 d18, q10, #4
+ vst1.8 {d16}, [\dst, :64], \d_strd
+ vst1.8 {d18}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q10}, [\ds2, :128], \d_strd
+.endif
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w
+.endif
+161:
+ vld1.8 {d16}, [\src]!
+ vld1.8 {d22}, [\sr2]!
+ mov \mx, \w
+
+16:
+ vld1.8 {d17,d18}, [\src]!
+ vld1.8 {d23,d24}, [\sr2]!
+ vext.8 q10, q8, q9, #1
+ vext.8 q13, q11, q12, #1
+ vmull.u8 q2, d16, d0
+ vmull.u8 q3, d17, d0
+ vmull.u8 q14, d22, d0
+ vmull.u8 q15, d23, d0
+ vmlal.u8 q2, d20, d1
+ vmlal.u8 q3, d21, d1
+ vmlal.u8 q14, d26, d1
+ vmlal.u8 q15, d27, d1
+ subs \mx, \mx, #16
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4
+ vqrshrn.u16 d5, q3, #4
+ vqrshrn.u16 d28, q14, #4
+ vqrshrn.u16 d29, q15, #4
+ vst1.8 {q2}, [\dst, :128]!
+ vst1.8 {q14}, [\ds2, :128]!
+.else
+ vst1.16 {q2, q3}, [\dst, :128]!
+ vst1.16 {q14, q15}, [\ds2, :128]!
+.endif
+ ble 9f
+
+ vmov d16, d18
+ vmov d22, d24
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr r9, L(\type\()_bilin_v_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_v_tbl):
+ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ vld1.16 {d16[]}, [\src], \s_strd
+ bgt 24f
+22:
+ vld1.16 {d17[]}, [\sr2], \s_strd
+ vld1.16 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #6
+ vext.8 d17, d17, d18, #6
+ vmull.u8 q2, d16, d2
+ vmlal.u8 q2, d17, d3
+ vqrshrn.u16 d4, q2, #4
+ vst1.16 {d4[0]}, [\dst, :16]
+ vst1.16 {d4[1]}, [\ds2, :16]
+ pop {r4-r11,pc}
+24: // 2x4, 2x6, 2x8, ... v
+ vld1.16 {d17[]}, [\sr2], \s_strd
+ vld1.16 {d18[]}, [\src], \s_strd
+ vld1.16 {d19[]}, [\sr2], \s_strd
+ vld1.16 {d20[]}, [\src], \s_strd
+ sub \h, \h, #4
+ vext.8 d16, d16, d17, #6
+ vext.8 d17, d17, d18, #6
+ vext.8 d18, d18, d19, #6
+ vext.8 d19, d19, d20, #6
+ vtrn.32 d16, d18
+ vtrn.32 d17, d19
+ vmull.u8 q2, d16, d2
+ vmlal.u8 q2, d17, d3
+ cmp \h, #2
+ vqrshrn.u16 d4, q2, #4
+ vst1.16 {d4[0]}, [\dst, :16], \d_strd
+ vst1.16 {d4[1]}, [\ds2, :16], \d_strd
+ vst1.16 {d4[2]}, [\dst, :16], \d_strd
+ vst1.16 {d4[3]}, [\ds2, :16], \d_strd
+ blt 0f
+ vmov d16, d20
+ beq 22b
+ b 24b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.32 {d16[]}, [\src], \s_strd
+4:
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vmull.u8 q2, d16, d2
+ vmlal.u8 q2, d17, d3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+.else
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.8 {d16}, [\src], \s_strd
+8:
+ vld1.8 {d17}, [\sr2], \s_strd
+ vld1.8 {d18}, [\src], \s_strd
+ vmull.u8 q2, d16, d2
+ vmull.u8 q3, d17, d2
+ vmlal.u8 q2, d17, d3
+ vmlal.u8 q3, d18, d3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d4, q2, #4
+ vqrshrn.u16 d6, q3, #4
+ vst1.8 {d4}, [\dst, :64], \d_strd
+ vst1.8 {d6}, [\ds2, :64], \d_strd
+.else
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+.endif
+ ble 0f
+ vmov d16, d18
+ b 8b
+0:
+ pop {r4-r11,pc}
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {q8}, [\src], \s_strd
+2:
+ vld1.8 {q9}, [\sr2], \s_strd
+ vld1.8 {q10}, [\src], \s_strd
+ vmull.u8 q12, d16, d2
+ vmull.u8 q13, d17, d2
+ vmull.u8 q14, d18, d2
+ vmull.u8 q15, d19, d2
+ vmlal.u8 q12, d18, d3
+ vmlal.u8 q13, d19, d3
+ vmlal.u8 q14, d20, d3
+ vmlal.u8 q15, d21, d3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d24, q12, #4
+ vqrshrn.u16 d25, q13, #4
+ vqrshrn.u16 d28, q14, #4
+ vqrshrn.u16 d29, q15, #4
+ vst1.8 {q12}, [\dst, :128], \d_strd
+ vst1.8 {q14}, [\ds2, :128], \d_strd
+.else
+ vst1.16 {q12, q13}, [\dst, :128], \d_strd
+ vst1.16 {q14, q15}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q8, q10
+ b 2b
+9:
+ subs \w, \w, #16
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+.ifc \type, put
+ add \dst, \dst, #16
+.else
+ add \dst, \dst, #32
+.endif
+ b 1b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+ vmovl.u8 q2, d2
+ vmovl.u8 q3, d3
+ adr r9, L(\type\()_bilin_hv_tbl)
+ ldr r8, [r9, r8, lsl #2]
+ add r9, r9, r8
+ bx r9
+
+ .align 2
+L(\type\()_bilin_hv_tbl):
+ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.32 {d28[]}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vmull.u8 q8, d28, d0
+ vmlal.u8 q8, d29, d1
+
+2:
+ vld1.32 {d28[]}, [\sr2], \s_strd
+ vld1.32 {d30[]}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vext.8 d31, d30, d30, #1
+ vtrn.16 d28, d30
+ vtrn.16 d29, d31
+ vmull.u8 q9, d28, d0
+ vmlal.u8 q9, d29, d1
+
+ vtrn.32 d16, d18
+
+ vmul.u16 d20, d16, d4
+ vmla.u16 d20, d19, d6
+ vqrshrn.u16 d20, q10, #8
+ subs \h, \h, #2
+ vst1.16 {d20[0]}, [\dst, :16], \d_strd
+ vst1.16 {d20[1]}, [\ds2, :16], \d_strd
+ ble 0f
+ vtrn.32 d19, d16
+ b 2b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {d28}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vmull.u8 q8, d28, d0
+ vmlal.u8 q8, d29, d1
+
+4:
+ vld1.8 {d28}, [\sr2], \s_strd
+ vld1.8 {d30}, [\src], \s_strd
+ vext.8 d29, d28, d28, #1
+ vext.8 d31, d30, d30, #1
+ vtrn.32 d28, d30
+ vtrn.32 d29, d31
+ vmull.u8 q9, d28, d0
+ vmlal.u8 q9, d29, d1
+
+ vmov d17, d18
+
+ vmul.u16 q10, q8, q2
+ vmla.u16 q10, q9, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d20, q10, #8
+ vst1.32 {d20[0]}, [\dst, :32], \d_strd
+ vst1.32 {d20[1]}, [\ds2, :32], \d_strd
+.else
+ vrshr.u16 q10, q10, #4
+ vst1.16 {d20}, [\dst, :64], \d_strd
+ vst1.16 {d21}, [\ds2, :64], \d_strd
+.endif
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.8 {q12}, [\src], \s_strd
+ vext.8 q13, q12, q12, #1
+ vmull.u8 q8, d24, d0
+ vmlal.u8 q8, d26, d1
+
+2:
+ vld1.8 {q12}, [\sr2], \s_strd
+ vld1.8 {q14}, [\src], \s_strd
+ vext.8 q13, q12, q12, #1
+ vext.8 q15, q14, q14, #1
+ vmull.u8 q9, d24, d0
+ vmlal.u8 q9, d26, d1
+ vmull.u8 q10, d28, d0
+ vmlal.u8 q10, d30, d1
+
+ vmul.u16 q8, q8, q2
+ vmla.u16 q8, q9, q3
+ vmul.u16 q9, q9, q2
+ vmla.u16 q9, q10, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vqrshrn.u16 d16, q8, #8
+ vqrshrn.u16 d18, q9, #8
+ vst1.8 {d16}, [\dst, :64], \d_strd
+ vst1.8 {d18}, [\ds2, :64], \d_strd
+.else
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q9}, [\ds2, :128], \d_strd
+.endif
+ ble 9f
+ vmov q8, q10
+ b 2b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
+filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
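
Editor's note: for orientation, here is a rough scalar C model of the 8-tap horizontal pass shared by the two instantiations above (the 2- and 4-wide columns use the 4-tap subset of the same table). The function name and signature are illustrative, not dav1d's C template; it mirrors the two-stage rounding used in the assembly (vrshr #2 into the intermediate, then a saturating vqrshrun #4 for the put case), with coefficients taken from one row of mc_subpel_filters as loaded by the code above.

    #include <stdint.h>

    static inline uint8_t clamp_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* One row of the 8-tap horizontal filter, 8bpc (sketch only). */
    static void sketch_8tap_h(uint8_t *put_dst, int16_t *prep_dst,
                              const uint8_t *src, const int8_t filter[8],
                              int w, int is_put)
    {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int t = 0; t < 8; t++)
                sum += filter[t] * src[x + t - 3]; // taps span src[x-3] .. src[x+4]
            const int mid = (sum + 2) >> 2;        // vrshr.s16 #2
            if (is_put)
                put_dst[x] = clamp_u8((mid + 8) >> 4); // vqrshrun.s16 #4
            else
                prep_dst[x] = (int16_t)mid;        // prep keeps the widened intermediate
        }
    }
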
+
+.macro load_filter_ptr src
+ asr r12, \src, #10
+ add r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+ add \src, \src, \inc
+ vld1.8 {\dst}, [r12, :64]
+.endm
+
+.macro load_filter_row dst, src, inc
+ load_filter_ptr \src
+ load_filter_coef \dst, \src, \inc
+.endm
+
+function warp_filter_horz_neon
+ load_filter_ptr r5 // filter 0
+ vld1.16 {q7}, [r2], r3
+ vmov.i8 q6, #128
+
+ load_filter_coef d0, r5, r7 // filter 0
+ load_filter_row d1, r5, r7 // filter 1
+ load_filter_row d2, r5, r7 // filter 2
+ load_filter_ptr r5 // filter 3
+ veor q7, q7, q6 // xor with 128 (equivalent to subtracting 128) to allow using vmull.s8
+ load_filter_coef d3, r5, r7 // filter 3
+ vext.8 d12, d14, d15, #1 // filter 1 pixels
+ vext.8 d13, d14, d15, #2 // filter 2 pixels
+ load_filter_ptr r5 // filter 4
+ vmull.s8 q2, d14, d0 // filter 0 output
+ vmull.s8 q3, d12, d1 // filter 1 output
+ load_filter_coef d0, r5, r7 // filter 4
+ load_filter_ptr r5 // filter 5
+ vext.8 d12, d14, d15, #3 // filter 3 pixels
+ vmull.s8 q4, d13, d2 // filter 2 output
+ vext.8 d13, d14, d15, #4 // filter 4 pixels
+ vpadd.i16 d4, d4, d5 // pixel 0 (4x16)
+ vpadd.i16 d5, d6, d7 // pixel 1 (4x16)
+ load_filter_coef d1, r5, r7 // filter 5
+ load_filter_ptr r5 // filter 6
+ vmull.s8 q5, d12, d3 // filter 3 output
+ vext.8 d12, d14, d15, #5 // filter 5 pixels
+ vmull.s8 q3, d13, d0 // filter 4 output
+ load_filter_coef d0, r5, r7 // filter 6
+ vext.8 d13, d14, d15, #6 // filter 6 pixels
+ load_filter_ptr r5 // filter 7
+ vpadd.i16 d8, d8, d9 // pixel 2 (4x16)
+ vpadd.i16 d9, d10, d11 // pixel 3 (4x16)
+ vmull.s8 q5, d12, d1 // filter 5 output
+ load_filter_coef d1, r5, r7 // filter 7
+ vext.8 d14, d14, d15, #7 // filter 7 pixels
+ vpadd.i16 d6, d6, d7 // pixel 4 (4x16)
+ vpadd.i16 d10, d10, d11 // pixel 5 (4x16)
+ vmull.s8 q6, d13, d0 // filter 6 output
+ vmull.s8 q7, d14, d1 // filter 7 output
+
+ sub r5, r5, r7, lsl #3
+
+ vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16)
+ vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16)
+ vpadd.i16 d12, d12, d13 // pixel 6 (4x16)
+ vpadd.i16 d14, d14, d15 // pixel 7 (4x16)
+ vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16)
+ vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16)
+ vpadd.i16 d4, d4, d5 // pixel 0-3
+ vpadd.i16 d5, d6, d10 // pixel 4-7
+
+ add r5, r5, r8
+
+ bx lr
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldr r6, [sp, #108]
+ ldrd r8, r9, [r4]
+ sxth r7, r8
+ asr r8, r8, #16
+ asr r4, r9, #16
+ sxth r9, r9
+ mov r10, #8
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ sub r2, r2, #3
+ movrel r11, X(mc_warp_filter), 64*8
+.ifnb \t
+ lsl r1, r1, #1
+.endif
+ add r5, r5, #512
+ add r6, r6, #512
+
+ bl warp_filter_horz_neon
+ vrshr.s16 q8, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q9, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q10, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q11, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q12, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q13, q2, #3
+ bl warp_filter_horz_neon
+ vrshr.s16 q14, q2, #3
+
+1:
+ bl warp_filter_horz_neon
+ vrshr.s16 q15, q2, #3
+
+ load_filter_row d8, r6, r9
+ load_filter_row d9, r6, r9
+ load_filter_row d10, r6, r9
+ load_filter_row d11, r6, r9
+ load_filter_row d12, r6, r9
+ load_filter_row d13, r6, r9
+ load_filter_row d14, r6, r9
+ load_filter_row d15, r6, r9
+ transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
+ vmovl.s8 q1, d8
+ vmovl.s8 q2, d9
+ vmovl.s8 q3, d10
+ vmovl.s8 q4, d11
+ vmovl.s8 q5, d12
+ vmovl.s8 q6, d13
+
+ sub r6, r6, r9, lsl #3
+
+ // This ordering of vmull/vmlal is highly beneficial for
+ // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+ vmull.s16 q0, d16, d2
+ vmlal.s16 q0, d18, d4
+ vmlal.s16 q0, d20, d6
+ vmlal.s16 q0, d22, d8
+ vmlal.s16 q0, d24, d10
+ vmlal.s16 q0, d26, d12
+ vmull.s16 q1, d17, d3
+ vmlal.s16 q1, d19, d5
+ vmlal.s16 q1, d21, d7
+ vmlal.s16 q1, d23, d9
+ vmlal.s16 q1, d25, d11
+ vmlal.s16 q1, d27, d13
+
+ vmovl.s8 q2, d14
+ vmovl.s8 q3, d15
+
+ vmlal.s16 q0, d28, d4
+ vmlal.s16 q0, d30, d6
+ vmlal.s16 q1, d29, d5
+ vmlal.s16 q1, d31, d7
+
+.ifb \t
+ vmov.i16 q7, #128
+.else
+ vmov.i16 q7, #0x800
+.endif
+
+ vmov q8, q9
+ vmov q9, q10
+ vqrshrn.s32 d0, q0, #\shift
+ vmov q10, q11
+ vqrshrn.s32 d1, q1, #\shift
+ vmov q11, q12
+ vadd.i16 q0, q0, q7
+ vmov q12, q13
+.ifb \t
+ vqmovun.s16 d0, q0
+.endif
+ vmov q13, q14
+ vmov q14, q15
+ subs r10, r10, #1
+.ifnb \t
+ vst1.16 {q0}, [r0, :128], r1
+.else
+ vst1.8 {d0}, [r0, :64], r1
+.endif
+
+ add r6, r6, r4
+ bgt 1b
+
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+warp , 11
+warp t, 7
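
Editor's note: the per-column filter selection done by load_filter_ptr/load_filter_coef above can be modelled in scalar C as follows. This is an illustrative sketch (the helper names are mine): the biased position selects one 8-tap row of the warp filter table via (pos >> 10), with the table base preloaded at entry 64 (movrel ..., 64*8) and the +512 bias folded into r5/r6 in the prologue.

    #include <stdint.h>

    /* Pick one 8-tap filter row for a biased position (sketch). */
    static const int8_t *warp_filter_row(const int8_t (*warp_filters)[8], int pos)
    {
        return warp_filters[64 + (pos >> 10)];
    }

    /* One horizontal row: mx is biased by +512, stepped by abcd[0] per column
     * and by abcd[1] per source row; the vertical pass mirrors this with
     * abcd[2]/abcd[3]. */
    static void select_row_filters(const int8_t (*warp_filters)[8],
                                   const int16_t abcd[4], int mx_biased,
                                   const int8_t *filters_out[8])
    {
        for (int x = 0; x < 8; x++, mx_biased += abcd[0])
            filters_out[x] = warp_filter_row(warp_filters, mx_biased);
    }
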
+
+// void dav1d_emu_edge_8bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ ldrd r8, r9, [sp, #52]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub r12, r3, #1 // ih - 1
+ cmp r5, r3
+ sub lr, r2, #1 // iw - 1
+ it lt
+ movlt r12, r5 // min(y, ih - 1)
+ cmp r4, r2
+ bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+ it lt
+ movlt lr, r4 // min(x, iw - 1)
+ bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0)
+ mla r8, r12, r9, r8 // ref += iclip() * stride
+ add r8, r8, lr // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add r10, r5, r1 // y + bh
+ neg r5, r5 // -y
+ sub r10, r10, r3 // y + bh - ih
+ sub r12, r1, #1 // bh - 1
+ cmp r10, r1
+ bic r5, r5, r5, asr #31 // max(-y, 0)
+ it ge
+ movge r10, r12 // min(y + bh - ih, bh-1)
+ cmp r5, r1
+ bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+ it ge
+ movge r5, r12 // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add r11, r4, r0 // x + bw
+ neg r4, r4 // -x
+ sub r11, r11, r2 // x + bw - iw
+ sub lr, r0, #1 // bw - 1
+ cmp r11, r0
+ bic r4, r4, r4, asr #31 // max(-x, 0)
+ it ge
+ movge r11, lr // min(x + bw - iw, bw-1)
+ cmp r4, r0
+ bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+ it ge
+ movge r4, lr // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub r1, r1, r5 // bh - top_ext
+ mla r6, r5, r7, r6
+ sub r2, r0, r4 // bw - left_ext
+ sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext
+ sub r2, r2, r11 // center_w = bw - left_ext - right_ext
+
+ mov r0, r6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ vld1.8 {d0[], d1[]}, [r8]
+ mov r12, r6 // out = dst
+ mov r3, r4
+1:
+ subs r3, r3, #16
+ vst1.8 {q0}, [r12, :128]!
+ bgt 1b
+.endif
+ mov lr, r8
+ add r12, r6, r4 // out = dst + left_ext
+ mov r3, r2
+1:
+ vld1.8 {q0, q1}, [lr]!
+ subs r3, r3, #32
+.if \need_left
+ vst1.8 {q0, q1}, [r12]!
+.else
+ vst1.8 {q0, q1}, [r12, :128]!
+.endif
+ bgt 1b
+.if \need_right
+ add r3, r8, r2 // in + center_w
+ sub r3, r3, #1 // in + center_w - 1
+ add r12, r6, r4 // dst + left_ext
+ vld1.8 {d0[], d1[]}, [r3]
+ add r12, r12, r2 // out = dst + left_ext + center_w
+ mov r3, r11
+1:
+ subs r3, r3, #16
+ vst1.8 {q0}, [r12]!
+ bgt 1b
+.endif
+
+ subs r1, r1, #1 // center_h--
+ add r6, r6, r7
+ add r8, r8, r9
+ bgt 0b
+.endm
+
+ cmp r4, #0
+ beq 2f
+ // need_left
+ cmp r11, #0
+ beq 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cmp r11, #0
+ beq 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+ cmp r10, #0
+ // Storing the original dst in r0 overwrote bw, recalculate it here
+ add r2, r2, r4 // center_w + left_ext
+ add r2, r2, r11 // bw = center_w + left_ext + right_ext
+
+ beq 3f
+ // need_bottom
+ sub r8, r6, r7 // ref = dst - stride
+ mov r4, r2
+1:
+ vld1.8 {q0, q1}, [r8, :128]!
+ mov r3, r10
+2:
+ subs r3, r3, #1
+ vst1.8 {q0, q1}, [r6, :128], r7
+ bgt 2b
+ mls r6, r7, r10, r6 // dst -= bottom_ext * stride
+ subs r4, r4, #32 // bw -= 32
+ add r6, r6, #32 // dst += 32
+ bgt 1b
+
+3:
+ cmp r5, #0
+ beq 3f
+ // need_top
+ mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride
+1:
+ vld1.8 {q0, q1}, [r0, :128]!
+ mov r3, r5
+2:
+ subs r3, r3, #1
+ vst1.8 {q0, q1}, [r6, :128], r7
+ bgt 2b
+ mls r6, r7, r5, r6 // dst -= top_ext * stride
+ subs r2, r2, #32 // bw -= 32
+ add r6, r6, #32 // dst += 32
+ bgt 1b
+
+3:
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/mc16.S b/third_party/dav1d/src/arm/32/mc16.S
new file mode 100644
index 0000000000..b7d845e219
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/mc16.S
@@ -0,0 +1,3658 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q2, q3}, [r3, :128]!
+ vqadd.s16 q0, q0, q2
+ vqadd.s16 q1, q1, q3
+ vmax.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vmax.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vshl.s16 \d0, q0, q13 // -(intermediate_bits+1)
+ vshl.s16 \d1, q1, q13 // -(intermediate_bits+1)
+.endm
+
+.macro w_avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]!
+ vld1.16 {q2, q3}, [r3, :128]!
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q4
+ vmul.s32 \d1, \d1, q4
+ vmul.s32 q1, q1, q4
+ vshr.s32 \d0, \d0, #4
+ vshr.s32 q0, q0, #4
+ vshr.s32 \d1, \d1, #4
+ vshr.s32 q1, q1, #4
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro mask d0, d00, d01, d1, d10, d11
+ vld1.8 {q7}, [r6, :128]!
+ vld1.16 {q0, q1}, [r2, :128]!
+ vneg.s8 q7, q7
+ vld1.16 {q2, q3}, [r3, :128]!
+ vmovl.s8 q6, d14
+ vmovl.s8 q7, d15
+ vmovl.s16 q4, d12
+ vmovl.s16 q5, d13
+ vmovl.s16 q6, d14
+ vmovl.s16 q7, d15
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q5
+ vmul.s32 \d1, \d1, q6
+ vmul.s32 q1, q1, q7
+ vshr.s32 \d0, \d0, #6
+ vshr.s32 q0, q0, #6
+ vshr.s32 \d1, \d1, #6
+ vshr.s32 q1, q1, #6
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+ push {r4-r7,lr}
+ ldrd r4, r5, [sp, #20]
+ ldr r6, [sp, #28]
+ clz r4, r4
+.ifnc \type, avg
+ ldr r7, [sp, #32]
+ vmov.i16 q14, #0
+ vdup.16 q15, r7 // bitdepth_max
+.endif
+.ifc \type, w_avg
+ vpush {q4}
+.endif
+.ifc \type, mask
+ vpush {q4-q7}
+.endif
+ clz r7, \bdmax
+ sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+ mov lr, #1
+ movw r12, #2*PREP_BIAS
+ lsl lr, lr, r7 // 1 << intermediate_bits
+ neg r12, r12 // -2*PREP_BIAS
+ add r7, r7, #1
+ sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits
+ neg r7, r7 // -(intermediate_bits+1)
+ vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits
+ vdup.16 q13, r7 // -(intermediate_bits+1)
+.else
+ mov r12, #PREP_BIAS
+ lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits
+ neg r7, r7 // -intermediate_bits
+ vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits
+ vdup.16 q13, r7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ vdup.32 q4, r6
+ vneg.s32 q4, q4
+.endif
+ adr r7, L(\type\()_tbl)
+ sub r4, r4, #24
+ \type q8, d16, d17, q9, d18, d19
+ ldr r4, [r7, r4, lsl #2]
+ add r7, r7, r4
+ bx r7
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_tbl) + CONFIG_THUMB
+
+40:
+ add r7, r0, r1
+ lsl r1, r1, #1
+4:
+ subs r5, r5, #4
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r7, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r7, :64], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 4b
+80:
+ add r7, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.16 {q8}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q9}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 8b
+160:
+16:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q10, q11}, [r0, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 16b
+320:
+ add r7, r0, #32
+32:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 32b
+640:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #64
+64:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 64b
+1280:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #192
+128:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 128b
+0:
+.ifc \type, mask
+ vpop {q4-q7}
+.endif
+.ifc \type, w_avg
+ vpop {q4}
+.endif
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+bidir_fn avg, r6
+bidir_fn w_avg, r7
+bidir_fn mask, r7
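
Editor's note: per the constants set up in the macro above (PREP_BIAS = 8192, intermediate_bits = clz(bitdepth_max) - 18), the avg path amounts to the per-pixel arithmetic below. A scalar sketch for orientation only; the function name is mine, and the saturating vqadd/vqsub/vshl sequence in the asm doubles as the [0, bitdepth_max] clip written out explicitly here.

    #include <stdint.h>

    /* avg, 16bpc: intermediate_bits is 4 at 10-bit, 2 at 12-bit (sketch). */
    static uint16_t avg_pixel_sketch(int tmp1, int tmp2,
                                     int intermediate_bits, int bitdepth_max)
    {
        const int sh  = intermediate_bits + 1;
        const int rnd = (1 << intermediate_bits) + 2 * 8192; // 2*PREP_BIAS + rounding
        int v = (tmp1 + tmp2 + rnd) >> sh;
        if (v < 0) v = 0;
        if (v > bitdepth_max) v = bitdepth_max;
        return (uint16_t)v;
    }
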
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+ push {r4-r10,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #96]
+ ldrd r6, r7, [sp, #104]
+ ldr r8, [sp, #112]
+ clz r9, r4
+ adr lr, L(w_mask_\type\()_tbl)
+ vdup.16 q15, r8 // bitdepth_max
+ sub r9, r9, #24
+ clz r8, r8 // clz(bitdepth_max)
+ ldr r9, [lr, r9, lsl #2]
+ add r9, lr, r9
+ sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+ mov r10, #PREP_BIAS*64
+ neg r8, r8 // -sh
+ movw r12, #27615 // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
+ vdup.32 q14, r8 // -sh
+ vdup.16 q0, r12
+.if \type == 444
+ vmov.i8 q1, #64
+.elseif \type == 422
+ vdup.8 d4, r7
+ vmov.i8 d2, #129
+ vsub.i16 d2, d2, d4
+.elseif \type == 420
+ vdup.16 q2, r7
+ vmov.i16 q1, #0x100
+ vsub.i16 q1, q1, q2
+.endif
+ add r12, r0, r1
+ lsl r1, r1, #1
+ bx r9
+
+ .align 2
+L(w_mask_\type\()_tbl):
+ .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+ .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
+
+4:
+ vld1.16 {q2, q3}, [r2, :128]! // tmp1 (four rows at once)
+ vld1.16 {q4, q5}, [r3, :128]! // tmp2 (four rows at once)
+ subs r5, r5, #4
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {q6}, [r6, :128]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.8 {d12}, [r6, :64]!
+.elseif \type == 420
+ vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition)
+ vadd.i16 d13, d14, d15
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {d4}, [r0, :64], r1
+ vst1.16 {d5}, [r12, :64], r1
+ vst1.16 {d6}, [r0, :64], r1
+ vst1.16 {d7}, [r12, :64], r1
+ bgt 4b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+8:
+ vld1.16 {q2, q3}, [r2, :128]! // tmp1
+ vld1.16 {q4, q5}, [r3, :128]! // tmp2
+ subs r5, r5, #2
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {q6}, [r6, :128]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.8 {d12}, [r6, :64]!
+.elseif \type == 420
+ vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {q2}, [r0, :128], r1
+ vst1.16 {q3}, [r12, :128], r1
+ bgt 8b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+1280:
+640:
+320:
+160:
+ sub r1, r1, r4, lsl #1
+.if \type == 444
+ add lr, r6, r4
+.elseif \type == 422
+ add lr, r6, r4, lsr #1
+.endif
+ add r7, r2, r4, lsl #1
+ add r9, r3, r4, lsl #1
+161:
+ mov r8, r4
+16:
+ vld1.16 {q2}, [r2, :128]! // tmp1
+ vld1.16 {q4}, [r3, :128]! // tmp2
+ vld1.16 {q3}, [r7, :128]!
+ vld1.16 {q5}, [r9, :128]!
+ subs r8, r8, #8
+ vdup.32 q13, r10 // PREP_BIAS*64
+ vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2)
+ vabd.s16 q7, q3, q5
+ vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit)
+ vsubl.s16 q9, d9, d5
+ vsubl.s16 q10, d10, d6
+ vsubl.s16 q11, d11, d7
+ vqsub.u16 q6, q0, q6 // 27615 - abs()
+ vqsub.u16 q7, q0, q7
+ vshll.s16 q5, d7, #6 // tmp1 << 6
+ vshll.s16 q4, d6, #6
+ vshll.s16 q3, d5, #6
+ vshll.s16 q2, d4, #6
+ vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh
+ vshr.u16 q7, q7, #10
+ vadd.i32 q2, q2, q13 // += PREP_BIAS*64
+ vadd.i32 q3, q3, q13
+ vadd.i32 q4, q4, q13
+ vadd.i32 q5, q5, q13
+ vmovl.u16 q12, d12
+ vmovl.u16 q13, d13
+ vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m)
+ vmovl.u16 q12, d14
+ vmla.i32 q3, q9, q13
+ vmovl.u16 q13, d15
+ vmla.i32 q4, q10, q12
+ vmla.i32 q5, q11, q13
+ vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ vrshl.s32 q3, q3, q14
+ vrshl.s32 q4, q4, q14
+ vrshl.s32 q5, q5, q14
+ vqmovun.s32 d4, q2 // iclip_pixel
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q4
+ vqmovun.s32 d7, q5
+ vmin.u16 q2, q2, q15 // iclip_pixel
+ vmin.u16 q3, q3, q15 // iclip_pixel
+.if \type == 444
+ vmovn.i16 d12, q6 // 64 - m
+ vmovn.i16 d13, q7
+ vsub.i16 q6, q1, q6 // m
+ vst1.8 {d12}, [r6, :64]!
+ vst1.8 {d13}, [lr, :64]!
+.elseif \type == 422
+ vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition)
+ vpadd.i16 d13, d14, d15
+ vmovn.i16 d12, q6
+ vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ vst1.32 {d12[0]}, [r6, :32]!
+ vst1.32 {d12[1]}, [lr, :32]!
+.elseif \type == 420
+ vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition)
+ vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
+ vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
+ vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ vst1.32 {d12[0]}, [r6, :32]!
+.endif
+ vst1.16 {q2}, [r0, :128]!
+ vst1.16 {q3}, [r12, :128]!
+ bgt 16b
+ subs r5, r5, #2
+ add r2, r2, r4, lsl #1
+ add r3, r3, r4, lsl #1
+ add r7, r7, r4, lsl #1
+ add r9, r9, r4, lsl #1
+.if \type == 444
+ add r6, r6, r4
+ add lr, lr, r4
+.elseif \type == 422
+ add r6, r6, r4, lsr #1
+ add lr, lr, r4, lsr #1
+.endif
+ add r0, r0, r1
+ add r12, r12, r1
+ bgt 161b
+ vpop {q4-q7}
+ pop {r4-r10,pc}
+endfunc
+.endm
+
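+// Instantiate w_mask for the three mask layouts: full resolution (444),
+// horizontally subsampled (422), and horizontally+vertically subsampled (420).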
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
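+// Blend tmp into dst using a per-pixel 8-bit mask m, i.e. in essence
+// dst = dst + (((tmp - dst) * m + 32) >> 6), implemented below as a
+// vqrdmulh by -m << 9 (see the inline comments).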
+function blend_16bpc_neon, export=1
+ push {r4-r5,lr}
+ ldrd r4, r5, [sp, #12]
+ clz lr, r3
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
+
+ .align 2
+L(blend_tbl):
+ .word 320f - L(blend_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_tbl) + CONFIG_THUMB
+
+40:
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld1.8 {d4}, [r5, :64]!
+ vld1.16 {q1}, [r2, :128]!
+ vld1.16 {d0}, [r0, :64]
+ vneg.s8 d4, d4 // -m
+ subs r4, r4, #2
+ vld1.16 {d1}, [r12, :64]
+ vmovl.s8 q2, d4
+ vshl.i16 q2, q2, #9 // -m << 9
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld1.8 {q8}, [r5, :128]!
+ vld1.16 {q2, q3}, [r2, :128]!
+ vneg.s8 q9, q8 // -m
+ vld1.16 {q0}, [r0, :128]
+ vld1.16 {q1}, [r12, :128]
+ vmovl.s8 q8, d18
+ vmovl.s8 q9, d19
+ vshl.i16 q8, q8, #9 // -m << 9
+ vshl.i16 q9, q9, #9
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ subs r4, r4, #2
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q9
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld1.8 {q12, q13}, [r5, :128]!
+ vld1.16 {q8, q9}, [r2, :128]!
+ subs r4, r4, #2
+ vneg.s8 q14, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vneg.s8 q15, q13
+ vld1.16 {q10, q11}, [r2, :128]!
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ vmovl.s8 q15, d31
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+ vshl.i16 q15, q15, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vqrdmulh.s16 q11, q11, q15
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+320:
+ add r12, r0, #32
+32:
+ vld1.8 {q12, q13}, [r5, :128]!
+ vld1.16 {q8, q9}, [r2, :128]!
+ subs r4, r4, #1
+ vneg.s8 q14, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vneg.s8 q15, q13
+ vld1.16 {q10, q11}, [r2, :128]!
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ vmovl.s8 q15, d31
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+ vshl.i16 q15, q15, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vqrdmulh.s16 q11, q11, q15
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 32b
+ pop {r4-r5,pc}
+endfunc
+
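+// Same blend operation as above, but with one obmc_masks weight per row,
+// indexed from obmc_masks + h; only the top h - h/4 rows are blended.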
+function blend_h_16bpc_neon, export=1
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
+ movrel r5, X(obmc_masks)
+ add r5, r5, r4
+ sub r4, r4, r4, lsr #2
+ clz lr, r3
+ adr r12, L(blend_h_tbl)
+ sub lr, lr, #24
+ ldr lr, [r12, lr, lsl #2]
+ add r12, r12, lr
+ bx r12
+
+ .align 2
+L(blend_h_tbl):
+ .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 640f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 320f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_h_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_h_tbl) + CONFIG_THUMB
+
+20:
+ add r12, r0, r1
+ lsl r1, r1, #1
+2:
+ vld2.8 {d4[], d5[]}, [r5, :16]!
+ vld1.16 {d2}, [r2, :64]!
+ vext.8 d4, d4, d5, #6
+ subs r4, r4, #2
+ vneg.s8 d4, d4 // -m
+ vld1.32 {d0[]}, [r0, :32]
+ vld1.32 {d0[1]}, [r12, :32]
+ vmovl.s8 q2, d4
+ vshl.i16 d4, d4, #9 // -m << 9
+ vsub.i16 d2, d0, d2 // a - b
+ vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6
+ vadd.i16 d0, d0, d2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d0[1]}, [r12, :32], r1
+ bgt 2b
+ pop {r4-r5,pc}
+40:
+ add r12, r0, r1
+ lsl r1, r1, #1
+4:
+ vld2.8 {d4[], d5[]}, [r5, :16]!
+ vld1.16 {q1}, [r2, :128]!
+ vext.8 d4, d4, d5, #4
+ subs r4, r4, #2
+ vneg.s8 d4, d4 // -m
+ vld1.16 {d0}, [r0, :64]
+ vld1.16 {d1}, [r12, :64]
+ vmovl.s8 q2, d4
+ vshl.i16 q2, q2, #9 // -m << 9
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r12, :64], r1
+ bgt 4b
+ pop {r4-r5,pc}
+80:
+ add r12, r0, r1
+ lsl r1, r1, #1
+8:
+ vld2.8 {d16[], d17[]}, [r5, :16]!
+ vld1.16 {q2, q3}, [r2, :128]!
+ vneg.s8 q9, q8 // -m
+ vld1.16 {q0}, [r0, :128]
+ subs r4, r4, #2
+ vmovl.s8 q8, d18
+ vmovl.s8 q9, d19
+ vld1.16 {q1}, [r12, :128]
+ vshl.i16 q8, q8, #9 // -m << 9
+ vshl.i16 q9, q9, #9
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q9
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r12, :128], r1
+ bgt 8b
+ pop {r4-r5,pc}
+160:
+ add r12, r0, r1
+ lsl r1, r1, #1
+16:
+ vld2.8 {d24[], d25[]}, [r5, :16]!
+ vld1.16 {q8, q9}, [r2, :128]!
+ subs r4, r4, #2
+ vneg.s8 q13, q12 // -m
+ vld1.16 {q0, q1}, [r0, :128]
+ vmovl.s8 q12, d26
+ vld1.16 {q10, q11}, [r2, :128]!
+ vmovl.s8 q13, d27
+ vld1.16 {q2, q3}, [r12, :128]
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q12
+ vqrdmulh.s16 q10, q10, q13
+ vqrdmulh.s16 q11, q11, q13
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vadd.i16 q3, q3, q11
+ vst1.16 {q0, q1}, [r0, :128], r1
+ vst1.16 {q2, q3}, [r12, :128], r1
+ bgt 16b
+ pop {r4-r5,pc}
+1280:
+640:
+320:
+ sub r1, r1, r3, lsl #1
+321:
+ vld1.8 {d24[]}, [r5]!
+ mov r12, r3
+ vneg.s8 d24, d24 // -m
+ vmovl.s8 q12, d24
+ vshl.i16 q12, q12, #9 // -m << 9
+32:
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r12, r12, #32
+ vld1.16 {q10, q11}, [r2, :128]!
+ vld1.16 {q2, q3}, [r0, :128]
+ vsub.i16 q8, q0, q8 // a - b
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vsub.i16 q11, q3, q11
+ sub r0, r0, #32
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q12
+ vqrdmulh.s16 q10, q10, q12
+ vqrdmulh.s16 q11, q11, q12
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128]!
+ vadd.i16 q3, q3, q11
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 32b
+ subs r4, r4, #1
+ add r0, r0, r1
+ bgt 321b
+ pop {r4-r5,pc}
+endfunc
+
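+// Same blend operation, but with one obmc_masks weight per column, indexed
+// from obmc_masks + w; only the leftmost w*3/4 columns are written back.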
+function blend_v_16bpc_neon, export=1
+ push {r4,lr}
+ ldr r4, [sp, #8]
+ movrel lr, X(obmc_masks)
+ add lr, lr, r3
+ clz r12, r3
+ adr r3, L(blend_v_tbl)
+ sub r12, r12, #26
+ ldr r12, [r3, r12, lsl #2]
+ add r3, r3, r12
+ bx r3
+
+ .align 2
+L(blend_v_tbl):
+ .word 320f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 160f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 80f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 40f - L(blend_v_tbl) + CONFIG_THUMB
+ .word 20f - L(blend_v_tbl) + CONFIG_THUMB
+
+20:
+ add r12, r0, r1
+ lsl r1, r1, #1
+ vld1.8 {d4[]}, [lr]
+ vneg.s8 d4, d4 // -m
+ vmovl.s8 q2, d4
+ vshl.i16 d4, d4, #9 // -m << 9
+2:
+ vld1.32 {d2[]}, [r2, :32]!
+ vld1.16 {d0[]}, [r0, :16]
+ subs r4, r4, #2
+ vld1.16 {d2[1]}, [r2, :16]
+ vld1.16 {d0[1]}, [r12, :16]
+ add r2, r2, #4
+ vsub.i16 d2, d0, d2 // a - b
+ vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6
+ vadd.i16 d0, d0, d2
+ vst1.16 {d0[0]}, [r0, :16], r1
+ vst1.16 {d0[1]}, [r12, :16], r1
+ bgt 2b
+ pop {r4,pc}
+40:
+ vld1.32 {d4[]}, [lr, :32]
+ add r12, r0, r1
+ vneg.s8 d4, d4 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q2, d4
+ sub r1, r1, #4
+ vshl.i16 q2, q2, #9 // -m << 9
+4:
+ vld1.16 {q1}, [r2, :128]!
+ vld1.16 {d0}, [r0, :64]
+ vld1.16 {d1}, [r12, :64]
+ subs r4, r4, #2
+ vsub.i16 q1, q0, q1 // a - b
+ vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6
+ vadd.i16 q0, q0, q1
+ vst1.32 {d0[0]}, [r0, :32]!
+ vst1.32 {d1[0]}, [r12, :32]!
+ vst1.16 {d0[2]}, [r0, :16], r1
+ vst1.16 {d1[2]}, [r12, :16], r1
+ bgt 4b
+ pop {r4,pc}
+80:
+ vld1.8 {d16}, [lr, :64]
+ add r12, r0, r1
+ vneg.s8 d16, d16 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q8, d16
+ sub r1, r1, #8
+ vshl.i16 q8, q8, #9 // -m << 9
+8:
+ vld1.16 {q2, q3}, [r2, :128]!
+ vld1.16 {q0}, [r0, :128]
+ vld1.16 {q1}, [r12, :128]
+ subs r4, r4, #2
+ vsub.i16 q2, q0, q2 // a - b
+ vsub.i16 q3, q1, q3
+ vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q3, q3, q8
+ vadd.i16 q0, q0, q2
+ vadd.i16 q1, q1, q3
+ vst1.16 {d0}, [r0, :64]!
+ vst1.16 {d2}, [r12, :64]!
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vst1.32 {d3[0]}, [r12, :32], r1
+ bgt 8b
+ pop {r4,pc}
+160:
+ vld1.8 {q12}, [lr, :128]
+ add r12, r0, r1
+ vneg.s8 q13, q12 // -m
+ lsl r1, r1, #1
+ vmovl.s8 q12, d26
+ vmovl.s8 q13, d27
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 d26, d26, #9
+16:
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {d0, d1, d2}, [r0, :64]
+ subs r4, r4, #2
+ vld1.16 {q10, q11}, [r2, :128]!
+ vsub.i16 q8, q0, q8 // a - b
+ vld1.16 {d4, d5, d6}, [r12, :64]
+ vsub.i16 d18, d2, d18
+ vsub.i16 q10, q2, q10
+ vsub.i16 d22, d6, d22
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 d18, d18, d26
+ vqrdmulh.s16 q10, q10, q12
+ vqrdmulh.s16 d22, d22, d26
+ vadd.i16 q0, q0, q8
+ vadd.i16 d2, d2, d18
+ vadd.i16 q2, q2, q10
+ vst1.16 {d0, d1, d2}, [r0, :64], r1
+ vadd.i16 d6, d6, d22
+ vst1.16 {d4, d5, d6}, [r12, :64], r1
+ bgt 16b
+ pop {r4,pc}
+320:
+ vld1.8 {d24, d25, d26}, [lr, :64]
+ vneg.s8 q14, q12 // -m
+ vneg.s8 d30, d26
+ vmovl.s8 q12, d28
+ vmovl.s8 q13, d29
+ vmovl.s8 q14, d30
+ sub r1, r1, #32
+ vshl.i16 q12, q12, #9 // -m << 9
+ vshl.i16 q13, q13, #9
+ vshl.i16 q14, q14, #9
+32:
+ vld1.16 {q8, q9}, [r2, :128]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ subs r4, r4, #1
+ vld1.16 {q10}, [r2, :128]
+ vsub.i16 q8, q0, q8 // a - b
+ vld1.16 {q2}, [r0, :128]
+ sub r0, r0, #32
+ vsub.i16 q9, q1, q9
+ vsub.i16 q10, q2, q10
+ vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6
+ vqrdmulh.s16 q9, q9, q13
+ vqrdmulh.s16 q10, q10, q14
+ vadd.i16 q0, q0, q8
+ vadd.i16 q1, q1, q9
+ vadd.i16 q2, q2, q10
+ vst1.16 {q0, q1}, [r0, :128]!
+ add r2, r2, #32
+ vst1.16 {q2}, [r0, :128], r1
+ bgt 32b
+ pop {r4,pc}
+endfunc
+
+// This has the same signature as the put_8tap functions,
+// and assumes that r9 is set to (clz(w)-24).
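+// It is the plain copy path, used when neither a horizontal nor a
+// vertical subpel filter is needed.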
+function put_neon
+ adr r10, L(put_tbl)
+ ldr r9, [r10, r9, lsl #2]
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(put_tbl):
+ .word 1280f - L(put_tbl) + CONFIG_THUMB
+ .word 640f - L(put_tbl) + CONFIG_THUMB
+ .word 320f - L(put_tbl) + CONFIG_THUMB
+ .word 16f - L(put_tbl) + CONFIG_THUMB
+ .word 80f - L(put_tbl) + CONFIG_THUMB
+ .word 4f - L(put_tbl) + CONFIG_THUMB
+ .word 2f - L(put_tbl) + CONFIG_THUMB
+
+2:
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ subs r5, r5, #2
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vst1.32 {d1[1]}, [r0, :32], r1
+ bgt 2b
+ pop {r4-r11,pc}
+4:
+ vld1.16 {d0}, [r2], r3
+ vld1.16 {d1}, [r2], r3
+ subs r5, r5, #2
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ bgt 4b
+ pop {r4-r11,pc}
+80:
+ add r8, r0, r1
+ lsl r1, r1, #1
+ add r9, r2, r3
+ lsl r3, r3, #1
+8:
+ vld1.16 {q0}, [r2], r3
+ vld1.16 {q1}, [r9], r3
+ subs r5, r5, #2
+ vst1.16 {q0}, [r0, :128], r1
+ vst1.16 {q1}, [r8, :128], r1
+ bgt 8b
+ pop {r4-r11,pc}
+16:
+ vld1.16 {q0, q1}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q0, q1}, [r0, :128], r1
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ sub r1, r1, #32
+ sub r3, r3, #32
+32:
+ vld1.16 {q0, q1}, [r2]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q2, q3}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q2, q3}, [r0, :128], r1
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r1, r1, #96
+ sub r3, r3, #96
+64:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r1, r1, #224
+ sub r3, r3, #224
+128:
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2]!
+ vst1.16 {q14, q15}, [r0, :128]!
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q12, q13}, [r2]!
+ vst1.16 {q12, q13}, [r0, :128]!
+ vld1.16 {q14, q15}, [r2], r3
+ subs r5, r5, #1
+ vst1.16 {q14, q15}, [r0, :128], r1
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
+// r8 to w*2.
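+// Like put_neon, but each pixel is shifted left by intermediate_bits and
+// has PREP_BIAS subtracted before being stored.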
+function prep_neon
+ adr r10, L(prep_tbl)
+ ldr r9, [r10, r9, lsl #2]
+ vdup.16 q15, r7 // intermediate_bits
+ vmov.i16 q14, #PREP_BIAS
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(prep_tbl):
+ .word 1280f - L(prep_tbl) + CONFIG_THUMB
+ .word 640f - L(prep_tbl) + CONFIG_THUMB
+ .word 320f - L(prep_tbl) + CONFIG_THUMB
+ .word 16f - L(prep_tbl) + CONFIG_THUMB
+ .word 80f - L(prep_tbl) + CONFIG_THUMB
+ .word 40f - L(prep_tbl) + CONFIG_THUMB
+
+40:
+ add r9, r1, r2
+ lsl r2, r2, #1
+4:
+ vld1.16 {d0}, [r1], r2
+ vld1.16 {d1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15
+ vsub.i16 q0, q0, q14
+ vst1.16 {q0}, [r0, :128]!
+ bgt 4b
+ pop {r4-r11,pc}
+80:
+ add r9, r1, r2
+ lsl r2, r2, #1
+8:
+ vld1.16 {q0}, [r1], r2
+ vld1.16 {q1}, [r9], r2
+ subs r4, r4, #2
+ vshl.s16 q0, q0, q15
+ vshl.s16 q1, q1, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ bgt 8b
+ pop {r4-r11,pc}
+16:
+ vld1.16 {q0, q1}, [r1], r2
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ subs r4, r4, #2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 16b
+ pop {r4-r11,pc}
+320:
+ sub r2, r2, #32
+32:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1], r2
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vshl.s16 q3, q3, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q3, q3, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ bgt 32b
+ pop {r4-r11,pc}
+640:
+ sub r2, r2, #96
+64:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 64b
+ pop {r4-r11,pc}
+1280:
+ sub r2, r2, #224
+128:
+ vld1.16 {q0, q1}, [r1]!
+ subs r4, r4, #1
+ vshl.s16 q0, q0, q15
+ vld1.16 {q2, q3}, [r1]!
+ vshl.s16 q1, q1, q15
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q2, q2, q15
+ vld1.16 {q10, q11}, [r1]!
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q0, q1}, [r1]!
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vld1.16 {q2, q3}, [r1]!
+ vsub.i16 q11, q11, q14
+ vshl.s16 q0, q0, q15
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q8, q9}, [r1]!
+ vshl.s16 q1, q1, q15
+ vshl.s16 q2, q2, q15
+ vst1.16 {q10, q11}, [r0, :128]!
+ vld1.16 {q10, q11}, [r1], r2
+ vshl.s16 q3, q3, q15
+ vshl.s16 q8, q8, q15
+ vshl.s16 q9, q9, q15
+ vshl.s16 q10, q10, q15
+ vshl.s16 q11, q11, q15
+ vsub.i16 q0, q0, q14
+ vsub.i16 q1, q1, q14
+ vsub.i16 q2, q2, q14
+ vsub.i16 q3, q3, q14
+ vsub.i16 q8, q8, q14
+ vst1.16 {q0, q1}, [r0, :128]!
+ vsub.i16 q9, q9, q14
+ vst1.16 {q2, q3}, [r0, :128]!
+ vsub.i16 q10, q10, q14
+ vst1.16 {q8, q9}, [r0, :128]!
+ vsub.i16 q11, q11, q14
+ vst1.16 {q10, q11}, [r0, :128]!
+ bgt 128b
+ pop {r4-r11,pc}
+endfunc
+
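+// Helper macros used by the 8tap filter paths below.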
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ vld1.\wd {\d0[]}, [\s0], \strd
+ vld1.\wd {\d1[]}, [\s1], \strd
+.ifnb \d2
+ vld1.\wd {\d2[]}, [\s0], \strd
+ vld1.\wd {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.\wd {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.\wd {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.\wd {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ vld1.16 {\d0}, [\s0], \strd
+ vld1.16 {\d1}, [\s1], \strd
+.ifnb \d2
+ vld1.16 {\d2}, [\s0], \strd
+ vld1.16 {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.16 {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+ vld1.16 {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+ vld1.16 {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5
+ vld1.16 {\d0, \d1}, [\s0], \strd
+.ifnb \d2
+ vld1.16 {\d2, \d3}, [\s1], \strd
+.endif
+.ifnb \d4
+ vld1.16 {\d4, \d5}, [\s0], \strd
+.endif
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5
+ load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+ vext.8 \r0, \r0, \r1, #4
+ vext.8 \r1, \r1, \r2, #4
+.ifnb \r3
+ vext.8 \r2, \r2, \r3, #4
+ vext.8 \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmin_u16 c, r0, r1, r2, r3
+ vmin.u16 \r0, \r0, \c
+.ifnb \r1
+ vmin.u16 \r1, \r1, \c
+.endif
+.ifnb \r2
+ vmin.u16 \r2, \r2, \c
+ vmin.u16 \r3, \r3, \c
+.endif
+.endm
+.macro vsub_i16 c, r0, r1, r2, r3
+ vsub.i16 \r0, \r0, \c
+.ifnb \r1
+ vsub.i16 \r1, \r1, \c
+.endif
+.ifnb \r2
+ vsub.i16 \r2, \r2, \c
+ vsub.i16 \r3, \r3, \c
+.endif
+.endm
+.macro vmull_vmlal_4 d, s0, s1, s2, s3
+ vmull.s16 \d, \s0, d0[0]
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+.endm
+.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ vmull.s16 \d, \s0, d0[0]
+ vmlal.s16 \d, \s1, d0[1]
+ vmlal.s16 \d, \s2, d0[2]
+ vmlal.s16 \d, \s3, d0[3]
+ vmlal.s16 \d, \s4, d1[0]
+ vmlal.s16 \d, \s5, d1[1]
+ vmlal.s16 \d, \s6, d1[2]
+ vmlal.s16 \d, \s7, d1[3]
+.endm
+.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3
+ vqrshrun.s32 \d0, \q0, #\shift
+.ifnb \q1
+ vqrshrun.s32 \d1, \q1, #\shift
+.endif
+.ifnb \q2
+ vqrshrun.s32 \d2, \q2, #\shift
+ vqrshrun.s32 \d3, \q3, #\shift
+.endif
+.endm
+.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3
+ vmovn.i32 \d0, \q0
+.ifnb \q1
+ vmovn.i32 \d1, \q1
+.endif
+.ifnb \q2
+ vmovn.i32 \d2, \q2
+ vmovn.i32 \d3, \q3
+.endif
+.endm
+.macro vrshl_s32 shift, r0, r1, r2, r3
+ vrshl.s32 \r0, \r0, \shift
+ vrshl.s32 \r1, \r1, \shift
+.ifnb \r2
+ vrshl.s32 \r2, \r2, \shift
+ vrshl.s32 \r3, \r3, \shift
+.endif
+.endm
+.macro vst1_32 strd, r0, r1
+ vst1.32 {\r0[0]}, [r0, :32], \strd
+ vst1.32 {\r0[1]}, [r9, :32], \strd
+.ifnb \r1
+ vst1.32 {\r1[0]}, [r0, :32], \strd
+ vst1.32 {\r1[1]}, [r9, :32], \strd
+.endif
+.endm
+.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+ vst1.16 {\r0}, [r0, \align], \strd
+ vst1.16 {\r1}, [r9, \align], \strd
+.ifnb \r2
+ vst1.16 {\r2}, [r0, \align], \strd
+ vst1.16 {\r3}, [r9, \align], \strd
+.endif
+.ifnb \r4
+ vst1.16 {\r4}, [r0, \align], \strd
+ vst1.16 {\r5}, [r9, \align], \strd
+ vst1.16 {\r6}, [r0, \align], \strd
+ vst1.16 {\r7}, [r9, \align], \strd
+.endif
+.endm
+.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3
+.ifc \type, put
+ vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ vmin_u16 q15, \q0, \q1
+.else
+ vrshl_s32 q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits)
+ vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+ vsub_i16 q15, \q0, \q1 // PREP_BIAS
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1_reg \strd, :64, \d0, \d1, \d2, \d3
+.endm
+.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1_reg \strd, :128, \q0, \q1
+.endm
+.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+ finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+ vst1.16 {\q0, \q1}, [r0, :128], \strd
+.endm
+
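+// Emits the exported entry point for one horizontal/vertical filter type
+// combination; it stores the two type codes in r9/r10 and tail-calls the
+// shared *_8tap_neon implementation.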
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ movw r9, \type_h
+ movw r10, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
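+// Shared implementation of the put_8tap_* and prep_8tap_* functions; the
+// macro arguments name the registers used for the dst/src pointers, strides,
+// size, subpel positions and bitdepth_max.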
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, r11
+ mul \my, \my, r11
+ add \mx, \mx, r9 // mx, 8tap_h, 4tap_h
+ add \my, \my, r10 // my, 8tap_v, 4tap_v
+
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+
+ vdup.16 q15, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ tst \mx, #(0x7f << 14)
+ sub r9, r9, #24
+ add lr, \bdmax, #6 // 6 + intermediate_bits
+ rsb r12, \bdmax, #6 // 6 - intermediate_bits
+ movrel r11, X(mc_subpel_filters), -8
+ bne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ bne L(\type\()_8tap_v)
+ b \type\()_neon
+
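+// Horizontal filtering path; branches on to the hv path below if a
+// vertical subpel filter is needed as well.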
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx r10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ it gt
+ movgt \mx, r10
+ tst \my, #(0x7f << 14)
+ add \mx, r11, \mx, lsl #3
+ bne L(\type\()_8tap_hv)
+
+ adr r10, L(\type\()_8tap_h_tbl)
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vdup.16 q13, \bdmax // intermediate_bits
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q13, q13 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_h_tbl):
+ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+2:
+ vld1.16 {q2}, [\src], \s_strd
+ vld1.16 {q3}, [\sr2], \s_strd
+ vext.8 d5, d4, d5, #2
+ vext.8 d7, d6, d7, #2
+ subs \h, \h, #2
+ vtrn.32 d4, d6
+ vtrn.32 d5, d7
+ vmull.s16 q1, d4, d0[0]
+ vmlal.s16 q1, d5, d0[1]
+ vmlal.s16 q1, d6, d0[2]
+ vmlal.s16 q1, d7, d0[3]
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vqmovun.s32 d2, q1
+ vrshl.s16 d2, d2, d26 // -intermediate_bits
+ vmin.u16 d2, d2, d30
+ vst1.32 {d2[0]}, [\dst, :32], \d_strd
+ vst1.32 {d2[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q11}, [\sr2], \s_strd
+ vext.8 d18, d16, d17, #2
+ vext.8 d19, d16, d17, #4
+ vext.8 d20, d16, d17, #6
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d21, d22, d23, #6
+ subs \h, \h, #2
+ vmull.s16 q2, d16, d0[0]
+ vmlal.s16 q2, d18, d0[1]
+ vmlal.s16 q2, d19, d0[2]
+ vmlal.s16 q2, d20, d0[3]
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d4, q2
+ vmovn.s32 d5, q3
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ vpush {q4-q5}
+ vld1.8 {d0}, [\mx, :64]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ sub \s_strd, \s_strd, \w, lsl #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+81:
+ vld1.16 {q8, q9}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ mov \mx, \w
+
+8:
+ vmull.s16 q1, d16, d0[0]
+ vmull.s16 q2, d17, d0[0]
+ vmull.s16 q3, d20, d0[0]
+ vmull.s16 q4, d21, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q8, q9, #(2*\i)
+ vext.8 q5, q10, q11, #(2*\i)
+.if \i < 4
+ vmlal.s16 q1, d24, d0[\i]
+ vmlal.s16 q2, d25, d0[\i]
+ vmlal.s16 q3, d10, d0[\i]
+ vmlal.s16 q4, d11, d0[\i]
+.else
+ vmlal.s16 q1, d24, d1[\i-4]
+ vmlal.s16 q2, d25, d1[\i-4]
+ vmlal.s16 q3, d10, d1[\i-4]
+ vmlal.s16 q4, d11, d1[\i-4]
+.endif
+.endr
+ subs \mx, \mx, #8
+ vrshl.s32 q1, q1, q14 // -(6-intermediate_bits)
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q4, q4, q14 // -(6-intermediate_bits)
+.ifc \type, put
+ vqmovun.s32 d2, q1
+ vqmovun.s32 d3, q2
+ vqmovun.s32 d4, q3
+ vqmovun.s32 d5, q4
+ vrshl.s16 q1, q1, q13 // -intermediate_bits
+ vrshl.s16 q2, q2, q13 // -intermediate_bits
+ vmin.u16 q1, q1, q15
+ vmin.u16 q2, q2, q15
+.else
+ vmovn.s32 d2, q1
+ vmovn.s32 d3, q2
+ vmovn.s32 d4, q3
+ vmovn.s32 d5, q4
+ vsub.i16 q1, q1, q13 // PREP_BIAS
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ vst1.16 {q1}, [\dst, :128]!
+ vst1.16 {q2}, [\ds2, :128]!
+ ble 9f
+
+ vmov q8, q9
+ vmov q10, q11
+ vld1.16 {q9}, [\src]!
+ vld1.16 {q11}, [\sr2]!
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 81b
+ vpop {q4-q5}
+ pop {r4-r11,pc}
+
+
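+// Vertical-only filtering path (no horizontal subpel filter).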
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+ add \my, r11, \my, lsl #3
+
+.ifc \type, prep
+ vdup.32 q14, r12 // 6 - intermediate_bits
+ vmov.i16 q15, #PREP_BIAS
+.endif
+ adr r10, L(\type\()_8tap_v_tbl)
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vneg.s32 q14, q14 // -(6-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_8tap_v_tbl):
+ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ bgt 28f
+
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ // 2x2 v
+ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ interleave_1_32 d1, d2, d3, d4, d5
+ bgt 24f
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vqrshrun_s32 6, q8, d16
+ vmin_u16 d30, d16
+ vst1_32 \d_strd, d16
+ pop {r4-r11,pc}
+
+24: // 2x4 v
+ load_32 \sr2, \src, \s_strd, d6, d7
+ interleave_1_32 d5, d6, d7
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d3, d4, d5, d6
+ vqrshrun_s32 6, q8, d16, q9, d17
+ vmin_u16 q15, q8
+ vst1_32 \d_strd, d16, d17
+ pop {r4-r11,pc}
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ vmovl.s8 q0, d0
+
+ load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16
+ interleave_1_32 d2, d3, d4, d5, d6
+ interleave_1_32 d6, d7, d16
+216:
+ subs \h, \h, #4
+ load_32 \sr2, \src, \s_strd, d17, d18, d19, d20
+ interleave_1_32 d16, d17, d18, d19, d20
+ vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
+ vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19
+ vqrshrun_s32 6, q13, d26, q1, d27
+ vmin_u16 q15, q13
+ vst1_32 \d_strd, d26, d27
+ ble 0f
+ cmp \h, #2
+ vmov q1, q3
+ vmov q2, q8
+ vmov q3, q9
+ vmov d16, d20
+ beq 26f
+ b 216b
+26:
+ load_32 \sr2, \src, \s_strd, d17, d18
+ interleave_1_32 d16, d17, d18
+ vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17
+ vqrshrun_s32 6, q13, d26
+ vmin_u16 d30, d26
+ vst1_32 \d_strd, d26
+0:
+ pop {r4-r11,pc}
+.endif
+
+40:
+ bgt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+ vmull_vmlal_4 q8, d1, d2, d3, d4
+ vmull_vmlal_4 q9, d2, d3, d4, d5
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+ ble 0f
+ load_reg \sr2, \src, \s_strd, d6, d7
+ vmull_vmlal_4 q8, d3, d4, d5, d6
+ vmull_vmlal_4 q9, d4, d5, d6, d7
+ shift_store_4 \type, \d_strd, q8, q9, d16, d17
+0:
+ pop {r4-r11,pc}
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ vld1.8 {d0}, [\my, :64]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22
+
+48:
+ subs \h, \h, #4
+ load_reg \sr2, \src, \s_strd, d23, d24, d25, d26
+ vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25
+ vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26
+ shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
+ ble 0f
+ cmp \h, #2
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov d22, d26
+ beq 46f
+ b 48b
+46:
+ load_reg \sr2, \src, \s_strd, d23, d24
+ vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23
+ vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24
+ shift_store_4 \type, \d_strd, q1, q2, d2, d3
+0:
+ pop {r4-r11,pc}
+
+80:
+ bgt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+
+ load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9
+ vmull_vmlal_4 q10, d2, d4, d6, d16
+ vmull_vmlal_4 q11, d3, d5, d7, d17
+ vmull_vmlal_4 q12, d4, d6, d16, d18
+ vmull_vmlal_4 q13, d5, d7, d17, d19
+ shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23
+ ble 0f
+ load_reg \sr2, \src, \s_strd, q10, q11
+ vmull_vmlal_4 q1, d6, d16, d18, d20
+ vmull_vmlal_4 q2, d7, d17, d19, d21
+ vmull_vmlal_4 q12, d16, d18, d20, d22
+ vmull_vmlal_4 q13, d17, d19, d21, d23
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5
+0:
+ pop {r4-r11,pc}
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\my, :64]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11
+
+88:
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q12, q13
+ vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24
+ vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25
+ vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26
+ vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27
+ shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, q1, q2
+ vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2
+ vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3
+ vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4
+ vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5
+ shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9
+ ble 9f
+ vmov q5, q9
+ vmov q6, q10
+ vmov q7, q11
+ vmov q8, q12
+ vmov q9, q13
+ vmov q10, q1
+ vmov q11, q2
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+160:
+ bgt 1680b
+
+ // 16x2, 16x4 v
+ vpush {q6-q7}
+ add \my, \my, #2
+ vld1.32 {d0[]}, [\my]
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+
+ load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11
+16:
+ load_16s16 \src, \src, \s_strd, q12, q13
+ subs \h, \h, #1
+ vmull_vmlal_4 q1, d12, d16, d20, d24
+ vmull_vmlal_4 q2, d13, d17, d21, d25
+ vmull_vmlal_4 q3, d14, d18, d22, d26
+ vmull_vmlal_4 q6, d15, d19, d23, d27
+ shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5
+ ble 0f
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ vmov q11, q13
+ b 16b
+0:
+ vpop {q6-q7}
+ pop {r4-r11,pc}
+
+
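+// Combined horizontal+vertical filtering: rows are first filtered
+// horizontally at intermediate precision (kept in registers) and then
+// filtered vertically.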
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx r10, \my, #7, #7
+ and \my, \my, #0x7f
+ it gt
+ movgt \my, r10
+4:
+ add \my, r11, \my, lsl #3
+
+ adr r10, L(\type\()_8tap_hv_tbl)
+ neg r12, r12 // -(6-intermediate_bits)
+ ldr r9, [r10, r9, lsl #2]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.ifc \type, put
+ neg r8, lr // -(6+intermediate_bits)
+.else
+ vmov.i16 q13, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vdup.32 q13, r8 // -(6+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_8tap_hv_tbl):
+ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 280f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d18, d17, d24, #4
+ vmull.s16 q2, d16, d2[0]
+ vmlal.s16 q2, d17, d2[1]
+ vmlal.s16 q2, d18, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vmin.u16 d4, d4, d30
+ subs \h, \h, #2
+ vst1.32 {d4[0]}, [\dst, :32], \d_strd
+ vst1.32 {d4[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18
+ vmov d17, d24
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vmull.s16 q11, d22, d0
+ vmull.s16 q12, d24, d0
+ vpadd.s32 d22, d22, d23
+ vpadd.s32 d23, d24, d25
+ vpadd.s32 d22, d22, d23
+ vrshl.s32 d16, d22, d28 // -(6-intermediate_bits)
+ vmovn.i32 d16, q8
+
+ bl L(\type\()_8tap_filter_2)
+
+ vext.8 d16, d16, d16, #4
+ vext.8 d16, d16, d24, #4
+ vmov d17, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d18, d17, d24, #4
+ vmov d19, d24
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d20, d19, d24, #4
+ vmov d21, d24
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ vext.8 d22, d21, d24, #4
+ vmull.s16 q3, d16, d2[0]
+ vmlal.s16 q3, d17, d2[1]
+ vmlal.s16 q3, d18, d2[2]
+ vmlal.s16 q3, d19, d2[3]
+ vmlal.s16 q3, d20, d3[0]
+ vmlal.s16 q3, d21, d3[1]
+ vmlal.s16 q3, d22, d3[2]
+ vmlal.s16 q3, d24, d3[3]
+
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d6, q3
+ vmin.u16 d6, d6, d30
+ subs \h, \h, #2
+ vst1.32 {d6[0]}, [\dst, :32], \d_strd
+ vst1.32 {d6[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov q8, q9
+ vmov q9, q10
+ vmov d20, d22
+ vmov d21, d24
+ b 28b
+0:
+ pop {r4-r11,pc}
+
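+// Horizontal 4-tap filter for two consecutive rows of 2 pixels; both rows
+// are returned packed in d24 (first row in the low half, second row in the
+// high half), at intermediate precision.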
+L(\type\()_8tap_filter_2):
+ vld1.16 {q11}, [\sr2], \s_strd
+ vld1.16 {q12}, [\src], \s_strd
+ vext.8 d23, d22, d23, #2
+ vext.8 d25, d24, d25, #2
+ vtrn.32 q11, q12
+ vmull.s16 q3, d22, d0[0]
+ vmlal.s16 q3, d23, d0[1]
+ vmlal.s16 q3, d24, d0[2]
+ vmlal.s16 q3, d25, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ bx lr
+.endif
+
+40:
+ add \mx, \mx, #2
+ vld1.32 {d0[]}, [\mx]
+ bgt 480f
+ add \my, \my, #2
+ vld1.32 {d2[]}, [\my]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ // 4x2, 4x4 hv
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d17, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d17, d2[0]
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q2, d19, d2[2]
+ vmlal.s16 q2, d24, d2[3]
+ vmull.s16 q3, d18, d2[0]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q3, d24, d2[2]
+ vmlal.s16 q3, d25, d2[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d17, d19
+ vmov q9, q12
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+480: // 4x8, 4x16, 4x32 hv
+ vpush {d13-d15}
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d13, q10
+
+ bl L(\type\()_8tap_filter_4)
+ vmov q7, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_4)
+ vmov q9, q12
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ vmull.s16 q2, d13, d2[0]
+ vmlal.s16 q2, d14, d2[1]
+ vmlal.s16 q2, d15, d2[2]
+ vmlal.s16 q2, d16, d2[3]
+ vmlal.s16 q2, d17, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q2, d19, d3[2]
+ vmlal.s16 q2, d24, d3[3]
+ vmull.s16 q3, d14, d2[0]
+ vmlal.s16 q3, d15, d2[1]
+ vmlal.s16 q3, d16, d2[2]
+ vmlal.s16 q3, d17, d2[3]
+ vmlal.s16 q3, d18, d3[0]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q3, d24, d3[2]
+ vmlal.s16 q3, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q13 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q15
+.else
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vsub.i16 q2, q2, q13 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {d4}, [\dst, :64], \d_strd
+ vst1.16 {d5}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d13, d15
+ vmov q7, q8
+ vmov q8, q9
+ vmov q9, q12
+ b 48b
+0:
+ vpop {d13-d15}
+ pop {r4-r11,pc}
+
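+// Horizontal 4-tap filter for two consecutive rows of 4 pixels; the results
+// are returned in d24 (first row) and d25 (second row).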
+L(\type\()_8tap_filter_4):
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d24, d20, d21, #2
+ vext.8 d25, d20, d21, #4
+ vext.8 d21, d20, d21, #6
+ vmull.s16 q3, d20, d0[0]
+ vmlal.s16 q3, d24, d0[1]
+ vmlal.s16 q3, d25, d0[2]
+ vmlal.s16 q3, d21, d0[3]
+ vext.8 d24, d22, d23, #2
+ vext.8 d25, d22, d23, #4
+ vext.8 d23, d22, d23, #6
+ vmull.s16 q10, d22, d0[0]
+ vmlal.s16 q10, d24, d0[1]
+ vmlal.s16 q10, d25, d0[2]
+ vmlal.s16 q10, d23, d0[3]
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vrshl.s32 q10, q10, q14 // -(6-intermediate_bits)
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q10
+ bx lr
+
+80:
+160:
+320:
+ bgt 880f
+ add \my, \my, #2
+ vld1.8 {d0}, [\mx, :64]
+ vld1.32 {d2[]}, [\my]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d16, q2
+ vmovn.i32 d17, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+8:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d16, d2[0]
+ vmull.s16 q3, d17, d2[0]
+ vmull.s16 q13, d18, d2[0]
+ vmull.s16 q14, d19, d2[0]
+.ifc \type, put
+ vdup.32 q8, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d18, d2[1]
+ vmlal.s16 q3, d19, d2[1]
+ vmlal.s16 q13, d20, d2[1]
+ vmlal.s16 q14, d21, d2[1]
+ vmlal.s16 q2, d20, d2[2]
+ vmlal.s16 q3, d21, d2[2]
+ vmlal.s16 q13, d22, d2[2]
+ vmlal.s16 q14, d23, d2[2]
+ vmlal.s16 q2, d22, d2[3]
+ vmlal.s16 q3, d23, d2[3]
+ vmlal.s16 q13, d24, d2[3]
+ vmlal.s16 q14, d25, d2[3]
+.ifc \type, put
+ vdup.16 q9, \bdmax // bitdepth_max
+ vrshl.s32 q2, q2, q8 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q8 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q8 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q8 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q9, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q9 // PREP_BIAS
+ vsub.i16 q3, q3, q9 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 8b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+0:
+ pop {r4-r11,pc}
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ vpush {q4-q7}
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ vmovl.s8 q0, d0
+ vmovl.s8 q1, d2
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ vld1.16 {q11, q12}, [\src], \s_strd
+ vmull.s16 q2, d22, d0[0]
+ vmull.s16 q3, d23, d0[0]
+ vdup.32 q14, r12 // -(6-intermediate_bits)
+.irpc i, 1234567
+ vext.8 q10, q11, q12, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d20, d0[\i]
+ vmlal.s16 q3, d21, d0[\i]
+.else
+ vmlal.s16 q2, d20, d1[\i - 4]
+ vmlal.s16 q3, d21, d1[\i - 4]
+.endif
+.endr
+ vrshl.s32 q2, q2, q14 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q14 // -(6-intermediate_bits)
+ vmovn.i32 d8, q2
+ vmovn.i32 d9, q3
+
+ bl L(\type\()_8tap_filter_8)
+ vmov q5, q11
+ vmov q6, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q7, q11
+ vmov q8, q12
+ bl L(\type\()_8tap_filter_8)
+ vmov q9, q11
+ vmov q10, q12
+
+88:
+ bl L(\type\()_8tap_filter_8)
+ vmull.s16 q2, d8, d2[0]
+ vmull.s16 q3, d9, d2[0]
+ vmull.s16 q13, d10, d2[0]
+ vmull.s16 q14, d11, d2[0]
+.ifc \type, put
+ vdup.32 q4, r8 // -(6+intermediate_bits)
+.endif
+ vmlal.s16 q2, d10, d2[1]
+ vmlal.s16 q3, d11, d2[1]
+ vmlal.s16 q13, d12, d2[1]
+ vmlal.s16 q14, d13, d2[1]
+ vmlal.s16 q2, d12, d2[2]
+ vmlal.s16 q3, d13, d2[2]
+ vmlal.s16 q13, d14, d2[2]
+ vmlal.s16 q14, d15, d2[2]
+ vmlal.s16 q2, d14, d2[3]
+ vmlal.s16 q3, d15, d2[3]
+ vmlal.s16 q13, d16, d2[3]
+ vmlal.s16 q14, d17, d2[3]
+ vmlal.s16 q2, d16, d3[0]
+ vmlal.s16 q3, d17, d3[0]
+ vmlal.s16 q13, d18, d3[0]
+ vmlal.s16 q14, d19, d3[0]
+ vmlal.s16 q2, d18, d3[1]
+ vmlal.s16 q3, d19, d3[1]
+ vmlal.s16 q13, d20, d3[1]
+ vmlal.s16 q14, d21, d3[1]
+ vmlal.s16 q2, d20, d3[2]
+ vmlal.s16 q3, d21, d3[2]
+ vmlal.s16 q13, d22, d3[2]
+ vmlal.s16 q14, d23, d3[2]
+ vmlal.s16 q2, d22, d3[3]
+ vmlal.s16 q3, d23, d3[3]
+ vmlal.s16 q13, d24, d3[3]
+ vmlal.s16 q14, d25, d3[3]
+.ifc \type, put
+ vrshl.s32 q2, q2, q4 // -(6+intermediate_bits)
+ vrshl.s32 q3, q3, q4 // -(6+intermediate_bits)
+ vrshl.s32 q13, q13, q4 // -(6+intermediate_bits)
+ vrshl.s32 q14, q14, q4 // -(6+intermediate_bits)
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vqmovun.s32 d6, q13
+ vqmovun.s32 d7, q14
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+.else
+ vmov.i16 q5, #PREP_BIAS
+ vrshrn.i32 d4, q2, #6
+ vrshrn.i32 d5, q3, #6
+ vrshrn.i32 d6, q13, #6
+ vrshrn.i32 d7, q14, #6
+ vsub.i16 q2, q2, q5 // PREP_BIAS
+ vsub.i16 q3, q3, q5 // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ vst1.16 {q2}, [\dst, :128], \d_strd
+ vst1.16 {q3}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q4, q6
+ vmov q5, q7
+ vmov q6, q8
+ vmov q7, q9
+ vmov q8, q10
+ vmov q9, q11
+ vmov q10, q12
+ b 88b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
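+// Horizontal 8-tap filter for two consecutive rows of 8 pixels; the results
+// are returned in q11 (first row) and q12 (second row).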
+L(\type\()_8tap_filter_8):
+ vld1.16 {q13, q14}, [\sr2], \s_strd
+ vmull.s16 q2, d26, d0[0]
+ vmull.s16 q3, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q2, d24, d0[\i]
+ vmlal.s16 q3, d25, d0[\i]
+.else
+ vmlal.s16 q2, d24, d1[\i - 4]
+ vmlal.s16 q3, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q12, r12 // -(6-intermediate_bits)
+ vld1.16 {q13, q14}, [\src], \s_strd
+ vrshl.s32 q2, q2, q12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q12 // -(6-intermediate_bits)
+ vmovn.i32 d4, q2
+ vmovn.i32 d5, q3
+
+ vmull.s16 q3, d26, d0[0]
+ vmull.s16 q11, d27, d0[0]
+.irpc i, 1234567
+ vext.8 q12, q13, q14, #(2*\i)
+.if \i < 4
+ vmlal.s16 q3, d24, d0[\i]
+ vmlal.s16 q11, d25, d0[\i]
+.else
+ vmlal.s16 q3, d24, d1[\i - 4]
+ vmlal.s16 q11, d25, d1[\i - 4]
+.endif
+.endr
+ vdup.32 q13, r12 // -(6-intermediate_bits)
+ vrshl.s32 q3, q3, q13 // -(6-intermediate_bits)
+ vrshl.s32 q11, q11, q13 // -(6-intermediate_bits)
+
+ vmovn.i32 d24, q3
+ vmovn.i32 d25, q11
+ vmov q11, q2
+ bx lr
+endfunc
+
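+// Bilinear put/prep: the horizontal weights are 16-mx and mx, the vertical
+// weights 16-my and my; depending on which of mx/my are nonzero this
+// dispatches to the h-only, v-only, hv or plain copy path.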
+function \type\()_bilin_16bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+.ifc \bdmax, r8
+ ldr r8, [sp, #52]
+.endif
+ vdup.16 q1, \mx
+ vdup.16 q3, \my
+ rsb r9, \mx, #16
+ rsb r10, \my, #16
+ vdup.16 q0, r9
+ vdup.16 q2, r10
+.ifc \type, prep
+ lsl \d_strd, \w, #1
+.endif
+ clz \bdmax, \bdmax // bitdepth_max
+ clz r9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ cmp \mx, #0
+ sub r9, r9, #24
+ rsb r11, \bdmax, #4 // 4 - intermediate_bits
+ add r12, \bdmax, #4 // 4 + intermediate_bits
+ bne L(\type\()_bilin_h)
+ cmp \my, #0
+ bne L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cmp \my, #0
+ bne L(\type\()_bilin_hv)
+
+ adr r10, L(\type\()_bilin_h_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.16 q14, \bdmax // intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s16 q14, q14 // -intermediate_bits
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_h_tbl):
+ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ vld1.16 {d16}, [\src], \s_strd
+ vld1.16 {d18}, [\sr2], \s_strd
+ vext.8 d17, d16, d16, #2
+ vext.8 d19, d18, d18, #2
+ vtrn.32 d16, d18
+ vtrn.32 d17, d19
+ subs \h, \h, #2
+ vmul.i16 d16, d16, d0
+ vmla.i16 d16, d17, d2
+ vrshl.u16 d16, d16, d30
+ vrshl.u16 d16, d16, d28
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ bgt 2b
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ vld1.16 {q8}, [\src], \s_strd
+ vld1.16 {q10}, [\sr2], \s_strd
+ vext.8 q9, q8, q8, #2
+ vext.8 q11, q10, q10, #2
+ vmov d17, d20
+ vmov d19, d22
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vrshl.u16 q8, q8, q15
+.ifc \type, put
+ vrshl.u16 q8, q8, q14
+.else
+ vsub.i16 q8, q8, q14
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ bgt 4b
+ pop {r4-r11,pc}
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ vld1.16 {d16, d17, d18}, [\src], \s_strd
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vext.8 q9, q8, q9, #2
+ vext.8 q11, q10, q11, #2
+ subs \h, \h, #2
+ vmul.i16 q8, q8, q0
+ vmla.i16 q8, q9, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q11, q1
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q10, q10, q15
+.ifc \type, put
+ vrshl.u16 q8, q8, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q8, q8, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q10}, [\ds2, :128], \d_strd
+ bgt 8b
+ pop {r4-r11,pc}
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ vpush {q4-q7}
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, lsl #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, lsl #1
+.endif
+161:
+ vld1.16 {q4}, [\src]!
+ vld1.16 {q9}, [\sr2]!
+ mov \mx, \w
+
+16:
+ vld1.16 {q5, q6}, [\src]!
+ vld1.16 {q10, q11}, [\sr2]!
+ vext.8 q7, q4, q5, #2
+ vext.8 q8, q5, q6, #2
+ vext.8 q12, q9, q10, #2
+ vext.8 q13, q10, q11, #2
+ vmul.i16 q4, q4, q0
+ vmla.i16 q4, q7, q1
+ vmul.i16 q5, q5, q0
+ vmla.i16 q5, q8, q1
+ vmul.i16 q9, q9, q0
+ vmla.i16 q9, q12, q1
+ vmul.i16 q10, q10, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q4, q4, q15
+ vrshl.u16 q5, q5, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ subs \mx, \mx, #16
+.ifc \type, put
+ vrshl.u16 q4, q4, q14
+ vrshl.u16 q5, q5, q14
+ vrshl.u16 q9, q9, q14
+ vrshl.u16 q10, q10, q14
+.else
+ vsub.i16 q4, q4, q14
+ vsub.i16 q5, q5, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+.endif
+ vst1.16 {q4, q5}, [\dst, :128]!
+ vst1.16 {q9, q10}, [\ds2, :128]!
+ ble 9f
+
+ vmov q4, q6
+ vmov q9, q11
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ bgt 161b
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr r10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+ vdup.16 q15, r11 // 4 - intermediate_bits
+.endif
+ ldr r9, [r10, r9, lsl #2]
+.ifc \type, prep
+ vmov.i16 q14, #PREP_BIAS
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.endif
+ add r10, r10, r9
+ bx r10
+
+ .align 2
+L(\type\()_bilin_v_tbl):
+ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ vld1.32 {d16[]}, [\src], \s_strd
+ bgt 24f
+22:
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vmul.i16 d16, d16, d4
+ vmla.i16 d16, d17, d6
+ vrshr.u16 d16, d16, #4
+ vst1.32 {d16[0]}, [\dst, :32]
+ vst1.32 {d16[1]}, [\ds2, :32]
+ pop {r4-r11,pc}
+24: // 2x4, 2x6, 2x8, ... v
+ vld1.32 {d17[]}, [\sr2], \s_strd
+ vld1.32 {d18[]}, [\src], \s_strd
+ vld1.32 {d19[]}, [\sr2], \s_strd
+ vld1.32 {d20[]}, [\src], \s_strd
+ subs \h, \h, #4
+ vext.8 d16, d16, d17, #4
+ vext.8 d17, d17, d18, #4
+ vext.8 d18, d18, d19, #4
+ vext.8 d19, d19, d20, #4
+ vswp d17, d18
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ cmp \h, #2
+ vrshr.u16 q8, q8, #4
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ vst1.32 {d17[0]}, [\dst, :32], \d_strd
+ vst1.32 {d17[1]}, [\ds2, :32], \d_strd
+ blt 0f
+ vmov d16, d20
+ beq 22b
+ b 24b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {d16}, [\src], \s_strd
+4:
+ vld1.16 {d17}, [\sr2], \s_strd
+ vld1.16 {d19}, [\src], \s_strd
+ vmov d18, d17
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vsub.i16 q8, q8, q14
+.endif
+ vst1.16 {d16}, [\dst, :64], \d_strd
+ vst1.16 {d17}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ vld1.16 {q8}, [\src], \s_strd
+8:
+ vld1.16 {q9}, [\sr2], \s_strd
+ vld1.16 {q10}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q9, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q10, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+.endif
+ vst1.16 {q8}, [\dst, :128], \d_strd
+ vst1.16 {q9}, [\ds2, :128], \d_strd
+ ble 0f
+ vmov q8, q10
+ b 8b
+0:
+ pop {r4-r11,pc}
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q8, q9}, [\src], \s_strd
+2:
+ vld1.16 {q10, q11}, [\sr2], \s_strd
+ vld1.16 {q12, q13}, [\src], \s_strd
+ vmul.i16 q8, q8, q2
+ vmla.i16 q8, q10, q3
+ vmul.i16 q9, q9, q2
+ vmla.i16 q9, q11, q3
+ vmul.i16 q10, q10, q2
+ vmla.i16 q10, q12, q3
+ vmul.i16 q11, q11, q2
+ vmla.i16 q11, q13, q3
+ subs \h, \h, #2
+.ifc \type, put
+ vrshr.u16 q8, q8, #4
+ vrshr.u16 q9, q9, #4
+ vrshr.u16 q10, q10, #4
+ vrshr.u16 q11, q11, #4
+.else
+ vrshl.u16 q8, q8, q15
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+ vrshl.u16 q11, q11, q15
+ vsub.i16 q8, q8, q14
+ vsub.i16 q9, q9, q14
+ vsub.i16 q10, q10, q14
+ vsub.i16 q11, q11, q14
+.endif
+ vst1.16 {q8, q9}, [\dst, :128], \d_strd
+ vst1.16 {q10, q11}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q12
+ vmov q9, q13
+ b 2b
+9:
+ subs \w, \w, #16
+ ble 0f
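+ // Advance to the next 16-pixel wide column strip: halve the strides
+ // back, rewind src/dst to the top row and step 32 bytes to the right.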
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #32
+ add \dst, \dst, #32
+ b 1b
+0:
+ pop {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+ adr r10, L(\type\()_bilin_hv_tbl)
+ vdup.16 q15, r11 // 4 - intermediate_bits
+ ldr r9, [r10, r9, lsl #2]
+ vneg.s16 q15, q15 // -(4-intermediate_bits)
+.ifc \type, put
+ vdup.32 q14, r12 // 4 + intermediate_bits
+.else
+ vmov.i16 q14, #PREP_BIAS
+.endif
+ add r10, r10, r9
+.ifc \type, put
+ vneg.s32 q14, q14 // -(4+intermediate_bits)
+.endif
+ bx r10
+
+ .align 2
+L(\type\()_bilin_hv_tbl):
+ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vmul.i16 d16, d20, d0
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+ vext.8 d16, d16, d16, #4
+
+2:
+ vld1.16 {d20}, [\sr2], \s_strd
+ vld1.16 {d22}, [\src], \s_strd
+ vext.8 d21, d20, d20, #2
+ vext.8 d23, d22, d22, #2
+ vtrn.32 d20, d22
+ vtrn.32 d21, d23
+ vmul.i16 d18, d20, d0
+ vmla.i16 d18, d21, d2
+ vrshl.u16 d18, d18, d30
+
+ vext.8 d16, d16, d18, #4
+
+ vmull.u16 q8, d16, d4
+ vmlal.u16 q8, d18, d6
+ vrshl.u32 q8, q8, q14
+ vmovn.i32 d16, q8
+ subs \h, \h, #2
+ vst1.32 {d16[0]}, [\dst, :32], \d_strd
+ vst1.32 {d16[1]}, [\ds2, :32], \d_strd
+ ble 0f
+ vmov d16, d18
+ b 2b
+0:
+ pop {r4-r11,pc}
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {q10}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vmul.i16 d16, d20, d0
+ vmla.i16 d16, d21, d2
+ vrshl.u16 d16, d16, d30
+
+4:
+ vld1.16 {q10}, [\sr2], \s_strd
+ vld1.16 {q11}, [\src], \s_strd
+ vext.8 d21, d20, d21, #2
+ vext.8 d23, d22, d23, #2
+ vswp d21, d22
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vrshl.u16 q9, q9, q15
+
+ vmull.u16 q10, d16, d4
+ vmlal.u16 q10, d18, d6
+ vmull.u16 q11, d18, d4
+ vmlal.u16 q11, d19, d6
+.ifc \type, put
+ vrshl.u32 q10, q10, q14
+ vrshl.u32 q11, q11, q14
+ vmovn.i32 d20, q10
+ vmovn.i32 d21, q11
+.else
+ vrshrn.i32 d20, q10, #4
+ vrshrn.i32 d21, q11, #4
+ vsub.i16 q10, q10, q14
+.endif
+ subs \h, \h, #2
+ vst1.16 {d20}, [\dst, :64], \d_strd
+ vst1.16 {d21}, [\ds2, :64], \d_strd
+ ble 0f
+ vmov d16, d19
+ b 4b
+0:
+ pop {r4-r11,pc}
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ vld1.16 {d20, d21, d22}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vmul.i16 q8, q10, q0
+ vmla.i16 q8, q11, q1
+ vrshl.u16 q8, q8, q15
+
+2:
+ vld1.16 {d20, d21, d22}, [\sr2], \s_strd
+ vld1.16 {d24, d25, d26}, [\src], \s_strd
+ vext.8 q11, q10, q11, #2
+ vext.8 q13, q12, q13, #2
+ vmul.i16 q9, q10, q0
+ vmla.i16 q9, q11, q1
+ vmul.i16 q10, q12, q0
+ vmla.i16 q10, q13, q1
+ vrshl.u16 q9, q9, q15
+ vrshl.u16 q10, q10, q15
+
+ vmull.u16 q11, d16, d4
+ vmlal.u16 q11, d18, d6
+ vmull.u16 q12, d17, d4
+ vmlal.u16 q12, d19, d6
+ vmull.u16 q8, d18, d4
+ vmlal.u16 q8, d20, d6
+ vmull.u16 q9, d19, d4
+ vmlal.u16 q9, d21, d6
+.ifc \type, put
+ vrshl.u32 q11, q11, q14
+ vrshl.u32 q12, q12, q14
+ vrshl.u32 q8, q8, q14
+ vrshl.u32 q9, q9, q14
+ vmovn.i32 d22, q11
+ vmovn.i32 d23, q12
+ vmovn.i32 d16, q8
+ vmovn.i32 d17, q9
+.else
+ vrshrn.i32 d22, q11, #4
+ vrshrn.i32 d23, q12, #4
+ vrshrn.i32 d16, q8, #4
+ vrshrn.i32 d17, q9, #4
+ vsub.i16 q11, q11, q14
+ vsub.i16 q8, q8, q14
+.endif
+ subs \h, \h, #2
+ vst1.16 {q11}, [\dst, :128], \d_strd
+ vst1.16 {q8}, [\ds2, :128], \d_strd
+ ble 9f
+ vmov q8, q10
+ b 2b
+9:
+ subs \w, \w, #8
+ ble 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ mls \src, \s_strd, \my, \src
+ mls \dst, \d_strd, \my, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 1b
+0:
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
+
+.macro load_filter_ptr src
+ asr r12, \src, #10
+ add r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+ add \src, \src, \inc
+ vld1.8 {\dst}, [r12, :64]
+.endm
+
+.macro load_filter_row dst, src, inc
+ load_filter_ptr \src
+ load_filter_coef \dst, \src, \inc
+.endm
+
+function warp_filter_horz_neon
+ load_filter_ptr r5 // filter 0
+ vld1.16 {q6,q7}, [r2], r3
+
+ load_filter_coef d0, r5, r7 // filter 0
+ load_filter_row d2, r5, r7 // filter 1
+ vmovl.s8 q0, d0 // filter 0
+ vext.8 q3, q6, q7, #2*1 // filter 1 pixels
+ vmovl.s8 q1, d2 // filter 1
+
+ vmull.s16 q4, d12, d0 // filter 0 output (0-3)
+ vmull.s16 q5, d13, d1 // filter 0 output (4-7)
+
+ load_filter_ptr r5 // filter 2
+
+ vmull.s16 q2, d6, d2 // filter 1 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 1 output (4-7)
+
+ load_filter_coef d0, r5, r7 // filter 2
+
+ vpadd.i32 d8, d8, d9 // half pixel 0 (2x32)
+ vpadd.i32 d9, d10, d11 // half pixel 0 (2x32)
+
+ load_filter_ptr r5 // filter 3
+
+ vpadd.i32 d4, d4, d5 // half pixel 1 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 1 (2x32)
+
+ vmovl.s8 q0, d0 // filter 2
+ vext.8 q3, q6, q7, #2*2 // filter 2 pixels
+
+ vpadd.i32 d8, d8, d9 // pixel 0 (2x32)
+ vpadd.i32 d9, d4, d5 // pixel 1 (2x32)
+
+ load_filter_coef d2, r5, r7 // filter 3
+
+ vmull.s16 q2, d6, d0 // filter 2 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 2 output (4-7)
+
+ load_filter_ptr r5 // filter 4
+
+ vpadd.i32 d8, d8, d9 // pixel 0,1
+
+ vpadd.i32 d9, d4, d5 // half pixel 2 (2x32)
+ vpadd.i32 d10, d6, d7 // half pixel 2 (2x32)
+
+ vmovl.s8 q1, d2 // filter 3
+ vext.8 q3, q6, q7, #2*3 // filter 3 pixels
+
+ load_filter_coef d0, r5, r7 // filter 4
+
+ vpadd.i32 d9, d9, d10 // pixel 2 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 3 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 3 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 4
+ load_filter_ptr r5 // filter 5
+
+ vpadd.i32 d10, d4, d5 // half pixel 3 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 3 (2x32)
+
+ vext.8 q3, q6, q7, #2*4 // filter 4 pixels
+ load_filter_coef d2, r5, r7 // filter 5
+
+ vpadd.i32 d10, d10, d11 // pixel 3 (2x32)
+
+ vpadd.i32 d9, d9, d10 // pixel 2,3
+
+ vmull.s16 q2, d6, d0 // filter 4 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 4 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 5
+ load_filter_ptr r5 // filter 6
+
+ vpadd.i32 d10, d4, d5 // half pixel 4 (2x32)
+ vpadd.i32 d11, d6, d7 // half pixel 4 (2x32)
+
+ vext.8 q3, q6, q7, #2*5 // filter 5 pixels
+ load_filter_coef d0, r5, r7 // filter 6
+
+ vpadd.i32 d10, d10, d11 // pixel 4 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 5 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 5 output (4-7)
+
+ vmovl.s8 q0, d0 // filter 6
+ load_filter_ptr r5 // filter 7
+
+ vpadd.i32 d4, d4, d5 // half pixel 5 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 5 (2x32)
+
+ vext.8 q3, q6, q7, #2*6 // filter 6 pixels
+ load_filter_coef d2, r5, r7 // filter 7
+
+ vpadd.i32 d11, d4, d5 // pixel 5 (2x32)
+
+ vmull.s16 q2, d6, d0 // filter 6 output (0-3)
+ vmull.s16 q3, d7, d1 // filter 6 output (4-7)
+
+ vmovl.s8 q1, d2 // filter 7
+
+ vpadd.i32 d10, d10, d11 // pixel 4,5
+
+ vpadd.i32 d4, d4, d5 // half pixel 6 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 6 (2x32)
+
+ vext.8 q3, q6, q7, #2*7 // filter 7 pixels
+
+ vpadd.i32 d11, d4, d5 // pixel 6 (2x32)
+
+ vmull.s16 q2, d6, d2 // filter 7 output (0-3)
+ vmull.s16 q3, d7, d3 // filter 7 output (4-7)
+
+ vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits)
+
+ vpadd.i32 d4, d4, d5 // half pixel 7 (2x32)
+ vpadd.i32 d5, d6, d7 // half pixel 7 (2x32)
+
+ sub r5, r5, r7, lsl #3
+
+ vpadd.i32 d4, d4, d5 // pixel 7 (2x32)
+
+ add r5, r5, r8
+
+ vpadd.i32 d11, d11, d4 // pixel 6,7
+
+ vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits)
+ vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits)
+
+ bx lr
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my,
+// const int bitdepth_max)
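+//
+// Rough outline of the code below: every output pixel in a row uses its own
+// 8-tap filter from mc_warp_filter, indexed by (position >> 10) (the +64 bias
+// is folded into the table base pointer). The horizontal position advances by
+// abcd[0] per pixel and abcd[1] per row; the vertical position advances by
+// abcd[2] per pixel and abcd[3] per row over the eight intermediate rows.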
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+ push {r4-r11,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #100]
+ ldrd r6, r7, [sp, #108]
+ sub sp, sp, #8
+
+ clz r7, r7
+ // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+ sub r8, r7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+ sub r7, r7, #25 // -(7 - intermediate_bits)
+.ifb \t
+ neg r8, r8 // -(7 + intermediate_bits)
+.endif
+ str r7, [sp] // spill -(7 - intermediate_bits) on stack
+.ifb \t
+ str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack
+.endif
+
+ ldrd r8, r9, [r4]
+ sxth r7, r8
+ asr r8, r8, #16
+ asr r4, r9, #16
+ sxth r9, r9
+ mov r10, #8
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ sub r2, r2, #6
+ movrel r11, X(mc_warp_filter), 64*8
+.ifnb \t
+ lsl r1, r1, #1
+.endif
+ add r5, r5, #512
+ add r6, r6, #512
+
+ bl warp_filter_horz_neon
+ vmovn.i32 d16, q4
+ vmovn.i32 d17, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d18, q4
+ vmovn.i32 d19, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d20, q4
+ vmovn.i32 d21, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d22, q4
+ vmovn.i32 d23, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d24, q4
+ vmovn.i32 d25, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d26, q4
+ vmovn.i32 d27, q5
+ bl warp_filter_horz_neon
+ vmovn.i32 d28, q4
+ vmovn.i32 d29, q5
+
+1:
+ bl warp_filter_horz_neon
+ vmovn.i32 d30, q4
+ vmovn.i32 d31, q5
+
+ load_filter_row d8, r6, r9
+ load_filter_row d9, r6, r9
+ load_filter_row d10, r6, r9
+ load_filter_row d11, r6, r9
+ load_filter_row d12, r6, r9
+ load_filter_row d13, r6, r9
+ load_filter_row d14, r6, r9
+ load_filter_row d15, r6, r9
+ transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
+ vmovl.s8 q1, d8
+ vmovl.s8 q2, d9
+ vmovl.s8 q3, d10
+ vmovl.s8 q4, d11
+ vmovl.s8 q5, d12
+ vmovl.s8 q6, d13
+
+ sub r6, r6, r9, lsl #3
+
+ // This ordering of vmull/vmlal is highly beneficial for
+ // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+ vmull.s16 q0, d16, d2
+ vmlal.s16 q0, d18, d4
+ vmlal.s16 q0, d20, d6
+ vmlal.s16 q0, d22, d8
+ vmlal.s16 q0, d24, d10
+ vmlal.s16 q0, d26, d12
+ vmull.s16 q1, d17, d3
+ vmlal.s16 q1, d19, d5
+ vmlal.s16 q1, d21, d7
+ vmlal.s16 q1, d23, d9
+ vmlal.s16 q1, d25, d11
+ vmlal.s16 q1, d27, d13
+
+ vmovl.s8 q2, d14
+ vmovl.s8 q3, d15
+
+ vmlal.s16 q0, d28, d4
+ vmlal.s16 q0, d30, d6
+ vmlal.s16 q1, d29, d5
+ vmlal.s16 q1, d31, d7
+
+.ifb \t
+ ldr lr, [sp, #4] // -(7 + intermediate_bits)
+ ldr r12, [sp, #120] // bitdepth_max
+ vdup.32 q2, lr // -(7 + intermediate_bits)
+ vdup.16 q3, r12 // bitdepth_max
+.endif
+
+ vmov q8, q9
+ vmov q9, q10
+.ifb \t
+ vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits)
+ vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits)
+.else
+ vrshrn.s32 d0, q0, #7
+ vrshrn.s32 d1, q1, #7
+ vmov.i16 q3, #PREP_BIAS
+.endif
+ vmov q10, q11
+.ifb \t
+ vqmovun.s32 d0, q0
+ vqmovun.s32 d1, q1
+.else
+ vsub.i16 q0, q0, q3 // PREP_BIAS
+.endif
+ vmov q11, q12
+ vmov q12, q13
+.ifb \t
+ vmin.u16 q0, q0, q3 // bitdepth_max
+.endif
+ vmov q13, q14
+ vmov q14, q15
+ subs r10, r10, #1
+ vst1.16 {q0}, [r0, :128], r1
+
+ add r6, r6, r4
+ bgt 1b
+
+ add sp, sp, #8
+ vpop {q4-q7}
+ pop {r4-r11,pc}
+endfunc
+.endm
+
+warp
+warp t
+
+// void dav1d_emu_edge_16bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
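+//
+// Outline (the clipping math is spelled out in the comments below): clamp
+// (x, y) into the iw x ih reference, copy the overlapping centre region into
+// dst, then replicate the outermost columns into the left/right extensions
+// and the outermost rows into the top/bottom extensions of the bw x bh block.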
+function emu_edge_16bpc_neon, export=1
+ push {r4-r11,lr}
+ ldrd r4, r5, [sp, #36]
+ ldrd r6, r7, [sp, #44]
+ ldrd r8, r9, [sp, #52]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub r12, r3, #1 // ih - 1
+ cmp r5, r3
+ sub lr, r2, #1 // iw - 1
+ it lt
+ movlt r12, r5 // min(y, ih - 1)
+ cmp r4, r2
+ bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+ it lt
+ movlt lr, r4 // min(x, iw - 1)
+ bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0)
+ mla r8, r12, r9, r8 // ref += iclip() * stride
+ add r8, r8, lr, lsl #1 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add r10, r5, r1 // y + bh
+ neg r5, r5 // -y
+ sub r10, r10, r3 // y + bh - ih
+ sub r12, r1, #1 // bh - 1
+ cmp r10, r1
+ bic r5, r5, r5, asr #31 // max(-y, 0)
+ it ge
+ movge r10, r12 // min(y + bh - ih, bh-1)
+ cmp r5, r1
+ bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+ it ge
+ movge r5, r12 // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add r11, r4, r0 // x + bw
+ neg r4, r4 // -x
+ sub r11, r11, r2 // x + bw - iw
+ sub lr, r0, #1 // bw - 1
+ cmp r11, r0
+ bic r4, r4, r4, asr #31 // max(-x, 0)
+ it ge
+ movge r11, lr // min(x + bw - iw, bw-1)
+ cmp r4, r0
+ bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+ it ge
+ movge r4, lr // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub r1, r1, r5 // bh - top_ext
+ mla r6, r5, r7, r6
+ sub r2, r0, r4 // bw - left_ext
+ sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext
+ sub r2, r2, r11 // center_w = bw - left_ext - right_ext
+
+ mov r0, r6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ vld1.16 {d0[], d1[]}, [r8]
+ mov r12, r6 // out = dst
+ mov r3, r4
+ vmov q1, q0
+1:
+ subs r3, r3, #16
+ vst1.16 {q0, q1}, [r12, :128]!
+ bgt 1b
+.endif
+ mov lr, r8
+ add r12, r6, r4, lsl #1 // out = dst + left_ext
+ mov r3, r2
+1:
+ vld1.16 {q0, q1}, [lr]!
+ subs r3, r3, #32
+ vld1.16 {q2, q3}, [lr]!
+.if \need_left
+ vst1.16 {q0, q1}, [r12]!
+ vst1.16 {q2, q3}, [r12]!
+.else
+ vst1.16 {q0, q1}, [r12, :128]!
+ vst1.16 {q2, q3}, [r12, :128]!
+.endif
+ bgt 1b
+.if \need_right
+ add r3, r8, r2, lsl #1 // in + center_w
+ sub r3, r3, #2 // in + center_w - 1
+ add r12, r6, r4, lsl #1 // dst + left_ext
+ vld1.16 {d0[], d1[]}, [r3]
+ add r12, r12, r2, lsl #1 // out = dst + left_ext + center_w
+ mov r3, r11
+ vmov q1, q0
+1:
+ subs r3, r3, #16
+ vst1.16 {q0, q1}, [r12]!
+ bgt 1b
+.endif
+
+ subs r1, r1, #1 // center_h--
+ add r6, r6, r7
+ add r8, r8, r9
+ bgt 0b
+.endm
+
+ cmp r4, #0
+ beq 2f
+ // need_left
+ cmp r11, #0
+ beq 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cmp r11, #0
+ beq 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+ cmp r10, #0
+ // Storing the original dst in r0 overwrote bw, recalculate it here
+ add r2, r2, r4 // center_w + left_ext
+ add r2, r2, r11 // bw = center_w + left_ext + right_ext
+
+ beq 3f
+ // need_bottom
+ sub r8, r6, r7 // ref = dst - stride
+ mov r4, r2
+ sub r12, r7, #32
+1:
+ vld1.16 {q0, q1}, [r8, :128]!
+ mov r3, r10
+ vld1.16 {q2, q3}, [r8, :128]!
+2:
+ vst1.16 {q0, q1}, [r6, :128]!
+ subs r3, r3, #1
+ vst1.16 {q2, q3}, [r6, :128], r12
+ bgt 2b
+ mls r6, r7, r10, r6 // dst -= bottom_ext * stride
+ subs r4, r4, #32 // bw -= 32
+ add r6, r6, #64 // dst += 32
+ bgt 1b
+
+3:
+ cmp r5, #0
+ beq 3f
+ // need_top
+ mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride
+ sub r12, r7, #32
+1:
+ vld1.16 {q0, q1}, [r0, :128]!
+ mov r3, r5
+ vld1.16 {q2, q3}, [r0, :128]!
+2:
+ vst1.16 {q0, q1}, [r6, :128]!
+ subs r3, r3, #1
+ vst1.16 {q2, q3}, [r6, :128], r12
+ bgt 2b
+ mls r6, r7, r5, r6 // dst -= top_ext * stride
+ subs r2, r2, #32 // bw -= 32
+ add r6, r6, #64 // dst += 32
+ bgt 1b
+
+3:
+ pop {r4-r11,pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/msac.S b/third_party/dav1d/src/arm/32/msac.S
new file mode 100644
index 0000000000..b06e109dda
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/msac.S
@@ -0,0 +1,575 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 4
+#define DIF 8
+#define RNG 12
+#define CNT 16
+#define ALLOW_UPDATE_CDF 20
+
+const coeffs
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+ .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits, align=4
+ .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+ .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
+.macro vld1_align_n d0, q0, q1, src, n
+.if \n == 4
+ vld1.16 {\d0}, [\src, :64]
+.elseif \n == 8
+ vld1.16 {\q0}, [\src, :128]
+.else
+ vld1.16 {\q0, \q1}, [\src, :128]
+.endif
+.endm
+
+.macro vld1_n d0, q0, q1, src, n
+.if \n == 4
+ vld1.16 {\d0}, [\src]
+.elseif \n == 8
+ vld1.16 {\q0}, [\src]
+.else
+ vld1.16 {\q0, \q1}, [\src]
+.endif
+.endm
+
+.macro vst1_align_n d0, q0, q1, src, n
+.if \n == 4
+ vst1.16 {\d0}, [\src, :64]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src, :128]
+.else
+ vst1.16 {\q0, \q1}, [\src, :128]
+.endif
+.endm
+
+.macro vst1_n d0, q0, q1, src, n
+.if \n == 4
+ vst1.16 {\d0}, [\src]
+.elseif \n == 8
+ vst1.16 {\q0}, [\src]
+.else
+ vst1.16 {\q0, \q1}, [\src]
+.endif
+.endm
+
+.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vshr.u16 \d0, \s0, \s3
+.else
+ vshr.u16 \d1, \s1, \s4
+.if \n == 16
+ vshr.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vadd.i16 \d0, \s0, \s3
+.else
+ vadd.i16 \d1, \s1, \s4
+.if \n == 16
+ vadd.i16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vsub.i16 \d0, \s0, \s3
+.else
+ vsub.i16 \d1, \s1, \s4
+.if \n == 16
+ vsub.i16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vand \d0, \s0, \s3
+.else
+ vand \d1, \s1, \s4
+.if \n == 16
+ vand \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vcge.u16 \d0, \s0, \s3
+.else
+ vcge.u16 \d1, \s1, \s4
+.if \n == 16
+ vcge.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vrhadd.u16 \d0, \s0, \s3
+.else
+ vrhadd.u16 \d1, \s1, \s4
+.if \n == 16
+ vrhadd.u16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vshl.s16 \d0, \s0, \s3
+.else
+ vshl.s16 \d1, \s1, \s4
+.if \n == 16
+ vshl.s16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+ vqdmulh.s16 \d0, \s0, \s3
+.else
+ vqdmulh.s16 \d1, \s1, \s4
+.if \n == 16
+ vqdmulh.s16 \d2, \s2, \s5
+.endif
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+// size_t n_symbols);
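+//
+// Loose sketch of what the SIMD below computes (see the inline comments):
+//   v[i] = ((cdf[i] >> EC_PROB_SHIFT) * r >> 1) + EC_MIN_PROB * (n_symbols - i)
+// The decoded symbol is found by comparing the top 16 bits of dif against the
+// v[] values; rng/dif are then renormalized in L(renorm), and cdf[] is adapted
+// toward the decoded symbol when s->allow_update_cdf is set.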
+
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update n
+ push {r4-r10,lr}
+ sub sp, sp, #48
+ add r8, r0, #RNG
+
+ vld1_align_n d0, q0, q1, r1, \n // cdf
+ vld1.16 {d16[]}, [r8, :16] // rng
+ movrel_local r9, coeffs, 30
+ vmov.i16 d30, #0x7f00 // 0x7f00
+ sub r9, r9, r2, lsl #1
+ vmvn.i16 q14, #0x3f // 0xffc0
+ add r8, sp, #14
+ vand d22, d16, d30 // rng & 0x7f00
+ vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng
+ vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0
+.if \n > 4
+ vmov d23, d22
+.endif
+
+ vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret)
+ vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r8, r0, #DIF + 2
+
+ vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+.if \n == 4
+ vmov.i16 d17, #0
+.endif
+ vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+ add r9, sp, #16
+ vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16)
+ movrel_local r8, bits
+ vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access
+
+ vmov d21, d20
+ vld1_align_n q12, q12, q13, r8, \n
+.if \n == 16
+ vmov q11, q10
+.endif
+
+ vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v
+
+ vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask
+.if \n == 16
+ vadd.i16 q10, q10, q11
+.endif
+ vadd.i16 d20, d20, d21 // Aggregate mask bits
+ ldr r4, [r0, #ALLOW_UPDATE_CDF]
+ vpadd.i16 d20, d20, d20
+ lsl r10, r2, #1
+ vpadd.i16 d20, d20, d20
+ vmov.u16 r3, d20[0]
+ cmp r4, #0
+ rbit r3, r3
+ clz lr, r3 // ret
+
+ beq L(renorm)
+ // update_cdf
+ ldrh r3, [r1, r10] // count = cdf[n_symbols]
+ vmov.i8 q10, #0xff
+.if \n == 16
+ mov r4, #-5
+.else
+ mvn r12, r2
+ mov r4, #-4
+ cmn r12, #3 // set C if n_symbols <= 2
+.endif
+ vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768
+.if \n == 16
+ sub r4, r4, r3, lsr #4 // -((count >> 4) + 5)
+.else
+ lsr r12, r3, #4 // count >> 4
+ sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+ vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i])
+.if \n == 4
+ vdup.16 d20, r4 // -rate
+.else
+ vdup.16 q10, r4 // -rate
+.endif
+
+ sub r3, r3, r3, lsr #5 // count - (count == 32)
+ vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 1 : 0)
+ vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
+ add r3, r3, #1 // count + (count < 32)
+ vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate
+ vst1_align_n d0, q0, q1, r1, \n
+ strh r3, [r1, r10]
+.endm
+
+ decode_update 4
+
+L(renorm):
+ add r8, sp, #16
+ add r8, r8, lr, lsl #1
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ ldr r6, [r0, #CNT]
+ ldr r7, [r0, #DIF]
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+L(renorm2):
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ mvn r7, r7 // ~dif
+ bhs 9f
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ rsb r5, r6, #8 // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+ rsb r6, r5, #8 // cnt = 8 - c
+
+9:
+ str r6, [r0, #CNT]
+ str r7, [r0, #DIF]
+
+ mov r0, lr
+ add sp, sp, #48
+
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+ decode_update 8
+ b L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+ decode_update 16
+ b L(renorm)
+endfunc
+
+function msac_decode_hi_tok_neon, export=1
+ push {r4-r10,lr}
+ vld1.16 {d0}, [r1, :64] // cdf
+ add r4, r0, #RNG
+ vmov.i16 d31, #0x7f00 // 0x7f00
+ movrel_local r5, coeffs, 30-2*3
+ vmvn.i16 d30, #0x3f // 0xffc0
+ ldrh r9, [r1, #6] // count = cdf[n_symbols]
+ vld1.16 {d1[]}, [r4, :16] // rng
+ movrel_local r4, bits
+ vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret)
+ add r5, r0, #DIF + 2
+ vld1.16 {q8}, [r4, :128]
+ mov r2, #-24
+ vand d20, d0, d30 // cdf & 0xffc0
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48
+ ldr r6, [r0, #CNT]
+ ldr r7, [r0, #DIF]
+ vmov d3, d2
+1:
+ vand d23, d1, d31 // rng & 0x7f00
+ vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add r12, sp, #14
+ vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ vmov.i16 d7, #0
+ vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng
+ add r12, sp, #16
+ vcge.u16 q2, q1, q3 // c >= v
+ vst1.16 {q3}, [r12] // store v values to allow indexed access
+ vand q9, q2, q8 // One bit per halfword set in the mask
+
+ vadd.i16 d18, d18, d19 // Aggregate mask bits
+ vpadd.i16 d18, d18, d18
+ vpadd.i16 d18, d18, d18
+ vmov.u16 r3, d18[0]
+ cmp r10, #0
+ add r2, r2, #5
+ rbit r3, r3
+ add r8, sp, #16
+ clz lr, r3 // ret
+
+ beq 2f
+ // update_cdf
+ vmov.i8 d22, #0xff
+ mov r4, #-5
+ vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768
+ sub r4, r4, r9, lsr #4 // -((count >> 4) + 5)
+ vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i])
+ vdup.16 d18, r4 // -rate
+
+ sub r9, r9, r9, lsr #5 // count - (count == 32)
+ vsub.i16 d0, d0, d4 // cdf + (i >= val ? 1 : 0)
+ vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate
+ add r9, r9, #1 // count + (count < 32)
+ vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate
+ vst1.16 {d0}, [r1, :64]
+ vand d20, d0, d30 // cdf & 0xffc0
+ strh r9, [r1, #6]
+
+2:
+ add r8, r8, lr, lsl #1
+ ldrh r3, [r8] // v
+ ldrh r4, [r8, #-2] // u
+ sub r4, r4, r3 // rng = u - v
+ clz r5, r4 // clz(rng)
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mvn r7, r7 // ~dif
+ add r7, r7, r3, lsl #16 // ~dif + (v << 16)
+ lsl r4, r4, r5 // rng << d
+ subs r6, r6, r5 // cnt -= d
+ lsl r7, r7, r5 // (~dif + (v << 16)) << d
+ str r4, [r0, #RNG]
+ vdup.16 d1, r4
+ mvn r7, r7 // ~dif
+ bhs 9f
+
+ // refill
+ ldr r3, [r0, #BUF_POS] // BUF_POS
+ ldr r4, [r0, #BUF_END] // BUF_END
+ add r5, r3, #4
+ cmp r5, r4
+ bgt 2f
+
+ ldr r3, [r3] // next_bits
+ add r8, r6, #23 // shift_bits = cnt + 23
+ add r6, r6, #16 // cnt += 16
+ rev r3, r3 // next_bits = bswap(next_bits)
+ sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3
+ and r8, r8, #24 // shift_bits &= 24
+ lsr r3, r3, r8 // next_bits >>= shift_bits
+ sub r8, r8, r6 // shift_bits -= 16 + cnt
+ str r5, [r0, #BUF_POS]
+ lsl r3, r3, r8 // next_bits <<= shift_bits
+ rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits
+ eor r7, r7, r3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ rsb r5, r6, #8 // c = 8 - cnt
+3:
+ cmp r3, r4
+ bge 4f
+ ldrb r8, [r3], #1
+ lsl r8, r8, r5
+ eor r7, r7, r8
+ subs r5, r5, #8
+ bge 3b
+
+4: // refill_eob_end
+ str r3, [r0, #BUF_POS]
+ rsb r6, r5, #8 // cnt = 8 - c
+
+9:
+ lsl lr, lr, #1
+ sub lr, lr, #5
+ lsr r12, r7, #16
+ adds r2, r2, lr // carry = tok_br < 3 || tok == 15
+ vdup.16 q1, r12
+ bcc 1b // loop if !carry
+ add r2, r2, #30
+ str r6, [r0, #CNT]
+ add sp, sp, #48
+ str r7, [r0, #DIF]
+ lsr r0, r2, #1
+ pop {r4-r10,pc}
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG]
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ bic r4, r5, #0xff // r &= 0xff00
+ add r4, r4, #8
+ mov r2, #0
+ subs r8, r7, r4, lsl #15 // dif - vw
+ lsr r4, r4, #1 // v
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+ push {r4-r10,lr}
+ ldr r5, [r0, #RNG]
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ lsr r4, r5, #8 // r >> 8
+ bic r1, r1, #0x3f // f &= ~63
+ mul r4, r4, r1
+ mov r2, #0
+ lsr r4, r4, #7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+ b L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+ push {r4-r10,lr}
+ ldr r9, [r1] // cdf[0-1]
+ ldr r5, [r0, #RNG]
+ movw lr, #0xffc0
+ ldr r6, [r0, #CNT]
+ sub sp, sp, #48
+ ldr r7, [r0, #DIF]
+ lsr r4, r5, #8 // r >> 8
+ and r2, r9, lr // f &= ~63
+ mul r4, r4, r2
+ mov r2, #0
+ lsr r4, r4, #7
+ add r4, r4, #4 // v
+ subs r8, r7, r4, lsl #16 // dif - vw
+ sub r5, r5, r4 // r - v
+ ldr r10, [r0, #ALLOW_UPDATE_CDF]
+ itee lo
+ movlo r2, #1
+ movhs r4, r5 // if (ret) v = r - v;
+ movhs r7, r8 // if (ret) dif = dif - vw;
+
+ cmp r10, #0
+ clz r5, r4 // clz(rng)
+ mvn r7, r7 // ~dif
+ eor r5, r5, #16 // d = clz(rng) ^ 16
+ mov lr, r2
+
+ beq L(renorm2)
+
+ lsr r2, r9, #16 // count = cdf[1]
+ uxth r9, r9 // cdf[0]
+
+ sub r3, r2, r2, lsr #5 // count - (count >= 32)
+ lsr r2, r2, #4 // count >> 4
+ add r10, r3, #1 // count + (count < 32)
+ add r2, r2, #4 // rate = (count >> 4) | 4
+
+ sub r9, r9, lr // cdf[0] -= bit
+ sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub r9, r9, r3 // cdf[0]
+
+ strh r9, [r1]
+ strh r10, [r1, #2]
+
+ b L(renorm2)
+endfunc
diff --git a/third_party/dav1d/src/arm/32/refmvs.S b/third_party/dav1d/src/arm/32/refmvs.S
new file mode 100644
index 0000000000..e16c5448d0
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/refmvs.S
@@ -0,0 +1,97 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+// int bx4, int bw4, int bh4)
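+//
+// Stores the 12-byte refmvs_block at rmv bw4 times into each of the bh4 row
+// pointers in rr[], starting at a byte offset of 12*bx4; q0-q3 below hold
+// rotated copies of the 12-byte pattern so that full 16-byte stores can be used.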
+
+function splat_mv_neon, export=1
+ push {r4, lr}
+ vld1.8 {q3}, [r1]
+ ldr r4, [sp, #8]
+ clz r3, r3
+ adr lr, L(splat_tbl)
+ sub r3, r3, #26
+ vext.8 q2, q3, q3, #12
+ ldr r3, [lr, r3, lsl #2]
+ add r2, r2, r2, lsl #1
+ vext.8 q0, q2, q3, #4
+ add r3, lr, r3
+ vext.8 q1, q2, q3, #8
+ lsl r2, r2, #2
+ vext.8 q2, q2, q3, #12
+ vmov q3, q0
+1:
+ ldr r1, [r0], #4
+ subs r4, r4, #1
+ add r1, r1, r2
+ bx r3
+
+ .align 2
+L(splat_tbl):
+ .word 320f - L(splat_tbl) + CONFIG_THUMB
+ .word 160f - L(splat_tbl) + CONFIG_THUMB
+ .word 80f - L(splat_tbl) + CONFIG_THUMB
+ .word 40f - L(splat_tbl) + CONFIG_THUMB
+ .word 20f - L(splat_tbl) + CONFIG_THUMB
+ .word 10f - L(splat_tbl) + CONFIG_THUMB
+
+10:
+ vst1.8 {d0}, [r1]
+ vstr s2, [r1, #8]
+ bgt 1b
+ pop {r4, pc}
+20:
+ vst1.8 {q0}, [r1]
+ vstr d2, [r1, #16]
+ bgt 1b
+ pop {r4, pc}
+40:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2}, [r1]
+ bgt 1b
+ pop {r4, pc}
+320:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]!
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]!
+160:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]!
+80:
+ vst1.8 {q0, q1}, [r1]!
+ vst1.8 {q2, q3}, [r1]!
+ vst1.8 {q1, q2}, [r1]
+ bgt 1b
+ pop {r4, pc}
+endfunc
diff --git a/third_party/dav1d/src/arm/32/util.S b/third_party/dav1d/src/arm/32/util.S
new file mode 100644
index 0000000000..c3710d3767
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/util.S
@@ -0,0 +1,184 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_32_UTIL_S
+#define DAV1D_SRC_ARM_32_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
+.macro movrel_local rd, val, offset=0
+#if defined(PIC)
+ ldr \rd, 90001f
+ b 90002f
+90001:
+ .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
+90002:
+ add \rd, \rd, pc
+#else
+ movw \rd, #:lower16:\val+\offset
+ movt \rd, #:upper16:\val+\offset
+#endif
+.endm
+
+.macro movrel rd, val, offset=0
+#if defined(PIC) && defined(__APPLE__)
+ ldr \rd, 1f
+ b 2f
+1:
+ .word 3f - (2f + 8 - 4 * CONFIG_THUMB)
+2:
+ ldr \rd, [pc, \rd]
+.if \offset < 0
+ sub \rd, \rd, #-(\offset)
+.elseif \offset > 0
+ add \rd, \rd, #\offset
+.endif
+ .non_lazy_symbol_pointer
+3:
+ .indirect_symbol \val
+ .word 0
+ .text
+#else
+ movrel_local \rd, \val, \offset
+#endif
+.endm
+
+// This macro clobbers r7 (and r12 on windows) and stores data at the
+// bottom of the stack; sp is the start of the space allocated that
+// the caller can use.
+.macro sub_sp_align space
+#if CONFIG_THUMB
+ mov r7, sp
+ and r7, r7, #15
+#else
+ and r7, sp, #15
+#endif
+ sub sp, sp, r7
+ // Now the stack is aligned, store the amount of adjustment back
+ // on the stack, as we don't want to waste a register as frame
+ // pointer.
+ str r7, [sp, #-16]!
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
+ .error "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+ sub r7, sp, #4096
+ ldr r12, [r7]
+ sub r7, r7, #(\space - 4096)
+ mov sp, r7
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro add_sp_align space
+.if \space >= 4096
+ add sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ add sp, sp, #(\space)%4096
+.endif
+ ldr r7, [sp], #16
+ // Add back the original stack adjustment
+ add sp, sp, r7
+.endm
+
+.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.32 \q0, \q2
+ vtrn.32 \q1, \q3
+
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.16 \r4, \r6
+ vtrn.16 \r5, \r7
+
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7
+ vswp \d0, \d4
+ vswp \d1, \d5
+ vswp \d2, \d6
+ vswp \d3, \d7
+
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
+
+.macro transpose_4x8b q0, q1, r0, r1, r2, r3
+ vtrn.16 \q0, \q1
+
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+.endm
+
+.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
+ vswp \r1, \r4 // vtrn.64 \q0, \q2
+ vswp \r3, \r6 // vtrn.64 \q1, \q3
+
+ vtrn.32 \q0, \q1
+ vtrn.32 \q2, \q3
+.endm
+
+.macro transpose_4x4h q0, q1, r0, r1, r2, r3
+ vtrn.32 \q0, \q1
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+#endif /* DAV1D_SRC_ARM_32_UTIL_S */
diff --git a/third_party/dav1d/src/arm/64/cdef.S b/third_party/dav1d/src/arm/64/cdef.S
new file mode 100644
index 0000000000..32b258aba8
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef.S
@@ -0,0 +1,520 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ sub \s1, \s1, #2
+ sub \s2, \s2, #2
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr s1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr s3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str \rw\()0, [x0]
+ str d1, [x0, #2*\w]
+ add x0, x0, #2*\stride
+ str \rw\()2, [x0]
+ str d3, [x0, #2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr h1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr h3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str \rw\()0, [x0]
+ str s1, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+ add x0, x0, #2*\stride
+ str \rw\()2, [x0]
+ str s3, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr h1, [\s1, #\w]
+ ldr \rn\()2, [\s2]
+ ldr h3, [\s2, #\w]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \rw\()2, [x0, #4]
+ str s3, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \rn\()0, [\s1]
+ ldr \rn\()1, [\s2]
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \rw\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr dst, src, incr, w
+.if \w == 4
+ ld1 {\dst\().s}[0], [\src], \incr
+.else
+ ld1 {\dst\().8b}, [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
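+//
+// Builds a padded int16_t copy of the w x h source block in tmp, with a
+// 2-pixel border on each side; border areas whose CDEF_HAVE_* edge flag is
+// not set are filled with 0x8000 so the filter can treat them as unavailable.
+// When all four edges are present, the 8-bit edged variant below is used.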
+
+.macro padding_func w, stride, rn, rw
+function cdef_padding\w\()_8bpc_neon, export=1
+ cmp w7, #0xf // fully edged
+ b.eq cdef_padding\w\()_edged_8bpc_neon
+ movi v30.8h, #0x80, lsl #8
+ mov v31.16b, v30.16b
+ sub x0, x0, #2*(2*\stride+2)
+ tst w7, #4 // CDEF_HAVE_TOP
+ b.ne 1f
+ // !CDEF_HAVE_TOP
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add x9, x4, x2
+ pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
+
+ // Middle section
+3:
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ld1 {v0.h}[0], [x3], #2
+ ldr h2, [x1, #\w]
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ str s0, [x0]
+ stur \rw\()1, [x0, #4]
+ str s2, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ld1 {v0.h}[0], [x3], #2
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s0, [x0]
+ stur \rw\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+ b 3f
+2:
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldr h1, [x1, #\w]
+ load_n_incr v0, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr v0, x1, x2, \w
+ subs w6, w6, #1
+ uxtl v0.8h, v0.8b
+ str s31, [x0]
+ stur \rw\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+
+3:
+ tst w7, #8 // CDEF_HAVE_BOTTOM
+ b.ne 1f
+ // !CDEF_HAVE_BOTTOM
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ ret
+1:
+ // CDEF_HAVE_BOTTOM
+ add x9, x5, x2
+ pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1
+endfunc
+.endm
+
+padding_func 8, 16, d, q
+padding_func 4, 8, s, d
+
+// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
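+//
+// Fast path for the fully-edged case: all neighbours are available, so the
+// padding is a plain 8-bit copy with a 2-pixel border and no 0x8000 marking.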
+
+.macro padding_func_edged w, stride, reg
+function cdef_padding\w\()_edged_8bpc_neon, export=1
+ sub x4, x4, #2
+ sub x5, x5, #2
+ sub x0, x0, #(2*\stride+2)
+
+.if \w == 4
+ ldr d0, [x4]
+ ldr d1, [x4, x2]
+ st1 {v0.8b, v1.8b}, [x0], #16
+.else
+ add x9, x4, x2
+ ldr d0, [x4]
+ ldr s1, [x4, #8]
+ ldr d2, [x9]
+ ldr s3, [x9, #8]
+ str d0, [x0]
+ str s1, [x0, #8]
+ str d2, [x0, #\stride]
+ str s3, [x0, #\stride+8]
+ add x0, x0, #2*\stride
+.endif
+
+0:
+ ld1 {v0.h}[0], [x3], #2
+ ldr h2, [x1, #\w]
+ load_n_incr v1, x1, x2, \w
+ subs w6, w6, #1
+ str h0, [x0]
+ stur \reg\()1, [x0, #2]
+ str h2, [x0, #2+\w]
+ add x0, x0, #\stride
+ b.gt 0b
+
+.if \w == 4
+ ldr d0, [x5]
+ ldr d1, [x5, x2]
+ st1 {v0.8b, v1.8b}, [x0], #16
+.else
+ add x9, x5, x2
+ ldr d0, [x5]
+ ldr s1, [x5, #8]
+ ldr d2, [x9]
+ ldr s3, [x9, #8]
+ str d0, [x0]
+ str s1, [x0, #8]
+ str d2, [x0, #\stride]
+ str s3, [x0, #\stride+8]
+.endif
+ ret
+endfunc
+.endm
+
+padding_func_edged 8, 16, d
+padding_func_edged 4, 8, s
+
+tables
+
+filter 8, 8
+filter 4, 8
+
+find_dir 8
+
+.macro load_px_8 d1, d2, w
+.if \w == 8
+ add x6, x2, w9, sxtb // x + off
+ sub x9, x2, w9, sxtb // x - off
+ ld1 {\d1\().d}[0], [x6] // p0
+ add x6, x6, #16 // += stride
+ ld1 {\d2\().d}[0], [x9] // p1
+ add x9, x9, #16 // += stride
+ ld1 {\d1\().d}[1], [x6] // p0
+ ld1 {\d2\().d}[1], [x9] // p1
+.else
+ add x6, x2, w9, sxtb // x + off
+ sub x9, x2, w9, sxtb // x - off
+ ld1 {\d1\().s}[0], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[0], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[1], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[1], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[2], [x6] // p0
+ add x6, x6, #8 // += stride
+ ld1 {\d2\().s}[2], [x9] // p1
+ add x9, x9, #8 // += stride
+ ld1 {\d1\().s}[3], [x6] // p0
+ ld1 {\d2\().s}[3], [x9] // p1
+.endif
+.endm
+.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
+.if \min
+ umin v3.16b, v3.16b, \s1\().16b
+ umax v4.16b, v4.16b, \s1\().16b
+ umin v3.16b, v3.16b, \s2\().16b
+ umax v4.16b, v4.16b, \s2\().16b
+.endif
+ uabd v16.16b, v0.16b, \s1\().16b // abs(diff)
+ uabd v20.16b, v0.16b, \s2\().16b // abs(diff)
+ ushl v17.16b, v16.16b, \shift // abs(diff) >> shift
+ ushl v21.16b, v20.16b, \shift // abs(diff) >> shift
+ uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+ uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
+ cmhi v18.16b, v0.16b, \s1\().16b // px > p0
+ cmhi v22.16b, v0.16b, \s2\().16b // px > p1
+ umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip)
+ umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip)
+ dup v19.16b, \tap // taps[k]
+ neg v16.16b, v17.16b // -imin()
+ neg v20.16b, v21.16b // -imin()
+ bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
+ bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
+ mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain()
+ mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain()
+.endm
+
+// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint8_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h);
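+//
+// 8-bit companion to the edged padding above: constrained differences for the
+// primary/secondary taps are accumulated in two 8-bit sums (see the note on
+// accumulator width below), and the final correction is added to px with a
+// saturating add.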
+.macro filter_func_8 w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_edged_8bpc_neon
+.if \pri
+ movrel x8, pri_taps
+ and w9, w3, #1
+ add x8, x8, w9, uxtw #1
+.endif
+ movrel x9, directions\w
+ add x5, x9, w5, uxtw #1
+ movi v30.8b, #7
+ dup v28.8b, w6 // damping
+
+.if \pri
+ dup v25.16b, w3 // threshold
+.endif
+.if \sec
+ dup v27.16b, w4 // threshold
+.endif
+ trn1 v24.8b, v25.8b, v27.8b
+ clz v24.8b, v24.8b // clz(threshold)
+ sub v24.8b, v30.8b, v24.8b // ulog2(threshold)
+ uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold))
+ neg v24.8b, v24.8b // -shift
+.if \sec
+ dup v26.16b, v24.b[1]
+.endif
+.if \pri
+ dup v24.16b, v24.b[0]
+.endif
+
+1:
+.if \w == 8
+ add x12, x2, #16
+ ld1 {v0.d}[0], [x2] // px
+ ld1 {v0.d}[1], [x12] // px
+.else
+ add x12, x2, #1*8
+ add x13, x2, #2*8
+ add x14, x2, #3*8
+ ld1 {v0.s}[0], [x2] // px
+ ld1 {v0.s}[1], [x12] // px
+ ld1 {v0.s}[2], [x13] // px
+ ld1 {v0.s}[3], [x14] // px
+.endif
+
+ // We need 9 bits, or two 8-bit accumulators, to fit the sum.
+ // Max of |sum| is 15*2*6(pri) + 4*4*3(sec) = 228.
+ // Start sum at -1 instead of 0 to help handle rounding later.
+ movi v1.16b, #255 // sum
+ movi v2.16b, #0 // sum
+.if \min
+ mov v3.16b, v0.16b // min
+ mov v4.16b, v0.16b // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov w11, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrb w9, [x5] // off1
+
+ load_px_8 v5, v6, \w
+.endif
+
+.if \sec
+ add x5, x5, #4 // +2*2
+ ldrb w9, [x5] // off2
+ load_px_8 v28, v29, \w
+.endif
+
+.if \pri
+ ldrb w10, [x8] // *pri_taps
+
+ handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min
+.endif
+
+.if \sec
+ add x5, x5, #8 // +2*4
+ ldrb w9, [x5] // off3
+ load_px_8 v5, v6, \w
+
+ handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min
+
+ handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min
+
+ sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
+ subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
+ add x8, x8, #1 // pri_taps++ (pointer)
+.endif
+ b.ne 2b
+
+ // Perform halving adds since the value won't fit otherwise.
+ // To handle the offset for negative values, use both halving w/ and w/o rounding.
+ srhadd v5.16b, v1.16b, v2.16b // sum >> 1
+ shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1
+ cmlt v1.16b, v5.16b, #0 // sum < 0
+ bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1
+
+ srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4
+
+ usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4
+.if \min
+ umin v0.16b, v0.16b, v4.16b
+ umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max)
+.endif
+.if \w == 8
+ st1 {v0.d}[0], [x0], x1
+ add x2, x2, #2*16 // tmp += 2*tmp_stride
+ subs w7, w7, #2 // h -= 2
+ st1 {v0.d}[1], [x0], x1
+.else
+ st1 {v0.s}[0], [x0], x1
+ add x2, x2, #4*8 // tmp += 4*tmp_stride
+ st1 {v0.s}[1], [x0], x1
+ subs w7, w7, #4 // h -= 4
+ st1 {v0.s}[2], [x0], x1
+ st1 {v0.s}[3], [x0], x1
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub x5, x5, #2
+.if \pri
+ sub x8, x8, #2
+.endif
+
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+.macro filter_8 w
+filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
+.endm
+
+filter_8 8
+filter_8 4
diff --git a/third_party/dav1d/src/arm/64/cdef16.S b/third_party/dav1d/src/arm/64/cdef16.S
new file mode 100644
index 0000000000..ecf864a26d
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef16.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+.macro pad_top_bot_16 s1, s2, w, stride, reg, ret
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ sub \s1, \s1, #4
+ sub \s2, \s2, #4
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr d1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr d3, [\s2, #2*\w]
+ str \reg\()0, [x0]
+ str d1, [x0, #2*\w]
+ add x0, x0, #2*\stride
+ str \reg\()2, [x0]
+ str d3, [x0, #2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr s1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr s3, [\s2, #2*\w]
+ str \reg\()0, [x0]
+ str s1, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+ add x0, x0, #2*\stride
+ str \reg\()2, [x0]
+ str s3, [x0, #2*\w]
+ str s31, [x0, #2*\w+4]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+2:
+ // !CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr s1, [\s1, #2*\w]
+ ldr \reg\()2, [\s2]
+ ldr s3, [\s2, #2*\w]
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \reg\()2, [x0, #4]
+ str s3, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+ b 3f
+.endif
+
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ldr \reg\()0, [\s1]
+ ldr \reg\()1, [\s2]
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ str s31, [x0]
+ stur \reg\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+.if \ret
+ ret
+.else
+ add x0, x0, #2*\stride
+.endif
+3:
+.endm
+
+.macro load_n_incr_16 dst, src, incr, w
+.if \w == 4
+ ld1 {\dst\().4h}, [\src], \incr
+.else
+ ld1 {\dst\().8h}, [\src], \incr
+.endif
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+// ptrdiff_t src_stride, const pixel (*left)[2],
+// const pixel *const top,
+// const pixel *const bottom, int h,
+// enum CdefEdgeFlags edges);
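+//
+// Same tmp layout and 0x8000 border handling as the 8bpc padding in cdef.S,
+// but the source pixels are already 16 bits wide, so no widening is needed.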
+
+.macro padding_func_16 w, stride, reg
+function cdef_padding\w\()_16bpc_neon, export=1
+ movi v30.8h, #0x80, lsl #8
+ mov v31.16b, v30.16b
+ sub x0, x0, #2*(2*\stride+2)
+ tst w7, #4 // CDEF_HAVE_TOP
+ b.ne 1f
+ // !CDEF_HAVE_TOP
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ b 3f
+1:
+ // CDEF_HAVE_TOP
+ add x9, x4, x2
+ pad_top_bot_16 x4, x9, \w, \stride, \reg, 0
+
+ // Middle section
+3:
+ tst w7, #1 // CDEF_HAVE_LEFT
+ b.eq 2f
+ // CDEF_HAVE_LEFT
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ld1 {v0.s}[0], [x3], #4
+ ldr s2, [x1, #2*\w]
+ load_n_incr_16 v1, x1, x2, \w
+ subs w6, w6, #1
+ str s0, [x0]
+ stur \reg\()1, [x0, #4]
+ str s2, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ ld1 {v0.s}[0], [x3], #4
+ load_n_incr_16 v1, x1, x2, \w
+ subs w6, w6, #1
+ str s0, [x0]
+ stur \reg\()1, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+ b 3f
+2:
+ tst w7, #2 // CDEF_HAVE_RIGHT
+ b.eq 1f
+ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+ ldr s1, [x1, #2*\w]
+ load_n_incr_16 v0, x1, x2, \w
+ subs w6, w6, #1
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s1, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 0b
+ b 3f
+1:
+ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+ load_n_incr_16 v0, x1, x2, \w
+ subs w6, w6, #1
+ str s31, [x0]
+ stur \reg\()0, [x0, #4]
+ str s31, [x0, #4+2*\w]
+ add x0, x0, #2*\stride
+ b.gt 1b
+
+3:
+ tst w7, #8 // CDEF_HAVE_BOTTOM
+ b.ne 1f
+ // !CDEF_HAVE_BOTTOM
+ st1 {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+ st1 {v30.8h, v31.8h}, [x0], #32
+.endif
+ ret
+1:
+ // CDEF_HAVE_BOTTOM
+ add x9, x5, x2
+ pad_top_bot_16 x5, x9, \w, \stride, \reg, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q
+padding_func_16 4, 8, d
+
+tables
+
+filter 8, 16
+filter 4, 16
+
+find_dir 16
diff --git a/third_party/dav1d/src/arm/64/cdef_tmpl.S b/third_party/dav1d/src/arm/64/cdef_tmpl.S
new file mode 100644
index 0000000000..d35d7a09ba
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef_tmpl.S
@@ -0,0 +1,511 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro dir_table w, stride
+const directions\w
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+ .byte 1 * \stride + 0, 2 * \stride + 0
+ .byte 1 * \stride + 0, 2 * \stride - 1
+// Repeated, to avoid & 7
+ .byte -1 * \stride + 1, -2 * \stride + 2
+ .byte 0 * \stride + 1, -1 * \stride + 2
+ .byte 0 * \stride + 1, 0 * \stride + 2
+ .byte 0 * \stride + 1, 1 * \stride + 2
+ .byte 1 * \stride + 1, 2 * \stride + 2
+ .byte 1 * \stride + 0, 2 * \stride + 1
+endconst
+.endm
+
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+ .byte 4, 2, 3, 3
+endconst
+.endm
+
+.macro load_px d1, d2, w
+.if \w == 8
+ add x6, x2, w9, sxtb #1 // x + off
+ sub x9, x2, w9, sxtb #1 // x - off
+ ld1 {\d1\().8h}, [x6] // p0
+ ld1 {\d2\().8h}, [x9] // p1
+.else
+ add x6, x2, w9, sxtb #1 // x + off
+ sub x9, x2, w9, sxtb #1 // x - off
+ ld1 {\d1\().4h}, [x6] // p0
+ add x6, x6, #2*8 // += stride
+ ld1 {\d2\().4h}, [x9] // p1
+ add x9, x9, #2*8 // += stride
+ ld1 {\d1\().d}[1], [x6] // p0
+ ld1 {\d2\().d}[1], [x9] // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+ umin v2.8h, v2.8h, \s1\().8h
+ smax v3.8h, v3.8h, \s1\().8h
+ umin v2.8h, v2.8h, \s2\().8h
+ smax v3.8h, v3.8h, \s2\().8h
+.endif
+ uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
+ uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
+ ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
+ ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
+ uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
+ sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
+ neg v16.8h, v17.8h // -clip
+ neg v20.8h, v21.8h // -clip
+ smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
+ smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
+ dup v19.8h, \tap // taps[k]
+ smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
+ smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
+ mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
+ mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+// const uint16_t *tmp, int pri_strength,
+// int sec_strength, int dir, int damping,
+// int h, size_t edges);
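+//
+// Per pixel, the filter below evaluates roughly the following C (illustrative
+// sketch; pri_taps is {4,2} or {3,3} depending on the low bit of pri_strength,
+// shifted down by bitdepth_min_8 for 16 bpc, sec_taps is {2,1}, the neighbour
+// offsets come from the directions table, and each shift is
+// imax(0, damping - ulog2(strength)) as in the comments further down):
+//
+//   static inline int constrain(const int diff, const int threshold, const int shift) {
+//       const int adiff = abs(diff);
+//       int clip = threshold - (adiff >> shift);
+//       if (clip < 0) clip = 0;
+//       return diff < -clip ? -clip : diff > clip ? clip : diff;
+//   }
+//
+//   int sum = 0;
+//   for (int k = 0; k < 2; k++) {
+//       sum += pri_taps[k] * constrain(p0[k] - px, pri_strength, pri_shift);
+//       sum += pri_taps[k] * constrain(p1[k] - px, pri_strength, pri_shift);
+//       for (int j = 0; j < 4; j++) // the four secondary neighbours
+//           sum += sec_taps[k] * constrain(s[k][j] - px, sec_strength, sec_shift);
+//   }
+//   int out = px + ((8 + sum - (sum < 0)) >> 4);
+//   // the _pri_sec variant additionally clamps out to the min/max of the
+//   // pixels that entered the sum (the v2/v3 tracking in handle_pixel).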
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+ ldr w8, [sp] // edges
+ cmp w8, #0xf
+ b.eq cdef_filter\w\suffix\()_edged_8bpc_neon
+.endif
+.if \pri
+.if \bpc == 16
+ ldr w9, [sp, #8] // bitdepth_max
+ clz w9, w9
+ sub w9, w9, #24 // -bitdepth_min_8
+ neg w9, w9 // bitdepth_min_8
+.endif
+ movrel x8, pri_taps
+.if \bpc == 16
+ lsr w9, w3, w9 // pri_strength >> bitdepth_min_8
+ and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1
+.else
+ and w9, w3, #1
+.endif
+ add x8, x8, w9, uxtw #1
+.endif
+ movrel x9, directions\w
+ add x5, x9, w5, uxtw #1
+ movi v30.4h, #15
+ dup v28.4h, w6 // damping
+
+.if \pri
+ dup v25.8h, w3 // threshold
+.endif
+.if \sec
+ dup v27.8h, w4 // threshold
+.endif
+ trn1 v24.4h, v25.4h, v27.4h
+ clz v24.4h, v24.4h // clz(threshold)
+ sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
+ uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
+ neg v24.4h, v24.4h // -shift
+.if \sec
+ dup v26.8h, v24.h[1]
+.endif
+.if \pri
+ dup v24.8h, v24.h[0]
+.endif
+
+1:
+.if \w == 8
+ ld1 {v0.8h}, [x2] // px
+.else
+ add x12, x2, #2*8
+ ld1 {v0.4h}, [x2] // px
+ ld1 {v0.d}[1], [x12] // px
+.endif
+
+ movi v1.8h, #0 // sum
+.if \min
+ mov v2.16b, v0.16b // min
+ mov v3.16b, v0.16b // max
+.endif
+
+ // Instead of loading sec_taps 2, 1 from memory, just set it
+ // to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
+ mov w11, #2 // sec_taps[0]
+
+2:
+.if \pri
+ ldrb w9, [x5] // off1
+
+ load_px v4, v5, \w
+.endif
+
+.if \sec
+ add x5, x5, #4 // +2*2
+ ldrb w9, [x5] // off2
+ load_px v6, v7, \w
+.endif
+
+.if \pri
+ ldrb w10, [x8] // *pri_taps
+
+ handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
+.endif
+
+.if \sec
+ add x5, x5, #8 // +2*4
+ ldrb w9, [x5] // off3
+ load_px v4, v5, \w
+
+ handle_pixel v6, v7, v27.8h, v26.8h, w11, \min
+
+ handle_pixel v4, v5, v27.8h, v26.8h, w11, \min
+
+ sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
+ subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
+ add x8, x8, #1 // pri_taps++ (pointer)
+.endif
+ b.ne 2b
+
+ cmlt v4.8h, v1.8h, #0 // -(sum < 0)
+ add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
+ srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
+ add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
+.if \min
+ smin v0.8h, v0.8h, v3.8h
+ smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+ xtn v0.8b, v0.8h
+.endif
+.if \w == 8
+ add x2, x2, #2*16 // tmp += tmp_stride
+ subs w7, w7, #1 // h--
+.if \bpc == 8
+ st1 {v0.8b}, [x0], x1
+.else
+ st1 {v0.8h}, [x0], x1
+.endif
+.else
+.if \bpc == 8
+ st1 {v0.s}[0], [x0], x1
+.else
+ st1 {v0.d}[0], [x0], x1
+.endif
+ add x2, x2, #2*16 // tmp += 2*tmp_stride
+ subs w7, w7, #2 // h -= 2
+.if \bpc == 8
+ st1 {v0.s}[1], [x0], x1
+.else
+ st1 {v0.d}[1], [x0], x1
+.endif
+.endif
+
+ // Reset pri_taps and directions back to the original point
+ sub x5, x5, #2
+.if \pri
+ sub x8, x8, #2
+.endif
+
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+ cbnz w3, 1f // pri_strength
+ b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+ cbnz w4, 1f // sec_strength
+ b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
+const div_table
+ .short 840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact
+ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+.macro cost_alt d1, d2, s1, s2, s3, s4
+ smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
+ smull2 v23.4s, \s1\().8h, \s1\().8h
+ smull v24.4s, \s2\().4h, \s2\().4h
+ smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
+ smull2 v26.4s, \s3\().8h, \s3\().8h
+ smull v27.4s, \s4\().4h, \s4\().4h
+ mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact
+ mla v22.4s, v23.4s, v30.4s
+ mla v22.4s, v24.4s, v31.4s
+ mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact
+ mla v25.4s, v26.4s, v30.4s
+ mla v25.4s, v27.4s, v31.4s
+ addv \d1, v22.4s // *cost_ptr
+ addv \d2, v25.4s // *cost_ptr
+.endm
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+ mov w5, \s2\().s[0]
+.endif
+ cmp w4, w1 // cost[n] > best_cost
+ csel w0, w3, w0, gt // best_dir = n
+ csel w1, w4, w1, gt // best_cost = cost[n]
+.ifnb \s2
+ add w3, w3, #1 // n++
+ cmp w5, w1 // cost[n] > best_cost
+ mov w4, \s3\().s[0]
+ csel w0, w3, w0, gt // best_dir = n
+ csel w1, w5, w1, gt // best_cost = cost[n]
+ add w3, w3, #1 // n++
+.endif
+.endm
+
+// Steps for loading and preparing each row
+.macro dir_load_step1 s1, bpc
+.if \bpc == 8
+ ld1 {\s1\().8b}, [x0], x1
+.else
+ ld1 {\s1\().8h}, [x0], x1
+.endif
+.endm
+
+.macro dir_load_step2 s1, bpc
+.if \bpc == 8
+ usubl \s1\().8h, \s1\().8b, v31.8b
+.else
+ ushl \s1\().8h, \s1\().8h, v8.8h
+.endif
+.endm
+
+.macro dir_load_step3 s1, bpc
+// Nothing for \bpc == 8
+.if \bpc != 8
+ sub \s1\().8h, \s1\().8h, v31.8h
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+// unsigned *const var)
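+//
+// The macro computes eight directional cost sums over the 8x8 block and picks
+// the direction with the largest cost; the selection and the reported
+// variance amount to roughly this C (illustrative sketch):
+//
+//   int best_dir = 0, best_cost = cost[0];
+//   for (int n = 1; n < 8; n++)
+//       if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
+//   *var = (best_cost - cost[best_dir ^ 4]) >> 10;
+//   return best_dir;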
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+.if \bpc == 16
+ str d8, [sp, #-0x10]!
+ clz w3, w3 // clz(bitdepth_max)
+ sub w3, w3, #24 // -bitdepth_min_8
+ dup v8.8h, w3
+.endif
+ sub sp, sp, #32 // cost
+ mov w3, #8
+.if \bpc == 8
+ movi v31.16b, #128
+.else
+ movi v31.8h, #128
+.endif
+ movi v30.16b, #0
+ movi v1.8h, #0 // v0-v1 sum_diag[0]
+ movi v3.8h, #0 // v2-v3 sum_diag[1]
+ movi v5.8h, #0 // v4-v5 sum_hv[0-1]
+ movi v7.8h, #0 // v6-v7 sum_alt[0]
+ dir_load_step1 v26, \bpc // Setup first row early
+ movi v17.8h, #0 // v16-v17 sum_alt[1]
+ movi v18.8h, #0 // v18-v19 sum_alt[2]
+ dir_load_step2 v26, \bpc
+ movi v19.8h, #0
+ dir_load_step3 v26, \bpc
+ movi v21.8h, #0 // v20-v21 sum_alt[3]
+
+.irpc i, 01234567
+ addv h25, v26.8h // [y]
+ rev64 v27.8h, v26.8h
+ addp v28.8h, v26.8h, v30.8h // [(x >> 1)]
+ add v5.8h, v5.8h, v26.8h // sum_hv[1]
+ ext v27.16b, v27.16b, v27.16b, #8 // [-x]
+ rev64 v29.4h, v28.4h // [-(x >> 1)]
+ ins v4.h[\i], v25.h[0] // sum_hv[0]
+.if \i < 6
+ ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
+ ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
+ add v18.8h, v18.8h, v22.8h // sum_alt[2]
+ add v19.4h, v19.4h, v23.4h // sum_alt[2]
+.else
+ add v18.8h, v18.8h, v26.8h // sum_alt[2]
+.endif
+.if \i == 0
+ mov v20.16b, v26.16b // sum_alt[3]
+.elseif \i == 1
+ add v20.8h, v20.8h, v26.8h // sum_alt[3]
+.else
+ ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
+ ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
+ add v20.8h, v20.8h, v24.8h // sum_alt[3]
+ add v21.4h, v21.4h, v25.4h // sum_alt[3]
+.endif
+.if \i == 0
+ mov v0.16b, v26.16b // sum_diag[0]
+ dir_load_step1 v26, \bpc
+ mov v2.16b, v27.16b // sum_diag[1]
+ dir_load_step2 v26, \bpc
+ mov v6.16b, v28.16b // sum_alt[0]
+ dir_load_step3 v26, \bpc
+ mov v16.16b, v29.16b // sum_alt[1]
+.else
+ ext v22.16b, v30.16b, v26.16b, #(16-2*\i)
+ ext v23.16b, v26.16b, v30.16b, #(16-2*\i)
+ ext v24.16b, v30.16b, v27.16b, #(16-2*\i)
+ ext v25.16b, v27.16b, v30.16b, #(16-2*\i)
+.if \i != 7 // Nothing to load for the final row
+ dir_load_step1 v26, \bpc // Start setting up the next row early.
+.endif
+ add v0.8h, v0.8h, v22.8h // sum_diag[0]
+ add v1.8h, v1.8h, v23.8h // sum_diag[0]
+ add v2.8h, v2.8h, v24.8h // sum_diag[1]
+ add v3.8h, v3.8h, v25.8h // sum_diag[1]
+.if \i != 7
+ dir_load_step2 v26, \bpc
+.endif
+ ext v22.16b, v30.16b, v28.16b, #(16-2*\i)
+ ext v23.16b, v28.16b, v30.16b, #(16-2*\i)
+ ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
+ ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
+.if \i != 7
+ dir_load_step3 v26, \bpc
+.endif
+ add v6.8h, v6.8h, v22.8h // sum_alt[0]
+ add v7.4h, v7.4h, v23.4h // sum_alt[0]
+ add v16.8h, v16.8h, v24.8h // sum_alt[1]
+ add v17.4h, v17.4h, v25.4h // sum_alt[1]
+.endif
+.endr
+
+ movi v31.4s, #105
+
+ smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0]
+ smlal2 v26.4s, v4.8h, v4.8h
+ smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1]
+ smlal2 v27.4s, v5.8h, v5.8h
+ mul v26.4s, v26.4s, v31.4s // cost[2] *= 105
+ mul v27.4s, v27.4s, v31.4s // cost[6] *= 105
+ addv s4, v26.4s // cost[2]
+ addv s5, v27.4s // cost[6]
+
+ rev64 v1.8h, v1.8h
+ rev64 v3.8h, v3.8h
+ ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
+ ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
+
+ str s4, [sp, #2*4] // cost[2]
+ str s5, [sp, #6*4] // cost[6]
+
+ movrel x4, div_table
+ ld1 {v31.8h}, [x4]
+
+ smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0]
+ smull2 v23.4s, v0.8h, v0.8h
+ smlal v22.4s, v1.4h, v1.4h
+ smlal2 v23.4s, v1.8h, v1.8h
+ smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1]
+ smull2 v25.4s, v2.8h, v2.8h
+ smlal v24.4s, v3.4h, v3.4h
+ smlal2 v25.4s, v3.8h, v3.8h
+ uxtl v30.4s, v31.4h // div_table
+ uxtl2 v31.4s, v31.8h
+ mul v22.4s, v22.4s, v30.4s // cost[0]
+ mla v22.4s, v23.4s, v31.4s // cost[0]
+ mul v24.4s, v24.4s, v30.4s // cost[4]
+ mla v24.4s, v25.4s, v31.4s // cost[4]
+ addv s0, v22.4s // cost[0]
+ addv s2, v24.4s // cost[4]
+
+ movrel x5, alt_fact
+ ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
+
+ str s0, [sp, #0*4] // cost[0]
+ str s2, [sp, #4*4] // cost[4]
+
+ uxtl v29.4s, v29.4h // div_table[2*m+1] + 105
+ uxtl v30.4s, v30.4h
+ uxtl v31.4s, v31.4h
+
+ cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
+ cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
+ str s6, [sp, #1*4] // cost[1]
+ str s16, [sp, #3*4] // cost[3]
+
+ mov w0, #0 // best_dir
+ mov w1, v0.s[0] // best_cost
+ mov w3, #1 // n
+
+ str s18, [sp, #5*4] // cost[5]
+ str s20, [sp, #7*4] // cost[7]
+
+ mov w4, v6.s[0]
+
+ find_best v6, v4, v16
+ find_best v16, v2, v18
+ find_best v18, v5, v20
+ find_best v20
+
+ eor w3, w0, #4 // best_dir ^4
+ ldr w4, [sp, w3, uxtw #2]
+ sub w1, w1, w4 // best_cost - cost[best_dir ^ 4]
+ lsr w1, w1, #10
+ str w1, [x2] // *var
+
+ add sp, sp, #32
+.if \bpc == 16
+ ldr d8, [sp], 0x10
+.endif
+ ret
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/64/filmgrain.S b/third_party/dav1d/src/arm/64/filmgrain.S
new file mode 100644
index 0000000000..6cdd7ec5fa
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain.S
@@ -0,0 +1,2010 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+ lsr w11, w2, #3
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state
+.else
+ orr w2, w2, w11, lsl #16 // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+ ubfx \dest, x2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, x2, #17 - \bits, #\bits
+ lsr w2, w2, #1
+.endm
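+
+// In C terms the three macros above model the film grain LFSR roughly as
+// (illustrative sketch, mirroring the comments in increment_seed; the helper
+// name is descriptive only):
+//
+//   static inline unsigned get_random_number(const int bits, unsigned *const state) {
+//       const unsigned r = *state;
+//       const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+//       *state = (r >> 1) | (bit << 15);
+//       return (*state >> (16 - bits)) & ((1 << bits) - 1);
+//   }
+//
+// increment_seed advances the state by several such steps at once; read_rand's
+// "age" argument then extracts the value that corresponds to each step.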
+
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ increment_seed 4
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
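+
+// Each of the eight lanes produced above is, in C terms (illustrative sketch,
+// reusing the get_random_number sketch from further up):
+//
+//   out[i] = dav1d_gaussian_sequence[get_random_number(11, &seed)];
+//
+// The callers then srshl this by -(4 + grain_scale_shift), i.e. apply
+// round2(out[i], 4 + grain_scale_shift), before narrowing to 8-bit grain.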
+
+.macro get_grain_row r0, r1, r2, r3, r4, r5
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r0\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r2\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r3\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r3\().16b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn \r4\().8b, \r5\().8h
+ bl get_gaussian_neon
+ srshl \r5\().8h, v0.8h, v31.8h
+ xtn2 \r4\().16b, \r5\().8h
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {\r5\().h}[0], [x14]
+ ld1 {\r5\().h}[1], [x15]
+ srshl v0.4h, \r5\().4h, v31.4h
+ xtn \r5\().8b, v0.8h
+.endm
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2
+.endm
+
+.macro get_grain_row_44 r0, r1, r2
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r0\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r0\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r1\().8b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn2 \r1\().16b, \r2\().8h
+ bl get_gaussian_neon
+ srshl \r2\().8h, v0.8h, v31.8h
+ xtn \r2\().8b, \r2\().8h
+
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ xtn2 \r2\().16b, v0.8h
+.endm
+
+.macro store_grain_row_44 r0, r1, r2
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b}, [x0]
+ add x0, x0, #GRAIN_WIDTH-32
+.endm
+
+function get_grain_2_neon
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ xtn v0.8b, v0.8h
+ ret
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+ read_shift_rand x13, 11
+ mov w11, v1.s[0]
+ ldrsh w12, [x3, x13, lsl #1]
+ ext v0.16b, v0.16b, v0.16b, #1
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // += *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 + grain_scale_shift)
+ add w14, w14, w12
+ cmp w14, w5
+ csel w14, w14, w5, le
+ cmp w14, w6
+ csel w14, w14, w6, ge
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4
+ ins v0.b[15], w14
+ b.gt 1b
+ ret
+endfunc
+.endm
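+
+// One iteration of output_lagN corresponds roughly to this scalar C
+// (illustrative sketch; above_sum is the widened per-column sum handed over in
+// v1, and coeff0..2 are the ar_coeffs taps for the previously produced
+// outputs):
+//
+//   int noise = dav1d_gaussian_sequence[get_random_number(11, &seed)];
+//   noise = (noise + (1 << (4 + grain_scale_shift - 1))) >> (4 + grain_scale_shift);
+//   int sum = above_sum + coeff0 * prev1;  // + coeff1 * prev2 (+ coeff2 * prev3)
+//   sum = (sum + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;
+//   int out = sum + noise;
+//   out = out > 127 ? 127 : out < -128 ? -128 : out;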
+
+output_lag 1
+output_lag 2
+output_lag 3
+
+
+function sum_lag1_above_neon
+ smull v2.8h, v3.8b, v28.8b
+ smull2 v3.8h, v3.16b, v28.16b
+ smull v4.8h, v0.8b, v27.8b
+ smull2 v5.8h, v0.16b, v27.16b
+ smull v6.8h, v1.8b, v29.8b
+ smull2 v7.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v4.4h
+ saddl2 v1.4s, v2.8h, v4.8h
+ saddl v2.4s, v3.4h, v5.4h
+ saddl2 v3.4s, v3.8h, v5.8h
+ saddw v4.4s, v0.4s, v6.4h
+ saddw2 v5.4s, v1.4s, v6.8h
+ saddw v6.4s, v2.4s, v7.4h
+ saddw2 v7.4s, v3.4s, v7.8h
+ ret
+endfunc
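+
+// For lag 1 this accumulates, per output column x (illustrative sketch):
+//
+//   sum[x] = c0 * above[x - 1] + c1 * above[x] + c2 * above[x + 1];
+//
+// where c0..c2 are the coefficients broadcast in v27-v29 and "above" is the
+// previously generated grain row; the widened 32-bit sums are left in v4-v7
+// for the output_lag1 recursion to consume.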
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
+ bl sum_\lag\()_above_neon
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ ld1 {v24.16b, v25.16b}, [x12]
+ saddlp v22.8h, v22.16b
+ saddlp v23.8h, v23.16b
+ saddlp v24.8h, v24.16b
+ saddlp v25.8h, v25.16b
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ rshrn v0.8b, v22.8h, #2
+ rshrn2 v0.16b, v23.8h, #2
+.endif
+.ifc \type, uv_422
+ ld1 {v22.16b, v23.16b}, [x19], #32
+ saddlp v22.8h, v22.16b
+ saddlp v23.8h, v23.16b
+ rshrn v0.8b, v22.8h, #1
+ rshrn2 v0.16b, v23.8h, #1
+.endif
+.ifc \type, uv_444
+ ld1 {v0.16b}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.16b, \uv_coeff
+ smull v2.8h, v0.8b, v1.8b
+ smull2 v3.8h, v0.16b, v1.16b
+.else
+ smull v2.8h, v0.8b, v30.8b
+ smull2 v3.8h, v0.16b, v30.16b
+.endif
+ saddw v4.4s, v4.4s, v2.4h
+ saddw2 v5.4s, v5.4s, v2.8h
+ saddw v6.4s, v6.4s, v3.4h
+ saddw2 v7.4s, v7.4s, v3.8h
+.endif
+.if \uv_layout && \elems == 16
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 15
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 9
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.ifc \edge, left
+ increment_seed 4
+ read_rand x12, 11, 3
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12]
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ xtn2 v0.16b, v0.8h
+ ext v4.16b, v4.16b, v4.16b, #12
+.ifc \lag, lag3
+ smov w17, v0.b[13]
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.b[14]
+.endif
+ smov w14, v0.b[15]
+
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v6.16b
+.if \elems == 9
+ mov w15, #1
+ bl output_\lag\()_neon
+ lsr w2, w2, #3
+
+ read_rand x12, 11, 2
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ xtn v1.8b, v1.8h
+ ext v0.16b, v0.16b, v1.16b, #7
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+
+ increment_seed 4, shift=0
+ mov v1.16b, v7.16b
+
+.ifc \edge, right
+ mov w15, #3
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #1
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.endif
+.if \store
+ st1 {v0.16b}, [x0], #16
+.endif
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag1_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 15
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 15
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 9
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 9
+
+.macro sum_lag1 type, dst, left, mid, right, edge=mid
+ mov v3.16b, \mid\().16b
+ ext v0.16b, \left\().16b, \mid\().16b, #15
+ ext v1.16b, \mid\().16b, \right\().16b, #1
+ bl sum_\type\()_lag1_\edge\()_neon
+ mov \dst\().16b, v0.16b
+.endm
+
+.macro sum_y_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 y, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_444, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_422, \dst, \left, \mid, \right, \edge
+.endm
+
+.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
+ sum_lag1 uv_420, \dst, \left, \mid, \right, \edge
+.endm
+
+
+function sum_lag2_above_neon
+ sub x12, x0, #2*GRAIN_WIDTH - 16
+ sub x13, x0, #1*GRAIN_WIDTH - 16
+ ld1 {v18.16b}, [x12] // load top right
+ ld1 {v21.16b}, [x13]
+
+ ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[0]
+ ext v23.16b, v16.16b, v17.16b, #15
+ dup v27.16b, v30.b[1]
+ ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[3]
+ ext v1.16b, v17.16b, v18.16b, #2
+ dup v29.16b, v30.b[4]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v4.8h, v23.8b, v27.8b
+ smull2 v5.8h, v23.16b, v27.16b
+ smull v6.8h, v0.8b, v28.8b
+ smull2 v7.8h, v0.16b, v28.16b
+ smull v0.8h, v1.8b, v29.8b
+ smull2 v1.8h, v1.16b, v29.16b
+ saddl v22.4s, v2.4h, v4.4h
+ saddl2 v23.4s, v2.8h, v4.8h
+ saddl v26.4s, v3.4h, v5.4h
+ saddl2 v27.4s, v3.8h, v5.8h
+ saddl v2.4s, v0.4h, v6.4h
+ saddl2 v3.4s, v0.8h, v6.8h
+ saddl v6.4s, v1.4h, v7.4h
+ saddl2 v7.4s, v1.8h, v7.8h
+ add v4.4s, v22.4s, v2.4s
+ add v5.4s, v23.4s, v3.4s
+ add v6.4s, v26.4s, v6.4s
+ add v7.4s, v27.4s, v7.4s
+
+ ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid
+ dup v26.16b, v30.b[5]
+ ext v23.16b, v19.16b, v20.16b, #15
+ dup v27.16b, v30.b[6]
+ ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v28.16b, v30.b[8]
+ ext v1.16b, v20.16b, v21.16b, #2
+ dup v29.16b, v30.b[9]
+
+ smull v2.8h, v22.8b, v26.8b
+ smull2 v3.8h, v22.16b, v26.16b
+ smull v22.8h, v23.8b, v27.8b
+ smull2 v23.8h, v23.16b, v27.16b
+ smull v26.8h, v0.8b, v28.8b
+ smull2 v27.8h, v0.16b, v28.16b
+ smull v28.8h, v1.8b, v29.8b
+ smull2 v29.8h, v1.16b, v29.16b
+ saddl v0.4s, v2.4h, v22.4h
+ saddl2 v1.4s, v2.8h, v22.8h
+ saddl v2.4s, v3.4h, v23.4h
+ saddl2 v3.4s, v3.8h, v23.8h
+ saddl v22.4s, v26.4h, v28.4h
+ saddl2 v23.4s, v26.8h, v28.8h
+ saddl v26.4s, v27.4h, v29.4h
+ saddl2 v27.4s, v27.8h, v29.8h
+ add v0.4s, v0.4s, v22.4s
+ add v1.4s, v1.4s, v23.4s
+ add v2.4s, v2.4s, v26.4s
+ add v3.4s, v3.4s, v27.4s
+ dup v26.16b, v30.b[2]
+ dup v27.16b, v30.b[7]
+ smull v22.8h, v17.8b, v26.8b
+ smull2 v23.8h, v17.16b, v26.16b
+ smull v24.8h, v20.8b, v27.8b
+ smull2 v25.8h, v20.16b, v27.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ saddl v0.4s, v22.4h, v24.4h
+ saddl2 v1.4s, v22.8h, v24.8h
+ saddl v2.4s, v23.4h, v25.4h
+ saddl2 v3.4s, v23.8h, v25.8h
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag2_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v17.16b}, [x12] // load the previous block right above
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 15
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 15
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 9
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 9
+
+
+function sum_lag3_above_neon
+ sub x11, x0, #3*GRAIN_WIDTH - 16
+ sub x12, x0, #2*GRAIN_WIDTH - 16
+ sub x13, x0, #1*GRAIN_WIDTH - 16
+ ld1 {v15.16b}, [x11] // load top right
+ ld1 {v18.16b}, [x12]
+ ld1 {v21.16b}, [x13]
+
+ ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[0]
+ ext v9.16b, v13.16b, v14.16b, #14
+ dup v23.16b, v29.b[1]
+ ext v10.16b, v13.16b, v14.16b, #15
+ dup v24.16b, v29.b[2]
+ dup v25.16b, v29.b[3]
+ ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[4]
+ ext v12.16b, v14.16b, v15.16b, #2
+ dup v27.16b, v29.b[5]
+ ext v13.16b, v14.16b, v15.16b, #3
+ dup v28.16b, v29.b[6]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v14.8b, v25.8b
+ smull2 v13.8h, v14.16b, v25.16b
+ add v4.4s, v22.4s, v0.4s
+ add v5.4s, v23.4s, v1.4s
+ add v6.4s, v24.4s, v2.4s
+ add v7.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[7]
+ ext v9.16b, v16.16b, v17.16b, #14
+ dup v23.16b, v29.b[8]
+ ext v10.16b, v16.16b, v17.16b, #15
+ dup v24.16b, v29.b[9]
+ dup v25.16b, v29.b[10]
+ ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right
+ dup v26.16b, v29.b[11]
+ ext v12.16b, v17.16b, v18.16b, #2
+ dup v27.16b, v29.b[12]
+ ext v13.16b, v17.16b, v18.16b, #3
+ dup v28.16b, v29.b[13]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v17.8b, v25.8b
+ smull2 v13.8h, v17.16b, v25.16b
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v13.4h
+ saddw2 v7.4s, v7.4s, v13.8h
+
+ ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid
+ dup v22.16b, v29.b[14]
+ ext v9.16b, v19.16b, v20.16b, #14
+ dup v23.16b, v29.b[15]
+ ext v10.16b, v19.16b, v20.16b, #15
+ dup v24.16b, v30.b[0]
+ dup v25.16b, v30.b[1]
+ ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right
+ dup v26.16b, v30.b[2]
+ ext v12.16b, v20.16b, v21.16b, #2
+ dup v27.16b, v30.b[3]
+ ext v13.16b, v20.16b, v21.16b, #3
+ dup v28.16b, v30.b[4]
+
+ smull v0.8h, v8.8b, v22.8b
+ smull2 v1.8h, v8.16b, v22.16b
+ smull v2.8h, v9.8b, v23.8b
+ smull2 v3.8h, v9.16b, v23.16b
+ smull v8.8h, v10.8b, v24.8b
+ smull2 v9.8h, v10.16b, v24.16b
+ smull v10.8h, v11.8b, v26.8b
+ smull2 v11.8h, v11.16b, v26.16b
+ saddl v22.4s, v0.4h, v2.4h
+ saddl2 v23.4s, v0.8h, v2.8h
+ saddl v24.4s, v1.4h, v3.4h
+ saddl2 v26.4s, v1.8h, v3.8h
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ smull v8.8h, v12.8b, v27.8b
+ smull2 v9.8h, v12.16b, v27.16b
+ smull v10.8h, v13.8b, v28.8b
+ smull2 v11.8h, v13.16b, v28.16b
+ smull v12.8h, v20.8b, v25.8b
+ smull2 v19.8h, v20.16b, v25.16b
+ add v22.4s, v22.4s, v0.4s
+ add v23.4s, v23.4s, v1.4s
+ add v24.4s, v24.4s, v2.4s
+ add v26.4s, v26.4s, v3.4s
+ saddl v0.4s, v8.4h, v10.4h
+ saddl2 v1.4s, v8.8h, v10.8h
+ saddl v2.4s, v9.4h, v11.4h
+ saddl2 v3.4s, v9.8h, v11.8h
+ add v4.4s, v4.4s, v22.4s
+ add v5.4s, v5.4s, v23.4s
+ add v6.4s, v6.4s, v24.4s
+ add v7.4s, v7.4s, v26.4s
+ mov v13.16b, v14.16b
+ mov v14.16b, v15.16b
+ add v4.4s, v4.4s, v0.4s
+ add v5.4s, v5.4s, v1.4s
+ add v6.4s, v6.4s, v2.4s
+ add v7.4s, v7.4s, v3.4s
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ saddw v4.4s, v4.4s, v12.4h
+ saddw2 v5.4s, v5.4s, v12.8h
+ saddw v6.4s, v6.4s, v19.4h
+ saddw2 v7.4s, v7.4s, v19.8h
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=16
+function sum_\type\()_lag3_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH
+ sub x12, x0, #2*GRAIN_WIDTH
+ sub x13, x0, #1*GRAIN_WIDTH
+ ld1 {v14.16b}, [x11] // load the previous block right above
+ ld1 {v17.16b}, [x12]
+ ld1 {v20.16b}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 15
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 15
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 9
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 9
+
+function generate_grain_rows_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ get_grain_row v16, v17, v18, v19, v20, v21
+ subs w1, w1, #1
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function generate_grain_rows_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ get_grain_row_44 v16, v17, v18
+ subs w1, w1, #1
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function get_grain_row_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ get_grain_row v16, v17, v18, v19, v20, v21
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function get_grain_row_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ get_grain_row_44 v16, v17, v18
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function add_uv_444_coeff_lag0_neon
+add_coeff_lag0_start:
+ smull v2.8h, v0.8b, v27.8b
+ smull2 v3.8h, v0.16b, v27.16b
+ srshl v2.8h, v2.8h, v28.8h
+ srshl v3.8h, v3.8h, v28.8h
+ saddw v2.8h, v2.8h, v1.8b
+ saddw2 v3.8h, v3.8h, v1.16b
+ sqxtn v2.8b, v2.8h
+ sqxtn2 v2.16b, v3.8h
+ ret
+endfunc
+
+function add_uv_420_coeff_lag0_neon
+ ld1 {v4.16b, v5.16b}, [x19], #32
+ ld1 {v6.16b, v7.16b}, [x12], #32
+ saddlp v4.8h, v4.16b
+ saddlp v5.8h, v5.16b
+ saddlp v6.8h, v6.16b
+ saddlp v7.8h, v7.16b
+ add v4.8h, v4.8h, v6.8h
+ add v5.8h, v5.8h, v7.8h
+ rshrn v4.8b, v4.8h, #2
+ rshrn2 v4.16b, v5.8h, #2
+ and v0.16b, v4.16b, v0.16b
+ b add_coeff_lag0_start
+endfunc
+
+function add_uv_422_coeff_lag0_neon
+ ld1 {v4.16b, v5.16b}, [x19], #32
+ saddlp v4.8h, v4.16b
+ saddlp v5.8h, v5.16b
+ rshrn v4.8b, v4.8h, #1
+ rshrn2 v4.16b, v5.8h, #1
+ and v0.16b, v4.16b, v0.16b
+ b add_coeff_lag0_start
+endfunc
+
+.macro gen_grain_82 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH
+ mov x1, x2
+ mul w13, w13, w14
+.endif
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+.ifc \type, uv_444
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127
+ mov w6, #-128
+
+.ifc \type, uv_444
+ eor w2, w2, w11
+.endif
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ mov w1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ dup v28.8h, w7
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ ext v29.16b, v0.16b, v1.16b, #13
+ ext v30.16b, v1.16b, v0.16b, #1
+ neg v28.8h, v28.8h
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64
+ bl get_grain_row_neon
+ and v0.16b, v22.16b, v29.16b
+ mov v1.16b, v16.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v0.16b, v23.16b
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ ld1 {v26.16b}, [x19], #16
+ mov v0.16b, v24.16b
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ add x19, x19, #2
+ mov v0.16b, v25.16b
+ mov v1.16b, v19.16b
+ mov v18.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ and v0.16b, v26.16b, v30.16b
+ mov v1.16b, v20.16b
+ mov v19.16b, v2.16b
+ bl add_uv_444_coeff_lag0_neon
+ mov v20.16b, v2.16b
+ subs w1, w1, #1
+ store_grain_row v16, v17, v18, v19, v20, v21
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2
+.endif
+
+ mov w1, #3
+.ifc \type, uv_444
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ sum_\type\()_lag1 v22, v16, v16, v17, left
+ sum_\type\()_lag1 v23, v16, v17, v18
+ sum_\type\()_lag1 v24, v17, v18, v19
+ sum_\type\()_lag1 v25, v18, v19, v20
+ sum_\type\()_lag1 v20, v19, v20, v21, right
+ get_grain_2 v21
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ store_grain_row v22, v23, v24, v25, v20, v21
+ mov v16.16b, v22.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ mov v19.16b, v25.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ st1 {v16.h}[0], [x0], #2
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #2
+.endif
+ st1 {v16.h}[0], [x0], #2
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH-(3*32)
+.else
+ sub \reg, \reg, #3*32-GRAIN_WIDTH
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH-3
+ mov x1, x2
+ mul w13, w13, w14
+
+ movrel x3, X(gaussian_sequence)
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1)
+ mov w5, #127
+ mov w6, #-128
+
+ eor w2, w2, w11
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w7
+ ld1r {v27.16b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ ext v29.16b, v0.16b, v1.16b, #13
+ ext v30.16b, v1.16b, v0.16b, #7
+ neg v28.8h, v28.8h
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ bl get_grain_row_44_neon
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH
+.endif
+ mov v0.16b, v29.16b
+ mov v1.16b, v16.16b
+ bl add_\type\()_coeff_lag0_neon
+ movi v0.16b, #255
+ mov v1.16b, v17.16b
+ mov v16.16b, v2.16b
+ bl add_\type\()_coeff_lag0_neon
+ mov v0.16b, v30.16b
+ mov v1.16b, v18.16b
+ mov v17.16b, v2.16b
+ bl add_\type\()_coeff_lag0_neon
+ mov v18.16b, v2.16b
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ store_grain_row_44 v16, v17, v18
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.16b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2
+
+ mov w1, #3
+ ld1r {v30.16b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ sum_\type\()_lag1 v20, v16, v16, v17, left
+ sum_\type\()_lag1 v21, v16, v17, v18
+ sum_\type\()_lag1 v18, v17, v18, v18, right
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ store_grain_row_44 v20, v21, v18
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon
+ bl sum_\type\()_lag2_mid_neon
+ bl sum_\type\()_lag2_right_neon
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH-48
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon
+ bl sum_\type\()_lag3_mid_neon
+ bl sum_\type\()_lag3_right_neon
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH-48
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ umov w14, \src1[0+\off]
+ umov w15, \src2[8+\off]
+ umov w16, \src1[2+\off]
+ add x14, x14, x3
+ umov w17, \src2[10+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14]
+ umov w14, \src1[4+\off]
+ add x16, x16, x3
+ ld1 {\dst2}[8+\off], [x15]
+ umov w15, \src2[12+\off]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6+\off]
+ add x14, x14, x3
+ ld1 {\dst2}[10+\off], [x17]
+ umov w17, \src2[14+\off]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[12+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[14+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2
+ gather_interleaved \dst1, \dst2, \src1, \src2, 0
+ gather_interleaved \dst2, \dst1, \src2, \src1, 0
+ gather_interleaved \dst1, \dst2, \src1, \src2, 1
+ gather_interleaved \dst2, \dst1, \src2, \src1, 1
+.endm
+
+function gather32_neon
+ gather v4.b, v5.b, v0.b, v1.b
+ ret
+endfunc
+
+function gather16_neon
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
+ gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
+ ins v4.d[1], v5.d[1]
+ ret
+endfunc
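+
+// gather32/gather16 perform the per-pixel scaling table lookup, i.e. in C
+// terms (illustrative sketch):
+//
+//   for (int i = 0; i < 32 /* or 16 */; i++)
+//       scaling_val[i] = scaling[src_or_luma[i]];
+//
+// with x3 holding the scaling[] table pointer.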
+
+const overlap_coeffs_0, align=4
+ .byte 27, 17, 0, 0, 0, 0, 0, 0
+ .byte 17, 27, 32, 32, 32, 32, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .byte 23, 0, 0, 0, 0, 0, 0, 0
+ .byte 22, 32, 32, 32, 32, 32, 32, 32
+endconst
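+
+// Grain overlap blending uses these weights as (illustrative sketch, matching
+// the smull/smlal + sqrshrn #5 sequences in the loops below):
+//
+//   g = round2(old * w_old + new * w_new, 5);  // saturated to the int8 range
+//
+// with (w_old, w_new) = (27, 17) and (17, 27) across the two overlapped rows
+// or columns at full resolution (overlap_coeffs_0), and (23, 22) for the
+// single overlapped row/column when subsampled (overlap_coeffs_1); the
+// weight-32 lanes pass the new grain through unchanged, as round2(32*g, 5) == g.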
+
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw // grain_lut += offx
+.endm
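+
+// Together these derive the grain_lut origin of a 32x32 block from its random
+// offset value (illustrative sketch; sx/sy are the chroma subsampling flags):
+//
+//   int offx = (randval >> 4)  << (sx ? 0 : 1);  // doubled when not subsampled
+//   int offy = (randval & 0xF) << (sy ? 0 : 1);
+//   grain = grain_lut_base + offy * grain_stride + offx;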
+
+// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type);
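+//
+// Per pixel the loops below apply, roughly (illustrative C sketch matching the
+// inline comments in fgy_loop_neon; round2/iclip stand for the obvious
+// rounded-shift and clamp helpers):
+//
+//   noise  = round2(scaling[src[x]] * grain[x], scaling_shift);
+//   dst[x] = iclip(src[x] + noise, min, max);  // 16..235 with clip, else 0..255
+//
+// where grain[x] is blended across block borders with the overlap weights
+// above whenever the ox/oy overlap cases are taken.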
+function fgy_32x32_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w6, [x6] // offsets[0][0]
+ ldr w8, [sp, #16] // clip
+ mov x9, #GRAIN_WIDTH // grain_lut stride
+
+ neg w4, w4
+ dup v29.8h, w4 // -scaling_shift
+
+ movrel x16, overlap_coeffs_0
+
+ cbz w8, 1f
+ // clip
+ movi v30.16b, #16
+ movi v31.16b, #235
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+
+ add x5, x5, #9 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9
+ add_offset x14, w13, x14, x5, x9
+ add_offset x16, w15, x16, x5, x9
+ add_offset x5, w6, x10, x5, x9
+
+ ldr w11, [sp, #24] // type
+ adr x13, L(fgy_loop_tbl)
+
+ add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ tst w11, #1
+ ldrh w11, [x13, w11, uxtw #1]
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx
+
+ sub x11, x13, w11, uxtw
+
+ b.eq 1f
+ // y overlap
+ dup v6.16b, v27.b[0]
+ dup v7.16b, v27.b[1]
+ mov w10, w7 // backup actual h
+ mov w7, #2
+1:
+ br x11
+endfunc
+
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x8], x9 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut
+
+ bl gather32_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v7.8b
+.else
+ smull v16.8h, v18.8b, v7.8b
+.endif
+ smull2 v17.8h, v18.16b, v7.16b
+ smull v18.8h, v19.8b, v7.8b
+ smull2 v19.8h, v19.16b, v7.16b
+.if \ox
+ smlal v16.8h, v21.8b, v6.8b
+.else
+ smlal v16.8h, v22.8b, v6.8b
+.endif
+ smlal2 v17.8h, v22.16b, v6.16b
+ smlal v18.8h, v23.8b, v6.8b
+ smlal2 v19.8h, v23.16b, v6.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v0.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v0.16b
+ uaddw v18.8h, v18.8h, v1.8b
+ uaddw2 v19.8h, v19.8h, v1.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+
+ umax v0.16b, v0.16b, v30.16b
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w7, w7, #1
+.if \oy
+ dup v6.16b, v28.b[0]
+ dup v7.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w10, #2
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0)
+.endif
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+L(fgy_loop_tbl):
+ .hword L(fgy_loop_tbl) - L(loop_00)
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
+// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type);
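+//
+// For chroma the scaling index is derived from luma; per pixel, roughly
+// (illustrative sketch of the sx0 path below, with round2/iclip as the obvious
+// rounded-shift and clamp helpers):
+//
+//   val    = csfl ? luma[x]
+//                 : iclip(((luma[x] * uv_luma_mult + src[x] * uv_mult) >> 6)
+//                         + uv_offset, 0, 255);
+//   noise  = round2(scaling[val] * grain[x], scaling_shift);
+//   dst[x] = iclip(src[x] + noise, min, max); // 16..240 (235 if is_id) with clip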
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-32]!
+ str d8, [sp, #16]
+ ldp x8, x9, [sp, #32] // offsets, h
+ ldp x10, x11, [sp, #48] // uv, is_id
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ neg w13, w13 // -scaling_shift
+
+ // !csfl
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ ld1 {v8.h}[0], [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1 {v8.h}[1], [x15] // uv_mult
+
+ dup v29.8h, w13 // -scaling_shift
+
+ cbz w12, 1f
+ // clip
+ movi v30.16b, #16
+ movi v31.16b, #240
+ cbz w11, 2f
+ // is_id
+ movi v31.16b, #235
+ b 2f
+1:
+ // no clip
+ movi v30.16b, #0
+ movi v31.16b, #255
+2:
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH // grain_lut stride
+
+ add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy
+ calc_offset w14, w15, w14, \sx, \sy
+ calc_offset w16, w17, w16, \sx, \sy
+ calc_offset w8, w11, w8, \sx, \sy
+
+ add_offset x13, w12, x13, x5, x10
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10
+
+ add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ ldr w13, [sp, #64] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl)
+
+ ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs
+ tst w13, #1
+ ldrh w13, [x14, w13, uxtw #1]
+
+ b.eq 1f
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy)
+
+1:
+ sub x13, x14, w13, uxtw
+
+.if \sy
+ movi v25.16b, #23
+ movi v26.16b, #22
+.else
+ movi v25.16b, #27
+ movi v26.16b, #17
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
+function fguv_loop_sx0_neon
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b, v7.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut
+
+.if !\csfl
+ uxtl v2.8h, v0.8b
+ uxtl2 v3.8h, v0.16b
+ uxtl v4.8h, v1.8b
+ uxtl2 v5.8h, v1.16b
+ uxtl v0.8h, v6.8b
+ uxtl2 v1.8h, v6.16b
+ uxtl v16.8h, v7.8b
+ uxtl2 v17.8h, v7.16b
+ mul v2.8h, v2.8h, v8.h[0]
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v4.8h, v4.8h, v8.h[0]
+ mul v5.8h, v5.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1]
+ mul v1.8h, v1.8h, v8.h[1]
+ mul v16.8h, v16.8h, v8.h[1]
+ mul v17.8h, v17.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h
+ sqadd v3.8h, v3.8h, v1.8h
+ sqadd v4.8h, v4.8h, v16.8h
+ sqadd v5.8h, v5.8h, v17.8h
+ sshr v2.8h, v2.8h, #6
+ sshr v3.8h, v3.8h, #6
+ sshr v4.8h, v4.8h, #6
+ sshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v24.8h
+ add v3.8h, v3.8h, v24.8h
+ add v4.8h, v4.8h, v24.8h
+ add v5.8h, v5.8h, v24.8h
+ sqxtun v0.8b, v2.8h
+ sqxtun2 v0.16b, v3.8h
+ sqxtun v1.8b, v4.8h
+ sqxtun2 v1.16b, v5.8h
+.endif
+
+ bl gather32_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v26.8b
+.else
+ smull v16.8h, v18.8b, v26.8b
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+ smull v18.8h, v19.8b, v26.8b
+ smull2 v19.8h, v19.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b
+.else
+ smlal v16.8h, v22.8b, v25.8b
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ smlal v18.8h, v23.8b, v25.8b
+ smlal2 v19.8h, v23.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+ sqrshrn v23.8b, v18.8h, #5
+ sqrshrn2 v23.16b, v19.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+ sxtl v18.8h, v23.8b
+ sxtl2 v19.8h, v23.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+ uxtl v4.8h, v5.8b
+ uxtl2 v5.8h, v5.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+ mul v18.8h, v18.8h, v4.8h
+ mul v19.8h, v19.8h, v5.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+ srshl v18.8h, v18.8h, v29.8h
+ srshl v19.8h, v19.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+ uaddw v18.8h, v18.8h, v7.8b
+ uaddw2 v19.8h, v19.8h, v7.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+ sqxtun v1.8b, v18.8h
+ sqxtun2 v1.16b, v19.8h
+
+ umax v0.16b, v0.16b, v30.16b
+ umax v1.16b, v1.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+ umin v1.16b, v1.16b, v31.16b
+
+ subs w9, w9, #1
+.if \oy
+ dup v25.16b, v28.b[0]
+ dup v26.16b, v28.b[1]
+.endif
+ st1 {v0.16b, v1.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ ldr d8, [sp, #16]
+ ldr x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx0_tbl):
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.16b, v1.16b}, [x6], x7 // luma
+ ld1 {v6.16b}, [x1], x2 // src
+.if \ox
+ ld1 {v20.8b}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v22.16b}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v21.8b}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v18.16b}, [x5], x10 // grain_lut
+
+ uaddlp v2.8h, v0.16b
+ uaddlp v3.8h, v1.16b
+.if \csfl
+ rshrn v0.8b, v2.8h, #1
+ rshrn2 v0.16b, v3.8h, #1
+.else
+ urshr v2.8h, v2.8h, #1
+ urshr v3.8h, v3.8h, #1
+ uxtl v0.8h, v6.8b
+ uxtl2 v1.8h, v6.16b
+ mul v2.8h, v2.8h, v8.h[0]
+ mul v3.8h, v3.8h, v8.h[0]
+ mul v0.8h, v0.8h, v8.h[1]
+ mul v1.8h, v1.8h, v8.h[1]
+ sqadd v2.8h, v2.8h, v0.8h
+ sqadd v3.8h, v3.8h, v1.8h
+ sshr v2.8h, v2.8h, #6
+ sshr v3.8h, v3.8h, #6
+ add v2.8h, v2.8h, v24.8h
+ add v3.8h, v3.8h, v24.8h
+ sqxtun v0.8b, v2.8h
+ sqxtun2 v0.16b, v3.8h
+.endif
+
+ bl gather16_neon
+
+.if \ox
+ smull v20.8h, v20.8b, v27.8b
+ smlal v20.8h, v18.8b, v28.8b
+.endif
+
+.if \oy
+.if \ox
+ smull v21.8h, v21.8b, v27.8b
+ smlal v21.8h, v22.8b, v28.8b
+ sqrshrn v20.8b, v20.8h, #5
+ sqrshrn v21.8b, v21.8h, #5
+.endif
+
+.if \ox
+ smull v16.8h, v20.8b, v26.8b
+.else
+ smull v16.8h, v18.8b, v26.8b
+.endif
+ smull2 v17.8h, v18.16b, v26.16b
+.if \ox
+ smlal v16.8h, v21.8b, v25.8b
+.else
+ smlal v16.8h, v22.8b, v25.8b
+.endif
+ smlal2 v17.8h, v22.16b, v25.16b
+ sqrshrn v22.8b, v16.8h, #5
+ sqrshrn2 v22.16b, v17.8h, #5
+.endif
+
+ // sxtl of grain
+.if \oy
+ sxtl v16.8h, v22.8b
+ sxtl2 v17.8h, v22.16b
+.elseif \ox
+ sqrshrn v20.8b, v20.8h, #5
+ sxtl2 v17.8h, v18.16b
+ sxtl v16.8h, v20.8b
+.else
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+.endif
+
+ uxtl v2.8h, v4.8b // scaling
+ uxtl2 v3.8h, v4.16b
+
+ mul v16.8h, v16.8h, v2.8h // scaling * grain
+ mul v17.8h, v17.8h, v3.8h
+
+ srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift)
+ srshl v17.8h, v17.8h, v29.8h
+
+ uaddw v16.8h, v16.8h, v6.8b // *src + noise
+ uaddw2 v17.8h, v17.8h, v6.16b
+
+ sqxtun v0.8b, v16.8h
+ sqxtun2 v0.16b, v17.8h
+
+ umax v0.16b, v0.16b, v30.16b
+ umin v0.16b, v0.16b, v31.16b
+
+.if \oy
+ mov v16.16b, v25.16b
+.endif
+ subs w9, w9, #1
+.if \oy
+ mov v25.16b, v26.16b
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.16b}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ ldr d8, [sp, #16]
+ ldr x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx1_tbl):
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/filmgrain16.S b/third_party/dav1d/src/arm/64/filmgrain16.S
new file mode 100644
index 0000000000..7c4ff6dda9
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/filmgrain16.S
@@ -0,0 +1,1997 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "src/arm/asm-offsets.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
+.macro increment_seed steps, shift=1
+ lsr w11, w2, #3
+ lsr w12, w2, #12
+ lsr w13, w2, #1
+ eor w11, w2, w11 // (r >> 0) ^ (r >> 3)
+ eor w12, w12, w13 // (r >> 12) ^ (r >> 1)
+ eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
+.if \shift
+ lsr w2, w2, #\steps
+.endif
+ and w11, w11, #((1 << \steps) - 1) // bit
+.if \shift
+ orr w2, w2, w11, lsl #(16 - \steps) // *state
+.else
+ orr w2, w2, w11, lsl #16 // *state
+.endif
+.endm
+
+.macro read_rand dest, bits, age
+ ubfx \dest, x2, #16 - \bits - \age, #\bits
+.endm
+
+.macro read_shift_rand dest, bits
+ ubfx \dest, x2, #17 - \bits, #\bits
+ lsr w2, w2, #1
+.endm
+
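+// In scalar form (a sketch of the same idea, not the C code verbatim), one
+// step of the pseudo-random generator these macros batch up is:
+//
+//   unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+//   r = (r >> 1) | (bit << 15);        // advance the 16-bit LFSR state
+//   value = (r >> (16 - bits)) & ((1 << bits) - 1);
+//
+// increment_seed advances the state by \steps bits at once, and read_rand
+// then extracts 11-bit indices into dav1d_gaussian_sequence from it.
+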
+// special calling convention:
+// w2 holds seed
+// x3 holds dav1d_gaussian_sequence
+// clobbers x11-x15
+// returns in v0.8h
+function get_gaussian_neon
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ increment_seed 4
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ read_rand x14, 11, 3
+ ld1 {v0.h}[3], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 2
+ ld1 {v0.h}[4], [x14]
+ add x15, x3, x15, lsl #1
+ read_rand x14, 11, 1
+ ld1 {v0.h}[5], [x15]
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[6], [x14]
+ ld1 {v0.h}[7], [x15]
+ ret
+endfunc
+
+.macro store_grain_row r0, r1, r2, r3, r4, r5
+ st1 {\r0\().16b,\r1\().16b}, [x0], #32
+ st1 {\r2\().16b,\r3\().16b}, [x0], #32
+ st1 {\r4\().16b}, [x0], #16
+ st1 {\r5\().h}[0], [x0], #2
+.endm
+
+function get_grain_2_neon
+ increment_seed 2
+ read_rand x14, 11, 1
+ read_rand x15, 11, 0
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ ld1 {v0.h}[1], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ ret
+endfunc
+
+.macro get_grain_2 dst
+ bl get_grain_2_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+function get_grain_4_neon
+ increment_seed 4
+ read_rand x14, 11, 3
+ read_rand x15, 11, 2
+ add x14, x3, x14, lsl #1
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[0], [x14]
+ read_rand x14, 11, 1
+ ld1 {v0.h}[1], [x15]
+ add x14, x3, x14, lsl #1
+ read_rand x15, 11, 0
+ add x15, x3, x15, lsl #1
+ ld1 {v0.h}[2], [x14]
+ ld1 {v0.h}[3], [x15]
+ srshl v0.4h, v0.4h, v31.4h
+ ret
+endfunc
+
+.macro get_grain_4 dst
+ bl get_grain_4_neon
+.ifnc \dst, v0
+ mov \dst\().8b, v0.8b
+.endif
+.endm
+
+// w15 holds the number of entries to produce
+// w14, w16 and w17 hold the previous output entries
+// v0 holds the vector of produced entries
+// v1 holds the input vector of sums from above
+.macro output_lag n
+function output_lag\n\()_neon
+1:
+ read_shift_rand x13, 11
+ mov w11, v1.s[0]
+ ldrsh w12, [x3, x13, lsl #1]
+ ext v0.16b, v0.16b, v0.16b, #2
+.if \n == 1
+ madd w11, w14, w4, w11 // sum (above) + *coeff * prev output
+.elseif \n == 2
+ madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w14, w17, w11 // += *coeff * prev output 2
+ mov w16, w14
+.else
+ madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1
+ madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
+ madd w11, w14, w21, w11 // += *coeff * prev output 3
+ mov w17, w16
+ mov w16, w14
+.endif
+ add w14, w11, w8 // 1 << (ar_coeff_shift - 1)
+ add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
+ asr w14, w14, w7 // >> ar_coeff_shift
+ asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift)
+ add w14, w14, w12
+ cmp w14, w5
+ csel w14, w14, w5, le
+ cmp w14, w6
+ csel w14, w14, w6, ge
+ subs w15, w15, #1
+ ext v1.16b, v1.16b, v1.16b, #4
+ ins v0.h[7], w14
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
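+// Per generated grain value, the output_lag* functions above compute,
+// roughly (scalar sketch):
+//
+//   int g = round2(gauss, 4 - bitdepth_min_8 + grain_scale_shift);
+//   int s = round2(ar_sum, ar_coeff_shift);  // taps incl. previous outputs
+//   grain = clamp(g + s, grain_min, grain_max);
+//
+// with round2(x, n) = (x + (1 << (n - 1))) >> n, and grain_min/grain_max
+// being the -(128 << bitdepth_min_8) / (128 << bitdepth_min_8) - 1 bounds
+// kept in w6/w5 by the generate_grain functions below.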
+
+function sum_lag1_above_neon
+ sub x12, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v18.8h}, [x12] // load top right
+
+ ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid
+ ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right
+
+ smull v4.4s, v17.4h, v28.4h
+ smlal v4.4s, v0.4h, v27.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v17.8h, v28.8h
+ smlal2 v5.4s, v0.8h, v27.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ ret
+endfunc
+
+.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
+ bl sum_\lag\()_above_neon
+.ifc \type, uv_420
+ add x12, x19, #GRAIN_WIDTH*2
+ ld1 {v22.8h, v23.8h}, [x19], #32
+ ld1 {v24.8h, v25.8h}, [x12]
+ addp v22.8h, v22.8h, v23.8h
+ addp v23.8h, v24.8h, v25.8h
+ add v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #2
+.endif
+.ifc \type, uv_422
+ ld1 {v22.8h, v23.8h}, [x19], #32
+ addp v22.8h, v22.8h, v23.8h
+ srshr v0.8h, v22.8h, #1
+.endif
+.ifc \type, uv_444
+ ld1 {v0.8h}, [x19], #16
+.endif
+.if \uv_layout
+.ifnb \uv_coeff
+ dup v1.8b, \uv_coeff
+ sxtl v1.8h, v1.8b
+ smlal v4.4s, v0.4h, v1.4h
+ smlal2 v5.4s, v0.8h, v1.8h
+.else
+ smlal v4.4s, v0.4h, v30.4h
+ smlal2 v5.4s, v0.8h, v30.8h
+.endif
+.endif
+.if \uv_layout && \elems == 8
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 444 && \elems == 7
+ b sum_\lag\()_y_\edge\()_start
+.elseif \uv_layout == 422 && \elems == 1
+ b sum_\lag\()_uv_420_\edge\()_start
+.else
+sum_\lag\()_\type\()_\edge\()_start:
+.if \elems > 4
+.ifc \edge, left
+ increment_seed 4
+ read_rand x12, 11, 3
+ read_rand x13, 11, 2
+ read_rand x14, 11, 1
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v0.h}[5], [x12]
+ ld1 {v0.h}[6], [x13]
+ ld1 {v0.h}[7], [x14]
+ lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0
+ srshl v0.8h, v0.8h, v31.8h
+ ext v4.16b, v4.16b, v4.16b, #12
+.ifc \lag, lag3
+ smov w17, v0.h[5]
+.endif
+.ifnc \lag, lag1
+ smov w16, v0.h[6]
+.endif
+ smov w14, v0.h[7]
+
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+.else
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+
+ increment_seed 4, shift=0
+ mov v1.16b, v5.16b
+.ifc \edge, right
+ mov w15, #3
+ bl output_\lag\()_neon
+ read_shift_rand x15, 11
+ add x15, x3, x15, lsl #1
+ ld1 {v1.h}[0], [x15]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #2
+.else
+ mov w15, #4
+ bl output_\lag\()_neon
+.endif
+.else
+ // elems == 1
+ increment_seed 4, shift=0
+ mov v1.16b, v4.16b
+ mov w15, #1
+ bl output_\lag\()_neon
+ lsr w2, w2, #3
+
+ read_rand x12, 11, 2
+ read_rand x13, 11, 1
+ read_rand x14, 11, 0
+ add x12, x3, x12, lsl #1
+ add x13, x3, x13, lsl #1
+ add x14, x3, x14, lsl #1
+ ld1 {v1.h}[0], [x12]
+ ld1 {v1.h}[1], [x13]
+ ld1 {v1.h}[2], [x14]
+ srshl v1.4h, v1.4h, v31.4h
+ ext v0.16b, v0.16b, v1.16b, #14
+.endif
+ st1 {v0.8h}, [x0], #16
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endif
+.endm
+
+.macro sum_lag1_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag1_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+.endif
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems
+endfunc
+.endm
+
+sum_lag1_func y, 0, left
+sum_lag1_func y, 0, mid
+sum_lag1_func y, 0, right, 7
+sum_lag1_func uv_444, 444, left
+sum_lag1_func uv_444, 444, mid
+sum_lag1_func uv_444, 444, right, 7
+sum_lag1_func uv_422, 422, left
+sum_lag1_func uv_422, 422, mid
+sum_lag1_func uv_422, 422, right, 1
+sum_lag1_func uv_420, 420, left
+sum_lag1_func uv_420, 420, mid
+sum_lag1_func uv_420, 420, right, 1
+
+
+function sum_lag2_above_neon
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v18.8h}, [x12] // load top right
+ ld1 {v21.8h}, [x13]
+
+ dup v26.8b, v30.b[0]
+ ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid
+ dup v27.8b, v30.b[1]
+ ext v23.16b, v16.16b, v17.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[3]
+ ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.8b, v30.b[4]
+ ext v1.16b, v17.16b, v18.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smull v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smull2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ dup v26.16b, v30.b[5]
+ ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid
+ dup v27.16b, v30.b[6]
+ ext v23.16b, v19.16b, v20.16b, #14
+ sxtl v26.8h, v26.8b
+ dup v28.16b, v30.b[8]
+ ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v27.8h, v27.8b
+ dup v29.16b, v30.b[9]
+ ext v1.16b, v20.16b, v21.16b, #4
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+
+ smlal v4.4s, v22.4h, v26.4h
+ smlal v4.4s, v23.4h, v27.4h
+ smlal v4.4s, v0.4h, v28.4h
+ smlal v4.4s, v1.4h, v29.4h
+ smlal2 v5.4s, v22.8h, v26.8h
+ smlal2 v5.4s, v23.8h, v27.8h
+ smlal2 v5.4s, v0.8h, v28.8h
+ smlal2 v5.4s, v1.8h, v29.8h
+
+ dup v26.16b, v30.b[2]
+ dup v27.16b, v30.b[7]
+ sxtl v26.8h, v26.8b
+ sxtl v27.8h, v27.8b
+
+ smlal v4.4s, v17.4h, v26.4h
+ smlal v4.4s, v20.4h, v27.4h
+ smlal2 v5.4s, v17.8h, v26.8h
+ smlal2 v5.4s, v20.8h, v27.8h
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag2_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag2_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v17.8h}, [x12] // load the previous block right above
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
+endfunc
+.endm
+
+sum_lag2_func y, 0, left
+sum_lag2_func y, 0, mid
+sum_lag2_func y, 0, right, 7
+sum_lag2_func uv_444, 444, left
+sum_lag2_func uv_444, 444, mid
+sum_lag2_func uv_444, 444, right, 7
+sum_lag2_func uv_422, 422, left
+sum_lag2_func uv_422, 422, mid
+sum_lag2_func uv_422, 422, right, 1
+sum_lag2_func uv_420, 420, left
+sum_lag2_func uv_420, 420, mid
+sum_lag2_func uv_420, 420, right, 1
+
+
+function sum_lag3_above_neon
+ sub x11, x0, #3*GRAIN_WIDTH*2 - 16
+ sub x12, x0, #2*GRAIN_WIDTH*2 - 16
+ sub x13, x0, #1*GRAIN_WIDTH*2 - 16
+ ld1 {v15.8h}, [x11] // load top right
+ ld1 {v18.8h}, [x12]
+ ld1 {v21.8h}, [x13]
+
+ dup v22.8b, v29.b[0]
+ ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[1]
+ ext v9.16b, v13.16b, v14.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[2]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[3]
+ ext v10.16b, v13.16b, v14.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[4]
+ ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[5]
+ ext v12.16b, v14.16b, v15.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[6]
+ ext v13.16b, v14.16b, v15.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smull v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v14.4h, v25.4h
+ smull2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v14.8h, v25.8h
+
+ dup v22.8b, v29.b[7]
+ ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[8]
+ ext v9.16b, v16.16b, v17.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v29.b[9]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v29.b[10]
+ ext v10.16b, v16.16b, v17.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v29.b[11]
+ ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v29.b[12]
+ ext v12.16b, v17.16b, v18.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v29.b[13]
+ ext v13.16b, v17.16b, v18.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v17.4h, v25.4h
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v17.8h, v25.8h
+
+ dup v22.8b, v29.b[14]
+ ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid
+ dup v23.8b, v29.b[15]
+ ext v9.16b, v19.16b, v20.16b, #12
+ sxtl v22.8h, v22.8b
+ dup v24.8b, v30.b[0]
+ sxtl v23.8h, v23.8b
+ dup v25.8b, v30.b[1]
+ ext v10.16b, v19.16b, v20.16b, #14
+ sxtl v24.8h, v24.8b
+ dup v26.8b, v30.b[2]
+ ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right
+ sxtl v25.8h, v25.8b
+ dup v27.8b, v30.b[3]
+ ext v12.16b, v20.16b, v21.16b, #4
+ sxtl v26.8h, v26.8b
+ dup v28.8b, v30.b[4]
+ ext v13.16b, v20.16b, v21.16b, #6
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+
+ smlal v4.4s, v8.4h, v22.4h
+ smlal v4.4s, v9.4h, v23.4h
+ smlal v4.4s, v10.4h, v24.4h
+ smlal v4.4s, v11.4h, v26.4h
+ smlal v4.4s, v12.4h, v27.4h
+ smlal v4.4s, v13.4h, v28.4h
+ smlal v4.4s, v20.4h, v25.4h
+ mov v16.16b, v17.16b
+ mov v17.16b, v18.16b
+ smlal2 v5.4s, v8.8h, v22.8h
+ smlal2 v5.4s, v9.8h, v23.8h
+ smlal2 v5.4s, v10.8h, v24.8h
+ smlal2 v5.4s, v11.8h, v26.8h
+ smlal2 v5.4s, v12.8h, v27.8h
+ smlal2 v5.4s, v13.8h, v28.8h
+ smlal2 v5.4s, v20.8h, v25.8h
+
+ mov v13.16b, v14.16b
+ mov v14.16b, v15.16b
+
+ mov v19.16b, v20.16b
+ mov v20.16b, v21.16b
+ ret
+endfunc
+
+.macro sum_lag3_func type, uv_layout, edge, elems=8
+function sum_\type\()_lag3_\edge\()_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+.ifc \edge, left
+ sub x11, x0, #3*GRAIN_WIDTH*2
+ sub x12, x0, #2*GRAIN_WIDTH*2
+ sub x13, x0, #1*GRAIN_WIDTH*2
+ ld1 {v14.8h}, [x11] // load the previous block right above
+ ld1 {v17.8h}, [x12]
+ ld1 {v20.8h}, [x13]
+.endif
+ sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
+endfunc
+.endm
+
+sum_lag3_func y, 0, left
+sum_lag3_func y, 0, mid
+sum_lag3_func y, 0, right, 7
+sum_lag3_func uv_444, 444, left
+sum_lag3_func uv_444, 444, mid
+sum_lag3_func uv_444, 444, right, 7
+sum_lag3_func uv_422, 422, left
+sum_lag3_func uv_422, 422, mid
+sum_lag3_func uv_422, 422, right, 1
+sum_lag3_func uv_420, 420, left
+sum_lag3_func uv_420, 420, mid
+sum_lag3_func uv_420, 420, right, 1
+
+function generate_grain_rows_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ mov w16, #80
+2:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_2 v0
+ subs w1, w1, #1
+ st1 {v0.s}[0], [x0], #4
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function generate_grain_rows_44_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+1:
+ mov w16, #40
+2:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+ subs w16, w16, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 2b
+ get_grain_4 v0
+ subs w1, w1, #1
+ st1 {v0.4h}, [x0]
+ add x0, x0, #GRAIN_WIDTH*2-80
+ b.gt 1b
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function gen_grain_uv_444_lag0_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v4.8h}, [x19], #16
+gen_grain_uv_lag0_8_start:
+ bl get_gaussian_neon
+ srshl v0.8h, v0.8h, v31.8h
+gen_grain_uv_lag0_8_add:
+ and v4.16b, v4.16b, v1.16b
+ smull v2.4s, v4.4h, v27.4h
+ smull2 v3.4s, v4.8h, v27.8h
+ srshl v2.4s, v2.4s, v28.4s
+ srshl v3.4s, v3.4s, v28.4s
+ sqxtn v2.4h, v2.4s
+ sqxtn2 v2.8h, v3.4s
+ sqadd v2.8h, v2.8h, v0.8h
+ smin v2.8h, v2.8h, v25.8h
+ smax v2.8h, v2.8h, v26.8h
+ st1 {v2.8h}, [x0], #16
+ ldr x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+endfunc
+
+function gen_grain_uv_420_lag0_8_neon
+ AARCH64_SIGN_LINK_REGISTER
+ add x12, x19, #GRAIN_WIDTH*2
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32
+ ld1 {v18.8h, v19.8h}, [x12]
+ addp v16.8h, v16.8h, v17.8h
+ addp v17.8h, v18.8h, v19.8h
+ add v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #2
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_422_lag0_8_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.8h, v17.8h}, [x19], #32
+ addp v16.8h, v16.8h, v17.8h
+ srshr v4.8h, v16.8h, #1
+ b gen_grain_uv_lag0_8_start
+endfunc
+
+function gen_grain_uv_420_lag0_4_neon
+ add x12, x19, #GRAIN_WIDTH*2
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19]
+ ld1 {v18.4h, v19.4h}, [x12]
+ add x19, x19, #32
+ addp v16.4h, v16.4h, v17.4h
+ addp v17.4h, v18.4h, v19.4h
+ add v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #2
+ get_grain_4 v0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
+function gen_grain_uv_422_lag0_4_neon
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-16]!
+ ld1 {v16.4h, v17.4h}, [x19]
+ add x19, x19, #32
+ addp v16.4h, v16.4h, v17.4h
+ srshr v4.4h, v16.4h, #1
+ get_grain_4 v0
+ b gen_grain_uv_lag0_8_add
+endfunc
+
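+// The uv_420/uv_422 lag0 helpers above first downsample the co-located luma
+// grain before mixing it in; in scalar terms this is approximately:
+//
+//   // 4:2:0: average a 2x2 block of luma grain, with rounding
+//   l = (lum[0][2*x] + lum[0][2*x+1] +
+//        lum[1][2*x] + lum[1][2*x+1] + 2) >> 2;
+//   // 4:2:2: average a horizontal pair, with rounding
+//   l = (lum[0][2*x] + lum[0][2*x+1] + 1) >> 1;
+//
+// (lum[] is just shorthand for two consecutive rows of the luma grain_lut.)
+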
+.macro gen_grain_82 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+.ifc \type, uv_444
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #3*GRAIN_WIDTH*2
+ mov x1, x2
+ mul w13, w13, w14
+ clz w15, w4
+.else
+ clz w15, w2
+.endif
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+.ifc \type, y
+ add x4, x1, #FGD_AR_COEFFS_Y
+.else
+ add x4, x1, #FGD_AR_COEFFS_UV
+.endif
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+.ifc \type, uv_444
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+.endif
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+.ifc \type, uv_444
+ eor w2, w2, w11
+.endif
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, y
+ mov w1, #GRAIN_HEIGHT
+ bl generate_grain_rows_neon
+.else
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5
+ dup v26.8h, w6
+ ext v29.16b, v0.16b, v1.16b, #10
+ ext v30.16b, v1.16b, v0.16b, #2
+ neg v28.4s, v28.4s
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+ mov w1, #GRAIN_HEIGHT-3
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_uv_444_lag0_neon // 8
+ movi v1.16b, #255
+ bl gen_grain_uv_444_lag0_neon // 16
+ bl gen_grain_uv_444_lag0_neon // 24
+ bl gen_grain_uv_444_lag0_neon // 32
+ bl gen_grain_uv_444_lag0_neon // 40
+ bl gen_grain_uv_444_lag0_neon // 48
+ bl gen_grain_uv_444_lag0_neon // 56
+ bl gen_grain_uv_444_lag0_neon // 64
+ bl gen_grain_uv_444_lag0_neon // 72
+ mov v1.16b, v30.16b
+ bl gen_grain_uv_444_lag0_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+ add x19, x19, #4
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+.endif
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_y[2]
+.ifc \type, y
+ ldrsb w4, [x4, #1] // ar_coeffs_y[3]
+.else
+ add x4, x4, #2
+.endif
+
+ mov w1, #3
+.ifc \type, uv_444
+ ld1r {v30.8b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+.endif
+ bl generate_grain_rows_neon
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+.ifc \type, uv_444
+ sxtl v30.8h, v30.8b
+.endif
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_mid_neon // 48
+ bl sum_\type\()_lag1_mid_neon // 56
+ bl sum_\type\()_lag1_mid_neon // 64
+ bl sum_\type\()_lag1_mid_neon // 72
+ bl sum_\type\()_lag1_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_mid_neon // 48
+ bl sum_\type\()_lag2_mid_neon // 56
+ bl sum_\type\()_lag2_mid_neon // 64
+ bl sum_\type\()_lag2_mid_neon // 72
+ bl sum_\type\()_lag2_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_neon
+
+ mov w1, #GRAIN_HEIGHT - 3
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_mid_neon // 48
+ bl sum_\type\()_lag3_mid_neon // 56
+ bl sum_\type\()_lag3_mid_neon // 64
+ bl sum_\type\()_lag3_mid_neon // 72
+ bl sum_\type\()_lag3_right_neon // 80
+ get_grain_2 v16
+ subs w1, w1, #1
+.ifc \type, uv_444
+ add x19, x19, #4
+.endif
+ st1 {v16.s}[0], [x0], #4
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_82 y
+gen_grain_82 uv_444
+
+.macro set_height dst, type
+.ifc \type, uv_420
+ mov \dst, #SUB_GRAIN_HEIGHT-3
+.else
+ mov \dst, #GRAIN_HEIGHT-3
+.endif
+.endm
+
+.macro increment_y_ptr reg, type
+.ifc \type, uv_420
+ add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
+.else
+ sub \reg, \reg, #6*32-GRAIN_WIDTH*2
+.endif
+.endm
+
+.macro gen_grain_44 type
+function generate_grain_\type\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x30, x19, [sp, #-96]!
+
+ mov w13, w3
+ mov w14, #28
+ add x19, x1, #(3*GRAIN_WIDTH-3)*2
+ mov x1, x2
+ mul w13, w13, w14
+ clz w15, w4
+
+ movrel x3, X(gaussian_sequence)
+ sub w15, w15, #24 // -bitdepth_min_8
+ ldr w2, [x1, #FGD_SEED]
+ ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
+ add x4, x1, #FGD_AR_COEFFS_UV
+ add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
+ adr x16, L(gen_grain_\type\()_tbl)
+ ldr w17, [x1, #FGD_AR_COEFF_LAG]
+ add w9, w9, #4
+ ldrh w17, [x16, w17, uxtw #1]
+ dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
+ sub x16, x16, w17, uxtw
+ neg v31.8h, v31.8h
+
+ cmp w13, #0
+ mov w11, #0x49d8
+ mov w14, #0xb524
+ add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1]
+ csel w11, w11, w14, ne
+
+ ldr w7, [x1, #FGD_AR_COEFF_SHIFT]
+ neg w15, w15 // bitdepth_min_8
+ mov w8, #1
+ mov w10, #1
+ lsl w8, w8, w7 // 1 << ar_coeff_shift
+ lsl w10, w10, w9 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift)
+ lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1)
+ lsr w10, w10, #1 // 1 << (4 - bitdepth_min_8 + data->grain_scale_shift - 1)
+ mov w5, #128
+ lsl w5, w5, w15 // 128 << bitdepth_min_8
+ neg w6, w5 // -(128 << bitdepth_min_8)
+ sub w5, w5, #1 // (128 << bitdepth_min_8) - 1
+
+ eor w2, w2, w11
+
+ br x16
+
+L(generate_grain_\type\()_lag0):
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.4s, w7
+ ld1r {v27.8b}, [x4] // ar_coeffs_uv[0]
+ movi v0.16b, #0
+ movi v1.16b, #255
+ dup v25.8h, w5
+ dup v26.8h, w6
+ ext v29.16b, v0.16b, v1.16b, #10
+ ext v30.16b, v1.16b, v0.16b, #14
+ neg v28.4s, v28.4s
+ sxtl v27.8h, v27.8b
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+ set_height w1, \type
+1:
+ mov v1.16b, v29.16b
+ bl gen_grain_\type\()_lag0_8_neon // 8
+ movi v1.16b, #255
+ bl gen_grain_\type\()_lag0_8_neon // 16
+ bl gen_grain_\type\()_lag0_8_neon // 24
+ bl gen_grain_\type\()_lag0_8_neon // 32
+ bl gen_grain_\type\()_lag0_8_neon // 40
+ mov v1.16b, v30.16b
+ bl gen_grain_\type\()_lag0_4_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag1):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0]
+ ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1]
+ ld1r {v29.8b}, [x4] // ar_coeffs_uv[2]
+ add x4, x4, #2
+
+ mov w1, #3
+ ld1r {v30.8b}, [x4] // ar_coeffs_uv[4]
+ ldursb w4, [x4, #-1] // ar_coeffs_uv[3]
+ bl generate_grain_rows_44_neon
+
+ sxtl v27.8h, v27.8b
+ sxtl v28.8h, v28.8b
+ sxtl v29.8h, v29.8b
+ sxtl v30.8h, v30.8b
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag1_left_neon // 8
+ bl sum_\type\()_lag1_mid_neon // 16
+ bl sum_\type\()_lag1_mid_neon // 24
+ bl sum_\type\()_lag1_mid_neon // 32
+ bl sum_\type\()_lag1_mid_neon // 40
+ bl sum_\type\()_lag1_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag2):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12]
+
+ smov w4, v30.b[10]
+ smov w17, v30.b[11]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag2_left_neon // 8
+ bl sum_\type\()_lag2_mid_neon // 16
+ bl sum_\type\()_lag2_mid_neon // 24
+ bl sum_\type\()_lag2_mid_neon // 32
+ bl sum_\type\()_lag2_mid_neon // 40
+ bl sum_\type\()_lag2_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(generate_grain_\type\()_lag3):
+ AARCH64_VALID_JUMP_TARGET
+ ldr q29, [x4] // ar_coeffs_uv[0-15]
+ ldr q30, [x4, #16] // ar_coeffs_uv[16-24]
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x20, x21, [sp, #80]
+
+ smov w4, v30.b[5]
+ smov w20, v30.b[6]
+ smov w21, v30.b[7]
+
+ mov w1, #3
+ bl generate_grain_rows_44_neon
+
+ set_height w1, \type
+1:
+ bl sum_\type\()_lag3_left_neon // 8
+ bl sum_\type\()_lag3_mid_neon // 16
+ bl sum_\type\()_lag3_mid_neon // 24
+ bl sum_\type\()_lag3_mid_neon // 32
+ bl sum_\type\()_lag3_mid_neon // 40
+ bl sum_\type\()_lag3_right_neon // 44
+ subs w1, w1, #1
+ increment_y_ptr x19, \type
+ add x0, x0, #GRAIN_WIDTH*2-6*16
+ b.gt 1b
+
+ ldp x20, x21, [sp, #80]
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldp x30, x19, [sp], #96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(gen_grain_\type\()_tbl):
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
+ .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
+endfunc
+.endm
+
+gen_grain_44 uv_420
+gen_grain_44 uv_422
+
+.macro gather_interleaved dst1, dst2, src1, src2, off
+ umov w14, \src1[0]
+ umov w15, \src2[1]
+ umov w16, \src1[2]
+ add x14, x14, x3
+ umov w17, \src2[3]
+ add x15, x15, x3
+ ld1 {\dst1}[0+\off], [x14]
+ umov w14, \src1[4]
+ add x16, x16, x3
+ ld1 {\dst2}[1+\off], [x15]
+ umov w15, \src2[5]
+ add x17, x17, x3
+ ld1 {\dst1}[2+\off], [x16]
+ umov w16, \src1[6]
+ add x14, x14, x3
+ ld1 {\dst2}[3+\off], [x17]
+ umov w17, \src2[7]
+ add x15, x15, x3
+ ld1 {\dst1}[4+\off], [x14]
+ add x16, x16, x3
+ ld1 {\dst2}[5+\off], [x15]
+ add x17, x17, x3
+ ld1 {\dst1}[6+\off], [x16]
+ ld1 {\dst2}[7+\off], [x17]
+.endm
+
+.macro gather dst1, dst2, src1, src2, src3, src4
+ gather_interleaved \dst1, \dst2, \src1, \src3, 0
+ gather_interleaved \dst2, \dst1, \src3, \src1, 0
+ gather_interleaved \dst1, \dst2, \src2, \src4, 8
+ gather_interleaved \dst2, \dst1, \src4, \src2, 8
+.endm
+
+function gather32_neon
+ gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
+ ret
+endfunc
+
+function gather16_neon
+ gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
+ gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
+ ins v6.d[1], v7.d[0]
+ ret
+endfunc
+
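+// gather32_neon/gather16_neon do the per-pixel scaling table lookup, which
+// has no direct vector equivalent; in scalar terms (illustrative only):
+//
+//   for (int x = 0; x < 32 /* or 16 */; x++)
+//       scal[x] = scaling[val[x]];   // val[] = clipped source/luma pixels
+//
+// The bytes land interleaved in v6/v7 and are widened to 16 bit afterwards.
+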
+const overlap_coeffs_0, align=4
+ .short 27, 17, 0, 0
+ .short 17, 27, 32, 32
+endconst
+
+const overlap_coeffs_1, align=4
+ .short 23, 0, 0, 0
+ .short 22, 32, 32, 32
+endconst
+
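+// These constants are the blend weights for the overlapped border of each
+// 32x32 block; conceptually (sketch only), for overlap column/row i:
+//
+//   blended = round2(old_grain * w[i][0] + cur_grain * w[i][1], 5);
+//   // luma / non-subsampled: w = {{27, 17}, {17, 27}}  (overlap_coeffs_0)
+//   // subsampled chroma:     w = {{23, 22}}            (overlap_coeffs_1)
+//
+// The trailing 32 entries let the same multiply pass non-overlapped lanes
+// through unchanged (0 * old + 32 * cur == cur << 5).
+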
+.macro calc_offset offx, offy, src, sx, sy
+ and \offy, \src, #0xF // randval & 0xF
+ lsr \offx, \src, #4 // randval >> 4
+.if \sy == 0
+ add \offy, \offy, \offy // 2 * (randval & 0xF)
+.endif
+.if \sx == 0
+ add \offx, \offx, \offx // 2 * (randval >> 4)
+.endif
+.endm
+
+.macro add_offset dst, offx, offy, src, stride
+ madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
+ add \dst, \dst, \offx, uxtw #1 // grain_lut += offx
+.endm
+
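+// Each 32x32 block derives its position within grain_lut from one random
+// byte; in scalar form (illustrative, entries here are 2 bytes wide):
+//
+//   offy = (randval & 0xF) << (1 - sy);   // doubled when not subsampled
+//   offx = (randval >> 4)  << (1 - sx);
+//   grain = grain_lut_base + offy * GRAIN_WIDTH + offx;  // element offset
+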
+// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const int scaling_shift,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const int offsets[][2],
+// const int h, const ptrdiff_t clip,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+function fgy_32x32_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]!
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ str d14, [sp, #64]
+ eor w4, w4, #15 // 15 - scaling_shift
+ ldr w11, [x6, #8] // offsets[1][0]
+ ldr w13, [x6, #4] // offsets[0][1]
+ ldr w15, [x6, #12] // offsets[1][1]
+ ldr w10, [sp, #96] // bitdepth_max
+ ldr w6, [x6] // offsets[0][0]
+ dup v26.8h, w10 // bitdepth_max
+ clz w10, w10
+ ldr w8, [sp, #80] // clip
+ sub w10, w10, #24 // -bitdepth_min_8
+ mov x9, #GRAIN_WIDTH*2 // grain_lut stride
+ neg w10, w10 // bitdepth_min_8
+
+ dup v29.8h, w4 // 15 - scaling_shift
+ dup v27.8h, w10 // bitdepth_min_8
+
+ movrel x16, overlap_coeffs_0
+
+ cbz w8, 1f
+ // clip
+ movi v30.8h, #16
+ movi v31.8h, #235
+ sshl v30.8h, v30.8h, v27.8h
+ sshl v31.8h, v31.8h, v27.8h
+ b 2f
+1:
+ // no clip
+ movi v30.8h, #0
+ mov v31.16b, v26.16b // bitdepth_max
+2:
+
+ ushr v26.8h, v26.8h, #1 // grain_max
+ not v25.16b, v26.16b // grain_min
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
+
+ add x5, x5, #18 // grain_lut += 9
+ add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x9 // grain_lut += grain_stride
+
+ calc_offset w11, w12, w11, 0, 0
+ calc_offset w13, w14, w13, 0, 0
+ calc_offset w15, w16, w15, 0, 0
+ calc_offset w6, w10, w6, 0, 0
+
+ add_offset x12, w11, x12, x5, x9
+ add_offset x14, w13, x14, x5, x9
+ add_offset x16, w15, x16, x5, x9
+ add_offset x5, w6, x10, x5, x9
+
+ ldr w11, [sp, #88] // type
+ adr x13, L(fgy_loop_tbl)
+
+ add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx
+ add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+
+ tst w11, #1
+ ldrh w11, [x13, w11, uxtw #1]
+
+ add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx
+
+ sub x11, x13, w11, uxtw
+
+ b.eq 1f
+ // y overlap
+ dup v8.8h, v27.h[0]
+ dup v9.8h, v27.h[1]
+ mov w10, w7 // backup actual h
+ mov w7, #2
+1:
+ br x11
+endfunc
+
+function fgy_loop_neon
+.macro fgy ox, oy
+L(loop_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src
+.if \ox
+ ld1 {v20.4h}, [x4], x9 // grain_lut old
+.endif
+.if \oy
+ ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v14.4h}, [x8], x9 // grain_lut top old
+.endif
+ mvni v4.8h, #0xf0, lsl #8 // 0x0fff
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut
+
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v4.16b
+ and v1.16b, v1.16b, v4.16b
+ and v2.16b, v2.16b, v4.16b
+ and v3.16b, v3.16b, v4.16b
+ bl gather32_neon
+
+.if \ox
+ smull v20.4s, v20.4h, v27.4h
+ smlal v20.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v14.4s, v14.4h, v27.4h
+ smlal v14.4s, v21.4h, v28.4h
+ sqrshrn v20.4h, v20.4s, #5
+ sqrshrn v14.4h, v14.4s, #5
+ smin v20.4h, v20.4h, v26.4h
+ smin v14.4h, v14.4h, v26.4h
+ smax v20.4h, v20.4h, v25.4h
+ smax v14.4h, v14.4h, v25.4h
+.endif
+
+.if \ox
+ smull v10.4s, v20.4h, v9.4h
+.else
+ smull v10.4s, v16.4h, v9.4h
+.endif
+ smull2 v11.4s, v16.8h, v9.8h
+ smull v12.4s, v17.4h, v9.4h
+ smull2 v13.4s, v17.8h, v9.8h
+ smull v16.4s, v18.4h, v9.4h
+ smull2 v17.4s, v18.8h, v9.8h
+ smull v18.4s, v19.4h, v9.4h
+ smull2 v19.4s, v19.8h, v9.8h
+.if \ox
+ smlal v10.4s, v14.4h, v8.4h
+.else
+ smlal v10.4s, v21.4h, v8.4h
+.endif
+ smlal2 v11.4s, v21.8h, v8.8h
+ smlal v12.4s, v22.4h, v8.4h
+ smlal2 v13.4s, v22.8h, v8.8h
+ smlal v16.4s, v23.4h, v8.4h
+ smlal2 v17.4s, v23.8h, v8.8h
+ smlal v18.4s, v24.4h, v8.4h
+ smlal2 v19.4s, v24.8h, v8.8h
+ sqrshrn v10.4h, v10.4s, #5
+ sqrshrn2 v10.8h, v11.4s, #5
+ sqrshrn v11.4h, v12.4s, #5
+ sqrshrn2 v11.8h, v13.4s, #5
+ sqrshrn v12.4h, v16.4s, #5
+ sqrshrn2 v12.8h, v17.4s, #5
+ sqrshrn v13.4h, v18.4s, #5
+ sqrshrn2 v13.8h, v19.4s, #5
+ smin v16.8h, v10.8h, v26.8h
+ smin v17.8h, v11.8h, v26.8h
+ smin v18.8h, v12.8h, v26.8h
+ smin v19.8h, v13.8h, v26.8h
+ smax v16.8h, v16.8h, v25.8h
+ smax v17.8h, v17.8h, v25.8h
+ smax v18.8h, v18.8h, v25.8h
+ smax v19.8h, v19.8h, v25.8h
+.endif
+
+ uxtl v4.8h, v6.8b // scaling
+.if \ox && !\oy
+ sqrshrn v20.4h, v20.4s, #5
+.endif
+ uxtl2 v5.8h, v6.16b
+.if \ox && !\oy
+ smin v20.4h, v20.4h, v26.4h
+.endif
+ uxtl v6.8h, v7.8b
+.if \ox && !\oy
+ smax v20.4h, v20.4h, v25.4h
+.endif
+ uxtl2 v7.8h, v7.16b
+.if \ox && !\oy
+ ins v16.d[0], v20.d[0]
+.endif
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v21.8h, v17.8h, v5.8h
+ sqrdmulh v22.8h, v18.8h, v6.8h
+ sqrdmulh v23.8h, v19.8h, v7.8h
+
+ usqadd v0.8h, v20.8h // *src + noise
+ usqadd v1.8h, v21.8h
+ usqadd v2.8h, v22.8h
+ usqadd v3.8h, v23.8h
+
+ umax v0.8h, v0.8h, v30.8h
+ umax v1.8h, v1.8h, v30.8h
+ umax v2.8h, v2.8h, v30.8h
+ umax v3.8h, v3.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w7, w7, #1
+.if \oy
+ dup v8.8h, v28.h[0]
+ dup v9.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w10, #2
+ sub w7, w10, #2 // restore actual remaining h
+ b.gt L(loop_\ox\()0)
+.endif
+ ldr d14, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.endm
+
+ fgy 0, 0
+ fgy 0, 1
+ fgy 1, 0
+ fgy 1, 1
+
+L(fgy_loop_tbl):
+ .hword L(fgy_loop_tbl) - L(loop_00)
+ .hword L(fgy_loop_tbl) - L(loop_01)
+ .hword L(fgy_loop_tbl) - L(loop_10)
+ .hword L(fgy_loop_tbl) - L(loop_11)
+endfunc
+
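+// For 16 bpc the scaling step uses sqrdmulh instead of a widening multiply
+// plus shift; the identity relied on is (sketch, ignoring saturation):
+//
+//   sqrdmulh(grain, scaling << (15 - scaling_shift))
+//       == round2(grain * (scaling << (15 - scaling_shift)), 15)
+//       == round2(grain * scaling, scaling_shift)
+//
+// so the noise term is still round2(scaling[val] * grain, scaling_shift),
+// and usqadd/umax/umin then form dst = clamp(src + noise, min, max).
+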
+// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
+// const pixel *const src,
+// const ptrdiff_t stride,
+// const uint8_t scaling[SCALING_SIZE],
+// const Dav1dFilmGrainData *const data,
+// const entry grain_lut[][GRAIN_WIDTH],
+// const pixel *const luma_row,
+// const ptrdiff_t luma_stride,
+// const int offsets[][2],
+// const ptrdiff_t h, const ptrdiff_t uv,
+// const ptrdiff_t is_id,
+// const ptrdiff_t type,
+// const int bitdepth_max);
+.macro fguv layout, sx, sy
+function fguv_32x32_\layout\()_16bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ str x30, [sp, #-80]!
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+
+ ldp x8, x9, [sp, #80] // offsets, h
+ ldp x10, x11, [sp, #96] // uv, is_id
+ ldr w16, [sp, #120] // bitdepth_max
+
+ ldr w13, [x4, #FGD_SCALING_SHIFT]
+ ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
+ dup v23.8h, w16 // bitdepth_max
+ clz w16, w16
+ eor w13, w13, #15 // 15 - scaling_shift
+ sub w16, w16, #24 // -bitdepth_min_8
+
+ // !csfl
+ add x10, x4, x10, lsl #2 // + 4*uv
+ add x14, x10, #FGD_UV_LUMA_MULT
+ add x15, x10, #FGD_UV_MULT
+ add x10, x10, #FGD_UV_OFFSET
+ neg w16, w16 // bitdepth_min_8
+ ld1r {v8.8h}, [x14] // uv_luma_mult
+ ld1r {v24.8h}, [x10] // uv_offset
+ ld1r {v9.8h}, [x15] // uv_mult
+
+ dup v29.8h, w13 // 15 - scaling_shift
+ dup v27.8h, w16 // bitdepth_min_8
+
+ cbz w12, 1f
+ // clip
+ movi v30.8h, #16
+ movi v31.8h, #240
+ sshl v30.8h, v30.8h, v27.8h
+ sshl v31.8h, v31.8h, v27.8h
+ cbz w11, 2f
+ // is_id
+ movi v31.8h, #235
+ sshl v31.8h, v31.8h, v27.8h
+ b 2f
+1:
+ // no clip
+ movi v30.8h, #0
+ mov v31.16b, v23.16b // bitdepth_max
+2:
+
+ ushr v15.8h, v23.8h, #1 // grain_max
+ sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8
+ not v14.16b, v15.16b // grain_min
+
+ ldr w12, [x8, #8] // offsets[1][0]
+ ldr w14, [x8, #4] // offsets[0][1]
+ ldr w16, [x8, #12] // offsets[1][1]
+ ldr w8, [x8] // offsets[0][0]
+
+ mov x10, #GRAIN_WIDTH*2 // grain_lut stride
+
+ add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
+.if \sy
+ add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
+ add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
+.else
+ add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
+ add x5, x5, x10 // grain_lut += grain_stride
+.endif
+
+ calc_offset w12, w13, w12, \sx, \sy
+ calc_offset w14, w15, w14, \sx, \sy
+ calc_offset w16, w17, w16, \sx, \sy
+ calc_offset w8, w11, w8, \sx, \sy
+
+ add_offset x13, w12, x13, x5, x10
+ add_offset x15, w14, x15, x5, x10
+ add_offset x17, w16, x17, x5, x10
+ add_offset x5, w8, x11, x5, x10
+
+ add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+ add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
+ add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
+
+ ldr w13, [sp, #112] // type
+
+ movrel x16, overlap_coeffs_\sx
+ adr x14, L(fguv_loop_sx\sx\()_tbl)
+
+ ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
+ tst w13, #1
+ ldrh w13, [x14, w13, uxtw #1]
+
+ b.eq 1f
+ // y overlap
+ sub w12, w9, #(2 >> \sy) // backup remaining h
+ mov w9, #(2 >> \sy)
+
+1:
+ sub x13, x14, w13, uxtw
+
+.if \sy
+ movi v25.8h, #23
+ movi v26.8h, #22
+.else
+ movi v25.8h, #27
+ movi v26.8h, #17
+.endif
+
+.if \sy
+ add x7, x7, x7 // luma_stride *= 2
+.endif
+
+ br x13
+endfunc
+.endm
+
+fguv 420, 1, 1
+fguv 422, 1, 0
+fguv 444, 0, 0
+
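+// When chroma does not take its scaling input from luma directly (csfl == 0),
+// the value fed into the scaling LUT is, roughly (scalar sketch):
+//
+//   int t = (luma * uv_luma_mult + src * uv_mult) >> 6;
+//   val   = clamp(t + (uv_offset << bitdepth_min_8), 0, bitdepth_max);
+//
+// With csfl == 1 the (possibly downsampled) luma is used as-is, only masked
+// so that uninitialized pixels past the right edge stay within range.
+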
+function fguv_loop_sx0_neon
+.macro fguv_loop_sx0 csfl, ox, oy
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v4.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v5.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v4.4s, v4.4h, v27.4h
+ smlal v4.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v5.4s, v5.4h, v27.4h
+ smlal v5.4s, v0.4h, v28.4h
+ sqrshrn v4.4h, v4.4s, #5
+ sqrshrn v5.4h, v5.4s, #5
+ smin v4.4h, v4.4h, v15.4h
+ smin v5.4h, v5.4h, v15.4h
+ smax v4.4h, v4.4h, v14.4h
+ smax v5.4h, v5.4h, v14.4h
+ ins v16.d[0], v4.d[0]
+ ins v0.d[0], v5.d[0]
+.endif
+
+ smull v6.4s, v16.4h, v26.4h
+ smull2 v7.4s, v16.8h, v26.8h
+ smull v10.4s, v17.4h, v26.4h
+ smull2 v11.4s, v17.8h, v26.8h
+ smull v16.4s, v18.4h, v26.4h
+ smull2 v17.4s, v18.8h, v26.8h
+ smull v18.4s, v19.4h, v26.4h
+ smull2 v19.4s, v19.8h, v26.8h
+ smlal v6.4s, v0.4h, v25.4h
+ smlal2 v7.4s, v0.8h, v25.8h
+ smlal v10.4s, v1.4h, v25.4h
+ smlal2 v11.4s, v1.8h, v25.8h
+ smlal v16.4s, v2.4h, v25.4h
+ smlal2 v17.4s, v2.8h, v25.8h
+ smlal v18.4s, v3.4h, v25.4h
+ smlal2 v19.4s, v3.8h, v25.8h
+ sqrshrn v6.4h, v6.4s, #5
+ sqrshrn2 v6.8h, v7.4s, #5
+ sqrshrn v7.4h, v10.4s, #5
+ sqrshrn2 v7.8h, v11.4s, #5
+ sqrshrn v10.4h, v16.4s, #5
+ sqrshrn2 v10.8h, v17.4s, #5
+ sqrshrn v11.4h, v18.4s, #5
+ sqrshrn2 v11.8h, v19.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v4.4h, v4.4s, #5
+ smin v4.4h, v4.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
+.if \oy
+ smin v16.8h, v6.8h, v15.8h
+ smin v17.8h, v7.8h, v15.8h
+ smin v18.8h, v10.8h, v15.8h
+ smin v19.8h, v11.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+ smax v18.8h, v18.8h, v14.8h
+ smax v19.8h, v19.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v4.4h, v4.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v4.d[0]
+.endif
+
+.if !\csfl
+ smull v4.4s, v0.4h, v8.4h
+ smull2 v5.4s, v0.8h, v8.8h
+ smull v6.4s, v1.4h, v8.4h
+ smull2 v7.4s, v1.8h, v8.8h
+ smull v0.4s, v2.4h, v8.4h
+ smull2 v1.4s, v2.8h, v8.8h
+ smull v2.4s, v3.4h, v8.4h
+ smull2 v3.4s, v3.8h, v8.8h
+ smlal v4.4s, v10.4h, v9.4h
+ smlal2 v5.4s, v10.8h, v9.8h
+ smlal v6.4s, v11.4h, v9.4h
+ smlal2 v7.4s, v11.8h, v9.8h
+ smlal v0.4s, v12.4h, v9.4h
+ smlal2 v1.4s, v12.8h, v9.8h
+ smlal v2.4s, v13.4h, v9.4h
+ smlal2 v3.4s, v13.8h, v9.8h
+ shrn v4.4h, v4.4s, #6
+ shrn2 v4.8h, v5.4s, #6
+ shrn v5.4h, v6.4s, #6
+ shrn2 v5.8h, v7.4s, #6
+ shrn v6.4h, v0.4s, #6
+ shrn2 v6.8h, v1.4s, #6
+ shrn v7.4h, v2.4s, #6
+ shrn2 v7.8h, v3.4s, #6
+ add v0.8h, v4.8h, v24.8h
+ add v1.8h, v5.8h, v24.8h
+ add v2.8h, v6.8h, v24.8h
+ add v3.8h, v7.8h, v24.8h
+ movi v20.8h, #0
+ smin v0.8h, v0.8h, v23.8h
+ smin v1.8h, v1.8h, v23.8h
+ smin v2.8h, v2.8h, v23.8h
+ smin v3.8h, v3.8h, v23.8h
+ smax v0.8h, v0.8h, v20.8h
+ smax v1.8h, v1.8h, v20.8h
+ smax v2.8h, v2.8h, v20.8h
+ smax v3.8h, v3.8h, v20.8h
+.else
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+ and v2.16b, v2.16b, v23.16b
+ and v3.16b, v3.16b, v23.16b
+.endif
+
+ bl gather32_neon
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+ uxtl v6.8h, v7.8b
+ uxtl2 v7.8h, v7.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+ sqrdmulh v18.8h, v18.8h, v6.8h
+ sqrdmulh v19.8h, v19.8h, v7.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+ usqadd v12.8h, v18.8h
+ usqadd v13.8h, v19.8h
+
+ umax v0.8h, v10.8h, v30.8h
+ umax v1.8h, v11.8h, v30.8h
+ umax v2.8h, v12.8h, v30.8h
+ umax v3.8h, v13.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+
+ subs w9, w9, #1
+.if \oy
+ dup v25.8h, v28.h[0]
+ dup v26.8h, v28.h[1]
+.endif
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
+.endif
+ b 9f
+.endm
+ fguv_loop_sx0 0, 0, 0
+ fguv_loop_sx0 0, 0, 1
+ fguv_loop_sx0 0, 1, 0
+ fguv_loop_sx0 0, 1, 1
+ fguv_loop_sx0 1, 0, 0
+ fguv_loop_sx0 1, 0, 1
+ fguv_loop_sx0 1, 1, 0
+ fguv_loop_sx0 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx0_tbl):
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
+ .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
+endfunc
+
+function fguv_loop_sx1_neon
+.macro fguv_loop_sx1 csfl, ox, oy
+L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
+ AARCH64_VALID_JUMP_TARGET
+1:
+.if \ox
+ ld1 {v18.4h}, [x4], x10 // grain_lut old
+.endif
+.if \oy
+ ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top
+.endif
+.if \ox && \oy
+ ld1 {v19.4h}, [x11], x10 // grain_lut top old
+.endif
+ ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut
+
+.if \ox
+ smull v18.4s, v18.4h, v27.4h
+ smlal v18.4s, v16.4h, v28.4h
+.endif
+
+.if \oy
+.if \ox
+ smull v19.4s, v19.4h, v27.4h
+ smlal v19.4s, v20.4h, v28.4h
+ sqrshrn v18.4h, v18.4s, #5
+ sqrshrn v19.4h, v19.4s, #5
+ smin v18.4h, v18.4h, v15.4h
+ smin v19.4h, v19.4h, v15.4h
+ smax v18.4h, v18.4h, v14.4h
+ smax v19.4h, v19.4h, v14.4h
+ ins v16.d[0], v18.d[0]
+ ins v20.d[0], v19.d[0]
+.endif
+
+ smull v0.4s, v16.4h, v26.4h
+ smull2 v1.4s, v16.8h, v26.8h
+ smull v2.4s, v17.4h, v26.4h
+ smull2 v3.4s, v17.8h, v26.8h
+ smlal v0.4s, v20.4h, v25.4h
+ smlal2 v1.4s, v20.8h, v25.8h
+ smlal v2.4s, v21.4h, v25.4h
+ smlal2 v3.4s, v21.8h, v25.8h
+ sqrshrn v16.4h, v0.4s, #5
+ sqrshrn2 v16.8h, v1.4s, #5
+ sqrshrn v17.4h, v2.4s, #5
+ sqrshrn2 v17.8h, v3.4s, #5
+.endif
+
+.if \ox && !\oy
+ sqrshrn v18.4h, v18.4s, #5
+ smin v18.4h, v18.4h, v15.4h
+.endif
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
+.if \oy
+ smin v16.8h, v16.8h, v15.8h
+ smin v17.8h, v17.8h, v15.8h
+ smax v16.8h, v16.8h, v14.8h
+ smax v17.8h, v17.8h, v14.8h
+.endif
+
+.if \ox && !\oy
+ smax v18.4h, v18.4h, v14.4h
+.endif
+ ld1 {v10.8h, v11.8h}, [x1], x2 // src
+.if \ox && !\oy
+ ins v16.d[0], v18.d[0]
+.endif
+ addp v0.8h, v0.8h, v1.8h
+ addp v1.8h, v2.8h, v3.8h
+ urshr v0.8h, v0.8h, #1
+ urshr v1.8h, v1.8h, #1
+.if !\csfl
+ smull v2.4s, v0.4h, v8.4h
+ smull2 v3.4s, v0.8h, v8.8h
+ smull v0.4s, v1.4h, v8.4h
+ smull2 v1.4s, v1.8h, v8.8h
+ smlal v2.4s, v10.4h, v9.4h
+ smlal2 v3.4s, v10.8h, v9.8h
+ smlal v0.4s, v11.4h, v9.4h
+ smlal2 v1.4s, v11.8h, v9.8h
+ shrn v2.4h, v2.4s, #6
+ shrn2 v2.8h, v3.4s, #6
+ shrn v3.4h, v0.4s, #6
+ shrn2 v3.8h, v1.4s, #6
+ add v0.8h, v2.8h, v24.8h
+ add v1.8h, v3.8h, v24.8h
+ movi v2.8h, #0
+ smin v0.8h, v0.8h, v23.8h
+ smin v1.8h, v1.8h, v23.8h
+ smax v0.8h, v0.8h, v2.8h
+ smax v1.8h, v1.8h, v2.8h
+.else
+ // Make sure that uninitialized pixels out of range past the right
+ // edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v23.16b
+ and v1.16b, v1.16b, v23.16b
+.endif
+
+ bl gather16_neon
+
+ uxtl v4.8h, v6.8b // scaling
+ uxtl2 v5.8h, v6.16b
+
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+
+        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v17.8h, v17.8h, v5.8h
+
+ usqadd v10.8h, v16.8h // *src + noise
+ usqadd v11.8h, v17.8h
+
+ umax v0.8h, v10.8h, v30.8h
+ umax v1.8h, v11.8h, v30.8h
+ umin v0.8h, v0.8h, v31.8h
+ umin v1.8h, v1.8h, v31.8h
+
+.if \oy
+ mov v16.16b, v25.16b
+.endif
+ subs w9, w9, #1
+.if \oy
+ mov v25.16b, v26.16b
+ mov v26.16b, v16.16b
+.endif
+ st1 {v0.8h, v1.8h}, [x0], x2 // dst
+ b.gt 1b
+
+.if \oy
+ cmp w12, #0
+ mov w9, w12 // restore actual remaining h
+ b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
+.endif
+
+ b 9f
+.endm
+ fguv_loop_sx1 0, 0, 0
+ fguv_loop_sx1 0, 0, 1
+ fguv_loop_sx1 0, 1, 0
+ fguv_loop_sx1 0, 1, 1
+ fguv_loop_sx1 1, 0, 0
+ fguv_loop_sx1 1, 0, 1
+ fguv_loop_sx1 1, 1, 0
+ fguv_loop_sx1 1, 1, 1
+
+9:
+ ldp d14, d15, [sp, #64]
+ ldp d12, d13, [sp, #48]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #16]
+ ldr x30, [sp], #80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(fguv_loop_sx1_tbl):
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
+ .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/ipred.S b/third_party/dav1d/src/arm/64/ipred.S
new file mode 100644
index 0000000000..dab67577e6
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/ipred.S
@@ -0,0 +1,3985 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
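+// The functions below dispatch on the block width through a table of .hword
+// offsets: clz(width) - 25 maps widths 64/32/16/8/4 to indices 0-4, and the
+// loaded offset is subtracted from the table address to form the branch
+// target.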
+function ipred_dc_128_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ movi v0.16b, #128
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ movi v1.16b, #128
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ movi v1.16b, #128
+ movi v2.16b, #128
+ movi v3.16b, #128
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 16b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ sub x2, x2, #4
+ sub x5, x5, w3, uxtw
+ mov x7, #-4
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v1.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ st1 {v3.8b}, [x0], x1
+ st1 {v2.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.16b}, [x0], x1
+ st1 {v2.16b}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #1
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
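+        // ld1r replicates the 4 top pixels into both halves of the 8-byte
+        // register, so uaddlv sums them twice; the rounding shift by 3 then
+        // gives (2*sum + 4) >> 3 = (sum + 2) >> 2, the rounded average.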
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.8b, v0.b[0]
+4:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.8b, v0.b[0]
+8:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+16:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v2.4h, v0.4h, v1.4h
+ rshrn v2.8b, v2.8h, #5
+ dup v0.16b, v2.b[0]
+ dup v1.16b, v2.b[0]
+32:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v4.4h, v0.4h, v1.4h
+ add v5.4h, v2.4h, v3.4h
+ add v4.4h, v4.4h, v5.4h
+ rshrn v4.8b, v4.8h, #6
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+64:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w7, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w4):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ rshrn v0.8b, v0.8h, #3
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w8):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w16):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt L(ipred_dc_left_w16)
+ ret
+
+L(ipred_dc_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #5
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w32):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add v0.4h, v0.4h, v2.4h
+ rshrn v0.8b, v0.8h, #6
+ dup v0.16b, v0.b[0]
+ br x3
+L(ipred_dc_left_w64):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
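+// The DC value is sum(top[0..width-1]) + sum(left[0..height-1]) plus a
+// rounding term of (width + height) >> 1, shifted right by
+// ctz(width + height). When width != height, width + height also has a
+// factor of 3 or 5, handled below by an extra sqdmulh with a Q15
+// approximation of 1/3 or 1/5.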
+function ipred_dc_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.8h, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w6, uxtw
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w7 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], #4
+ ins v0.s[1], wzr
+ uaddlv h0, v0.8b
+ add x2, x2, #1
+ br x3
+L(ipred_dc_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.s}[0], [x2]
+ ins v1.s[1], wzr
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #4
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16
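+        // Divide out the remaining odd factor with a Q15 reciprocal: after
+        // the variable shift by 2*h (a shift by 32 wraps to 0), the low 16
+        // bits of w16 are 0x2AAB ~= 1/3 for w+h == 12, or 0x199A ~= 1/5
+        // for w+h == 20.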
+ mov w16, #(0x3334/2)
+ movk w16, #(0x5556/2), lsl #16
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[0], [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], #8
+ uaddlv h0, v0.8b
+ add x2, x2, #1
+ br x3
+L(ipred_dc_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.8b
+ cmp w4, #8
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8b, v0.b[0]
+2:
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8b}, [x0], x1
+ st1 {v0.8b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2], #16
+ uaddlv h0, v0.16b
+ add x2, x2, #1
+ br x3
+L(ipred_dc_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ cmp w4, #16
+ add v0.4h, v0.4h, v1.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.16b, v0.b[0]
+2:
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b}, [x0], x1
+ st1 {v0.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x2], #32
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add x2, x2, #1
+ add v0.4h, v0.4h, v1.4h
+ br x3
+L(ipred_dc_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ cmp w4, #32
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v2.4h
+ ushl v4.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v4.4h, v4.4h, v16.4h
+1:
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+2:
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b}, [x0], x1
+ st1 {v0.16b, v1.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v0.4h, v0.4h, v1.4h
+ add v2.4h, v2.4h, v3.4h
+ add x2, x2, #1
+ add v0.4h, v0.4h, v2.4h
+ br x3
+L(ipred_dc_w64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h1, v1.16b
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ uaddlv h4, v4.16b
+ add v1.4h, v1.4h, v2.4h
+ add v3.4h, v3.4h, v4.4h
+ cmp w4, #64
+ add v0.4h, v0.4h, v1.4h
+ add v0.4h, v0.4h, v3.4h
+ ushl v4.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 16/32
+ mov w16, #(0x5556/2)
+ movk w16, #(0x3334/2), lsl #16
+ lsr w16, w16, w4
+ dup v16.4h, w16
+ sqdmulh v4.4h, v4.4h, v16.4h
+1:
+ dup v0.16b, v4.b[0]
+ dup v1.16b, v4.b[0]
+ dup v2.16b, v4.b[0]
+ dup v3.16b, v4.b[0]
+2:
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
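+// The Paeth predictor: for each pixel, base = left + top - topleft, and the
+// prediction is whichever of left, top and topleft is closest to base
+// (preferring left, then top, on ties).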
+function ipred_paeth_8bpc_neon, export=1
+ clz w9, w3
+ adr x5, L(ipred_paeth_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x2]
+ add x8, x2, #1
+ sub x2, x2, #4
+ sub x5, x5, w9, uxtw
+ mov x7, #-4
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.4s}, [x8]
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ zip1 v0.2s, v0.2s, v1.2s
+ zip1 v2.2s, v2.2s, v3.2s
+ uaddw v16.8h, v6.8h, v0.8b
+ uaddw v17.8h, v6.8h, v2.8b
+ sqxtun v16.8b, v16.8h // base
+ sqxtun2 v16.16b, v17.8h
+ zip1 v0.2d, v0.2d, v2.2d
+ uabd v20.16b, v5.16b, v16.16b // tdiff
+ uabd v22.16b, v4.16b, v16.16b // tldiff
+ uabd v16.16b, v0.16b, v16.16b // ldiff
+ umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
+ cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff
+ cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
+ bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ...
+ st1 {v20.s}[3], [x0], x1
+ st1 {v20.s}[2], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.s}[1], [x0], x1
+ st1 {v20.s}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.2d}, [x8]
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
+ uaddw v16.8h, v6.8h, v0.8b
+ uaddw v17.8h, v6.8h, v1.8b
+ uaddw v18.8h, v6.8h, v2.8b
+ uaddw v19.8h, v6.8h, v3.8b
+ sqxtun v16.8b, v16.8h // base
+ sqxtun2 v16.16b, v17.8h
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v0.2d, v0.2d, v1.2d
+ uabd v21.16b, v5.16b, v18.16b // tdiff
+ uabd v20.16b, v5.16b, v16.16b
+ uabd v23.16b, v4.16b, v18.16b // tldiff
+ uabd v22.16b, v4.16b, v16.16b
+ uabd v17.16b, v2.16b, v18.16b // ldiff
+ uabd v16.16b, v0.16b, v16.16b
+ umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
+ umin v18.16b, v20.16b, v22.16b
+ cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff
+ cmhs v20.16b, v22.16b, v20.16b
+ cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
+ cmhs v16.16b, v18.16b, v16.16b
+ bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v20.16b, v5.16b, v4.16b
+ bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v21.d}[1], [x0], x1
+ st1 {v21.d}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.d}[1], [x0], x1
+ st1 {v20.d}[0], [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v5.16b}, [x8], #16
+ mov w9, w3
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+1:
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
+2:
+ usubl v6.8h, v5.8b, v4.8b // top - topleft
+ usubl2 v7.8h, v5.16b, v4.16b
+ uaddw v24.8h, v6.8h, v0.8b
+ uaddw v25.8h, v7.8h, v0.8b
+ uaddw v26.8h, v6.8h, v1.8b
+ uaddw v27.8h, v7.8h, v1.8b
+ uaddw v28.8h, v6.8h, v2.8b
+ uaddw v29.8h, v7.8h, v2.8b
+ uaddw v30.8h, v6.8h, v3.8b
+ uaddw v31.8h, v7.8h, v3.8b
+ sqxtun v17.8b, v26.8h // base
+ sqxtun2 v17.16b, v27.8h
+ sqxtun v16.8b, v24.8h
+ sqxtun2 v16.16b, v25.8h
+ sqxtun v19.8b, v30.8h
+ sqxtun2 v19.16b, v31.8h
+ sqxtun v18.8b, v28.8h
+ sqxtun2 v18.16b, v29.8h
+ uabd v23.16b, v5.16b, v19.16b // tdiff
+ uabd v22.16b, v5.16b, v18.16b
+ uabd v21.16b, v5.16b, v17.16b
+ uabd v20.16b, v5.16b, v16.16b
+ uabd v27.16b, v4.16b, v19.16b // tldiff
+ uabd v26.16b, v4.16b, v18.16b
+ uabd v25.16b, v4.16b, v17.16b
+ uabd v24.16b, v4.16b, v16.16b
+ uabd v19.16b, v3.16b, v19.16b // ldiff
+ uabd v18.16b, v2.16b, v18.16b
+ uabd v17.16b, v1.16b, v17.16b
+ uabd v16.16b, v0.16b, v16.16b
+ umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
+ umin v30.16b, v22.16b, v26.16b
+ umin v29.16b, v21.16b, v25.16b
+ umin v28.16b, v20.16b, v24.16b
+ cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff
+ cmhs v22.16b, v26.16b, v22.16b
+ cmhs v21.16b, v25.16b, v21.16b
+ cmhs v20.16b, v24.16b, v20.16b
+ cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
+ cmhs v18.16b, v30.16b, v18.16b
+ cmhs v17.16b, v29.16b, v17.16b
+ cmhs v16.16b, v28.16b, v16.16b
+ bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v22.16b, v5.16b, v4.16b
+ bsl v21.16b, v5.16b, v4.16b
+ bsl v20.16b, v5.16b, v4.16b
+ bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
+ bit v22.16b, v2.16b, v18.16b
+ bit v21.16b, v1.16b, v17.16b
+ bit v20.16b, v0.16b, v16.16b
+ subs w3, w3, #16
+ st1 {v23.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ st1 {v21.16b}, [x5], #16
+ st1 {v20.16b}, [x10], #16
+ b.le 8f
+ ld1 {v5.16b}, [x8], #16
+ b 2b
+8:
+ subs w4, w4, #4
+ b.le 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub x8, x8, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ // Load the top row as early as possible
+ ld1 {v5.16b}, [x8], #16
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_paeth_tbl):
+ .hword L(ipred_paeth_tbl) - 640b
+ .hword L(ipred_paeth_tbl) - 320b
+ .hword L(ipred_paeth_tbl) - 160b
+ .hword L(ipred_paeth_tbl) - 80b
+ .hword L(ipred_paeth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
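+// The smooth predictor blends a horizontal and a vertical interpolation:
+//   hor  = right*256  + (left - right)*weights_hor[x]
+//   ver  = bottom*256 + (top - bottom)*weights_ver[y]
+//   pred = (hor + ver + 256) >> 9
+// where the uhadd/rshrn #8 pairs below compute the final (hor+ver+256) >> 9.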
+function ipred_smooth_8bpc_neon, export=1
+ movrel x10, X(sm_weights)
+ add x11, x10, w4, uxtw
+ add x10, x10, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_tbl)
+ sub x12, x2, w4, uxtw
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x12] // bottom
+ add x8, x2, #1
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2s}, [x8] // top
+ ld1r {v7.2s}, [x10] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ dup v5.16b, v6.b[3] // right
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ zip1 v1.2s, v1.2s, v0.2s // left, flipped
+ zip1 v0.2s, v3.2s, v2.2s
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ shll v22.8h, v4.8b, #8 // bottom*256
+ shll v23.8h, v4.8b, #8
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h
+ mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v23.8h, v6.8h, v18.8h
+ uhadd v20.8h, v20.8h, v22.8h
+ uhadd v21.8h, v21.8h, v23.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.s}[0], [x0], x1
+ st1 {v21.s}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8b}, [x8] // top
+ ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ dup v5.16b, v6.b[7] // right
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v3.8h, v3.8b, v5.8b
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v2.8h, v7.8h // (left flipped)
+ mla v22.8h, v1.8h, v7.8h
+ mla v23.8h, v0.8h, v7.8h
+ mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v6.8h, v17.8h
+ mla v26.8h, v6.8h, v18.8h
+ mla v27.8h, v6.8h, v19.8h
+ uhadd v20.8h, v20.8h, v24.8h
+ uhadd v21.8h, v21.8h, v25.8h
+ uhadd v22.8h, v22.8h, v26.8h
+ uhadd v23.8h, v23.8h, v27.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v20.8b}, [x0], x1
+ st1 {v21.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x12, x2, w3, uxtw
+ sub x2, x2, #2
+ mov x7, #-2
+ ld1r {v5.16b}, [x12] // right
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld2r {v0.8b, v1.8b}, [x2], x7 // left
+ ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+2:
+ ld1 {v7.16b}, [x10], #16 // weights_hor
+ ld1 {v3.16b}, [x8], #16 // top
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ usubl v2.8h, v3.8b, v4.8b // top-bottom
+ usubl2 v3.8h, v3.16b, v4.16b
+ mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h // (left flipped)
+ mla v22.8h, v0.8h, v6.8h
+ mla v23.8h, v0.8h, v7.8h
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v3.8h, v16.8h
+ mla v26.8h, v2.8h, v17.8h
+ mla v27.8h, v3.8h, v17.8h
+ uhadd v20.8h, v20.8h, v24.8h
+ uhadd v21.8h, v21.8h, v25.8h
+ uhadd v22.8h, v22.8h, v26.8h
+ uhadd v23.8h, v23.8h, v27.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ subs w3, w3, #16
+ st1 {v20.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x8, w9, uxtw
+ sub x10, x10, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_tbl):
+ .hword L(ipred_smooth_tbl) - 640b
+ .hword L(ipred_smooth_tbl) - 320b
+ .hword L(ipred_smooth_tbl) - 160b
+ .hword L(ipred_smooth_tbl) - 80b
+ .hword L(ipred_smooth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1
+ movrel x7, X(sm_weights)
+ add x7, x7, w4, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_v_tbl)
+ sub x8, x2, w4, uxtw
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.16b}, [x8] // bottom
+ add x2, x2, #1
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2s}, [x2] // top
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+4:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ shll v22.8h, v4.8b, #8 // bottom*256
+ shll v23.8h, v4.8b, #8
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v23.8h, v6.8h, v18.8h
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v22.s}[0], [x0], x1
+ st1 {v22.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v23.s}[0], [x0], x1
+ st1 {v23.s}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8b}, [x2] // top
+ usubl v6.8h, v6.8b, v4.8b // top-bottom
+8:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ shll v24.8h, v4.8b, #8 // bottom*256
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v25.8h, v6.8h, v17.8h
+ mla v26.8h, v6.8h, v18.8h
+ mla v27.8h, v6.8h, v19.8h
+ rshrn v24.8b, v24.8h, #8
+ rshrn v25.8b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn v27.8b, v27.8h, #8
+ st1 {v24.8b}, [x0], x1
+ st1 {v25.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v26.8b}, [x0], x1
+ st1 {v27.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ // Set up pointers for four rows in parallel; x0, x6, x5, x8
+ add x5, x0, x1
+ add x8, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+2:
+ ld1 {v3.16b}, [x2], #16 // top
+ shll v20.8h, v4.8b, #8 // bottom*256
+ shll v21.8h, v4.8b, #8
+ shll v22.8h, v4.8b, #8
+ shll v23.8h, v4.8b, #8
+ shll v24.8h, v4.8b, #8
+ shll v25.8h, v4.8b, #8
+ shll v26.8h, v4.8b, #8
+ shll v27.8h, v4.8b, #8
+ usubl v2.8h, v3.8b, v4.8b // top-bottom
+ usubl2 v3.8h, v3.16b, v4.16b
+ mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
+ mla v21.8h, v3.8h, v16.8h
+ mla v22.8h, v2.8h, v17.8h
+ mla v23.8h, v3.8h, v17.8h
+ mla v24.8h, v2.8h, v18.8h
+ mla v25.8h, v3.8h, v18.8h
+ mla v26.8h, v2.8h, v19.8h
+ mla v27.8h, v3.8h, v19.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ rshrn v24.8b, v24.8h, #8
+ rshrn2 v24.16b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn2 v26.16b, v27.8h, #8
+ subs w3, w3, #16
+ st1 {v20.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ st1 {v24.16b}, [x5], #16
+ st1 {v26.16b}, [x8], #16
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x2, x2, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x8, x8, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_v_tbl):
+ .hword L(ipred_smooth_v_tbl) - 640b
+ .hword L(ipred_smooth_v_tbl) - 320b
+ .hword L(ipred_smooth_v_tbl) - 160b
+ .hword L(ipred_smooth_v_tbl) - 80b
+ .hword L(ipred_smooth_v_tbl) - 40b
+endfunc
+
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
+ movrel x8, X(sm_weights)
+ add x8, x8, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_h_tbl)
+ add x12, x2, w3, uxtw
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v5.16b}, [x12] // right
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v7.2s}, [x8] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ uxtl v7.8h, v7.8b // weights_hor
+4:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ zip1 v1.2s, v1.2s, v0.2s // left, flipped
+ zip1 v0.2s, v3.2s, v2.2s
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v1.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.s}[0], [x0], x1
+ st1 {v21.s}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v7.8b}, [x8] // weights_hor
+ sub x2, x2, #4
+ mov x7, #-4
+ uxtl v7.8h, v7.8b // weights_hor
+8:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ usubl v3.8h, v3.8b, v5.8b // left-right
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v0.8h, v0.8b, v5.8b
+ mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v2.8h, v7.8h // (left flipped)
+ mla v22.8h, v1.8h, v7.8h
+ mla v23.8h, v0.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn v21.8b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn v23.8b, v23.8h, #8
+ st1 {v20.8b}, [x0], x1
+ st1 {v21.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ sub x2, x2, #4
+ mov x7, #-4
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
+ usubl v0.8h, v0.8b, v5.8b // left-right
+ usubl v1.8h, v1.8b, v5.8b
+ usubl v2.8h, v2.8b, v5.8b
+ usubl v3.8h, v3.8b, v5.8b
+2:
+ ld1 {v7.16b}, [x8], #16 // weights_hor
+ shll v20.8h, v5.8b, #8 // right*256
+ shll v21.8h, v5.8b, #8
+ shll v22.8h, v5.8b, #8
+ shll v23.8h, v5.8b, #8
+ shll v24.8h, v5.8b, #8
+ shll v25.8h, v5.8b, #8
+ shll v26.8h, v5.8b, #8
+ shll v27.8h, v5.8b, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor
+ mla v21.8h, v3.8h, v7.8h // (left flipped)
+ mla v22.8h, v2.8h, v6.8h
+ mla v23.8h, v2.8h, v7.8h
+ mla v24.8h, v1.8h, v6.8h
+ mla v25.8h, v1.8h, v7.8h
+ mla v26.8h, v0.8h, v6.8h
+ mla v27.8h, v0.8h, v7.8h
+ rshrn v20.8b, v20.8h, #8
+ rshrn2 v20.16b, v21.8h, #8
+ rshrn v22.8b, v22.8h, #8
+ rshrn2 v22.16b, v23.8h, #8
+ rshrn v24.8b, v24.8h, #8
+ rshrn2 v24.16b, v25.8h, #8
+ rshrn v26.8b, v26.8h, #8
+ rshrn2 v26.16b, v27.8h, #8
+ subs w3, w3, #16
+ st1 {v20.16b}, [x0], #16
+ st1 {v22.16b}, [x6], #16
+ st1 {v24.16b}, [x5], #16
+ st1 {v26.16b}, [x10], #16
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x8, x8, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_h_tbl):
+ .hword L(ipred_smooth_h_tbl) - 640b
+ .hword L(ipred_smooth_h_tbl) - 320b
+ .hword L(ipred_smooth_h_tbl) - 160b
+ .hword L(ipred_smooth_h_tbl) - 80b
+ .hword L(ipred_smooth_h_tbl) - 40b
+endfunc
+
+const padding_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+padding_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
+// const pixel *const in, const int end);
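+// Each interpolated output sample is produced with the 4-tap filter
+//   (9*(p1 + p2) - (p0 + p3) + 8) >> 4
+// saturated to the pixel range, and then interleaved with the original
+// pixels by the zip1/zip2 at the end.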
+function ipred_z1_upsample_edge_8bpc_neon, export=1
+ movrel x4, padding_mask
+ ld1 {v0.16b}, [x2] // in[]
+ add x5, x2, w3, uxtw // in[end]
+ sub x4, x4, w3, uxtw
+
+ ld1r {v1.16b}, [x5] // padding
+ ld1 {v3.16b}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v1.16b, v3.16b // padded in[]
+
+ ext v4.16b, v0.16b, v1.16b, #1
+ ext v5.16b, v0.16b, v1.16b, #2
+ ext v6.16b, v0.16b, v1.16b, #3
+
+ uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2]
+ uaddl2 v17.8h, v4.16b, v5.16b
+ uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3]
+ uaddl2 v19.8h, v0.16b, v6.16b
+ mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
+ mul v17.8h, v17.8h, v31.8h
+ sub v16.8h, v16.8h, v18.8h
+ sub v17.8h, v17.8h, v19.8h
+
+ sqrshrun v16.8b, v16.8h, #4
+ sqrshrun2 v16.16b, v17.8h, #4
+
+ zip1 v0.16b, v4.16b, v16.16b
+ zip2 v1.16b, v4.16b, v16.16b
+
+ st1 {v0.16b, v1.16b}, [x0]
+
+ ret
+endfunc
+
+const edge_filter
+ .byte 0, 4, 8, 0
+ .byte 0, 5, 6, 0
+// Leaving out the coeffs for strength=3
+// .byte 2, 4, 4, 0
+endconst
+
+// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
+// const pixel *const in, const int end,
+// const int strength);
+function ipred_z1_filter_edge_8bpc_neon, export=1
+ cmp w4, #3
+ b.eq L(fivetap) // if (strength == 3) goto fivetap
+
+ movrel x5, edge_filter, -3
+ add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1
+
+ ld1 {v31.h}[0], [x5] // kernel[1-2]
+
+ ld1 {v0.16b}, [x2], #16
+
+ dup v30.16b, v31.b[0]
+ dup v31.16b, v31.b[1]
+1:
+        // in[end] is the last valid pixel. We produce 16 pixels out by
+ // using 18 pixels in - the last pixel used is [17] of the ones
+ // read/buffered.
+ cmp w3, #17
+ ld1 {v1.16b}, [x2], #16
+ b.lt 2f
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ umull v4.8h, v0.8b, v30.8b
+ umlal v4.8h, v2.8b, v31.8b
+ umlal v4.8h, v3.8b, v30.8b
+ umull2 v5.8h, v0.16b, v30.16b
+ umlal2 v5.8h, v2.16b, v31.16b
+ umlal2 v5.8h, v3.16b, v30.16b
+ subs w1, w1, #16
+ mov v0.16b, v1.16b
+ rshrn v4.8b, v4.8h, #4
+ rshrn2 v4.16b, v5.8h, #4
+ sub w3, w3, #16
+ st1 {v4.16b}, [x0], #16
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
+ movrel x5, padding_mask
+ sub w6, w3, #32
+ sub x5, x5, w3, uxtw
+ add x6, x2, w6, sxtw
+
+ ld1 {v2.16b}, [x5] // padding_mask
+
+ ld1r {v1.16b}, [x6]
+ bit v0.16b, v1.16b, v2.16b // Pad v0-v1
+
+ // Filter one block
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ umull v4.8h, v0.8b, v30.8b
+ umlal v4.8h, v2.8b, v31.8b
+ umlal v4.8h, v3.8b, v30.8b
+ umull2 v5.8h, v0.16b, v30.16b
+ umlal2 v5.8h, v2.16b, v31.16b
+ umlal2 v5.8h, v3.16b, v30.16b
+ subs w1, w1, #16
+ rshrn v4.8b, v4.8h, #4
+ rshrn2 v4.16b, v5.8h, #4
+ st1 {v4.16b}, [x0], #16
+ b.le 9f
+5:
+ // After one block, any remaining output would only be filtering
+ // padding - thus just store the padding.
+ subs w1, w1, #16
+ st1 {v1.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+
+L(fivetap):
+ sub x2, x2, #1 // topleft -= 1
+ movi v29.16b, #2
+ ld1 {v0.16b}, [x2], #16
+ movi v30.16b, #4
+ movi v31.16b, #4
+ ins v0.b[0], v0.b[1]
+1:
+        // in[end+1] is the last valid pixel. We produce 16 pixels out by
+ // using 20 pixels in - the last pixel used is [19] of the ones
+ // read/buffered.
+ cmp w3, #18
+ ld1 {v1.16b}, [x2], #16
+ b.lt 2f // if (end + 1 < 19)
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v0.16b, v1.16b, #3
+ ext v5.16b, v0.16b, v1.16b, #4
+ umull v6.8h, v0.8b, v29.8b
+ umlal v6.8h, v2.8b, v30.8b
+ umlal v6.8h, v3.8b, v31.8b
+ umlal v6.8h, v4.8b, v30.8b
+ umlal v6.8h, v5.8b, v29.8b
+ umull2 v7.8h, v0.16b, v29.16b
+ umlal2 v7.8h, v2.16b, v30.16b
+ umlal2 v7.8h, v3.16b, v31.16b
+ umlal2 v7.8h, v4.16b, v30.16b
+ umlal2 v7.8h, v5.16b, v29.16b
+ subs w1, w1, #16
+ mov v0.16b, v1.16b
+ rshrn v6.8b, v6.8h, #4
+ rshrn2 v6.16b, v7.8h, #4
+ sub w3, w3, #16
+ st1 {v6.16b}, [x0], #16
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
+ movrel x5, padding_mask, -1
+ sub w6, w3, #31
+ sub x5, x5, w3, uxtw
+ add x6, x2, w6, sxtw
+
+ ld1 {v2.16b, v3.16b}, [x5] // padding_mask
+
+ ld1r {v28.16b}, [x6]
+ bit v0.16b, v28.16b, v2.16b // Pad v0-v1
+ bit v1.16b, v28.16b, v3.16b
+4:
+ // Filter one block
+ ext v2.16b, v0.16b, v1.16b, #1
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v0.16b, v1.16b, #3
+ ext v5.16b, v0.16b, v1.16b, #4
+ umull v6.8h, v0.8b, v29.8b
+ umlal v6.8h, v2.8b, v30.8b
+ umlal v6.8h, v3.8b, v31.8b
+ umlal v6.8h, v4.8b, v30.8b
+ umlal v6.8h, v5.8b, v29.8b
+ umull2 v7.8h, v0.16b, v29.16b
+ umlal2 v7.8h, v2.16b, v30.16b
+ umlal2 v7.8h, v3.16b, v31.16b
+ umlal2 v7.8h, v4.16b, v30.16b
+ umlal2 v7.8h, v5.16b, v29.16b
+ subs w1, w1, #16
+ mov v0.16b, v1.16b
+ mov v1.16b, v28.16b
+ rshrn v6.8b, v6.8h, #4
+ rshrn2 v6.16b, v7.8h, #4
+ sub w3, w3, #16
+ st1 {v6.16b}, [x0], #16
+ b.le 9f
+ // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
+ // filter properly once more - aka (w3 >= 0).
+ cmp w3, #0
+ b.ge 4b
+5:
+ // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
+ // last valid pixel - thus just output that without filtering.
+ subs w1, w1, #16
+ st1 {v1.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+endfunc
+
+// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
+// const int n);
+function ipred_pixel_set_8bpc_neon, export=1
+ dup v0.16b, w1
+1:
+ subs w2, w2, #16
+ st1 {v0.16b}, [x0], #16
+ b.gt 1b
+ ret
+endfunc
+
+// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const int width, const int height,
+// const int dx, const int max_base_x);
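+// For each output row, xpos advances by dx; base = xpos >> 6 indexes the top
+// edge and frac = xpos & 0x3e is the interpolation weight, giving
+//   dst[x] = (top[base+x]*(64 - frac) + top[base+x+1]*frac + 32) >> 6
+// Once base reaches max_base_x, the remaining rows are filled with
+// top[max_base_x].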
+function ipred_z1_fill1_8bpc_neon, export=1
+ clz w9, w3
+ adr x8, L(ipred_z1_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw // top[max_base_x]
+ sub x8, x8, w9, uxtw
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ br x8
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ ldr d0, [x2, w8, uxtw] // top[base]
+ ldr d2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ ext v1.8b, v0.8b, v0.8b, #1 // top[base+1]
+ ext v3.8b, v2.8b, v2.8b, #1
+ usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
+ usubl v7.8h, v3.8b, v2.8b
+ ushll v16.8h, v0.8b, #6 // top[base]*64
+ ushll v17.8h, v2.8b, #6
+ mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac
+ mla v17.4h, v7.4h, v5.4h
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.s}[0], [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v31.s}[0], [x0], x1
+ b.gt 49b
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8b, w9 // frac
+ dup v5.8b, w11
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8b, w9 // 64 - frac
+ dup v7.8b, w11
+ ext v1.16b, v0.16b, v0.16b, #1 // top[base+1]
+ ext v3.16b, v2.16b, v2.16b, #1
+ umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac)
+ umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac
+ umull v17.8h, v2.8b, v7.8b
+ umlal v17.8h, v3.8b, v5.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8b}, [x0], x1
+ b.gt 89b
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ mov w12, w3
+
+ add x13, x0, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 169f
+ add x8, x2, w8, uxtw
+ add x10, x2, w10, uxtw
+ dup v4.16b, w9 // frac
+ dup v5.16b, w11
+ ld1 {v0.16b, v1.16b}, [x8], #32 // top[base]
+ ld1 {v2.16b, v3.16b}, [x10], #32
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.16b, w9 // 64 - frac
+ dup v7.16b, w11
+ add w7, w7, w5 // xpos += dx
+2:
+ ext v16.16b, v0.16b, v1.16b, #1 // top[base+1]
+ ext v17.16b, v2.16b, v3.16b, #1
+ subs w3, w3, #16
+ umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac)
+ umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac
+ umull2 v19.8h, v0.16b, v6.16b
+ umlal2 v19.8h, v16.16b, v4.16b
+ umull v20.8h, v2.8b, v7.8b
+ umlal v20.8h, v17.8b, v5.8b
+ umull2 v21.8h, v2.16b, v7.16b
+ umlal2 v21.8h, v17.16b, v5.16b
+ rshrn v16.8b, v18.8h, #6
+ rshrn2 v16.16b, v19.8h, #6
+ rshrn v17.8b, v20.8h, #6
+ rshrn2 v17.16b, v21.8h, #6
+ st1 {v16.16b}, [x0], #16
+ st1 {v17.16b}, [x13], #16
+ b.le 3f
+ mov v0.16b, v1.16b
+ ld1 {v1.16b}, [x8], #16 // top[base]
+ mov v2.16b, v3.16b
+ ld1 {v3.16b}, [x10], #16
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 1b
+9:
+ ret
+
+169:
+ st1 {v31.16b}, [x0], #16
+ subs w3, w3, #16
+ st1 {v31.16b}, [x13], #16
+ b.gt 169b
+ subs w4, w4, #2
+ b.le 9b
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 169b
+
+L(ipred_z1_fill1_tbl):
+ .hword L(ipred_z1_fill1_tbl) - 640b
+ .hword L(ipred_z1_fill1_tbl) - 320b
+ .hword L(ipred_z1_fill1_tbl) - 160b
+ .hword L(ipred_z1_fill1_tbl) - 80b
+ .hword L(ipred_z1_fill1_tbl) - 40b
+endfunc
+
+function ipred_z1_fill2_8bpc_neon, export=1
+ cmp w3, #8
+ add x10, x2, w6, uxtw // top[max_base_x]
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ b.eq 8f
+
+4: // w == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ ldr d0, [x2, w8, uxtw] // top[base]
+ ldr d2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8b, v0.8b, v0.8b // top[base+1]
+ uzp1 v0.8b, v0.8b, v0.8b // top[base]
+ uzp2 v3.8b, v2.8b, v2.8b
+ uzp1 v2.8b, v2.8b, v2.8b
+ usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
+ usubl v7.8h, v3.8b, v2.8b
+ ushll v16.8h, v0.8b, #6 // top[base]*64
+ ushll v17.8h, v2.8b, #6
+ mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac
+ mla v17.4h, v7.4h, v5.4h
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.s}[0], [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v31.s}[0], [x0], x1
+ b.gt 49b
+ ret
+
+8: // w == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8b, w9 // frac
+ dup v5.8b, w11
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8b, w9 // 64 - frac
+ dup v7.8b, w11
+ uzp2 v1.16b, v0.16b, v0.16b // top[base+1]
+ uzp1 v0.16b, v0.16b, v0.16b // top[base]
+ uzp2 v3.16b, v2.16b, v2.16b
+ uzp1 v2.16b, v2.16b, v2.16b
+ umull v16.8h, v1.8b, v4.8b // top[base+1]*frac
+ umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac)
+ umull v17.8h, v3.8b, v5.8b
+ umlal v17.8h, v2.8b, v7.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8b}, [x0], x1
+ b.gt 89b
+ ret
+endfunc
+
+// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
+// const int n);
+function ipred_reverse_8bpc_neon, export=1
+ sub x1, x1, #16
+ add x3, x0, #8
+ mov x4, #16
+1:
+ ld1 {v0.16b}, [x1]
+ subs w2, w2, #16
+ rev64 v0.16b, v0.16b
+ sub x1, x1, #16
+ st1 {v0.d}[1], [x0], x4
+ st1 {v0.d}[0], [x3], x4
+ b.gt 1b
+ ret
+endfunc
+
+const increments
+ .short 0, 1, 2, 3, 4, 5, 6, 7
+endconst
+
+// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const left,
+// const int width, const int height,
+// const int dy, const int max_base_y);
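+// Like the z1 fill above, but sampling the left edge: base = ypos >> 6,
+// frac = ypos & 0x3e, and each output pixel is
+//   (left[base]*(64 - frac) + left[base+1]*frac + 32) >> 6
+// with tbx used as a table lookup so that each lane can have its own base.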
+function ipred_z3_fill1_8bpc_neon, export=1
+ cmp w6, #64
+ clz w9, w3
+ adr x8, L(ipred_z3_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw // left[max_base_y]
+ sub x8, x8, w9, uxtw
+ movrel x11, increments
+ ld1r {v31.16b}, [x10] // padding
+ ld1 {v30.8h}, [x11] // increments
+ mov w7, w5
+ b.gt L(ipred_z3_fill1_large_h16)
+ br x8
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ dup v29.4h, w5 // dy
+
+ mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.4h, v29.4h, v30.4h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ mov v4.8b, v31.8b
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+
+ tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]
+
+ trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2
+ trn1 v24.2s, v24.2s, v24.2s // frac
+ trn1 v25.2s, v25.2s, v25.2s // 64 - frac
+1:
+ mov v5.8b, v31.8b
+ tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]
+
+ trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ rshrn v16.8b, v16.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ ext v4.8b, v5.8b, v5.8b, #4
+ uqadd v27.8b, v27.8b, v21.8b // base += 2
+ b 1b
+
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ dup v29.8h, w5 // dy
+
+ mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
+ ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[]
+ add v30.8h, v29.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ mov v4.8b, v31.8b
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+
+ tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
+1:
+ mov v5.8b, v31.8b
+ mov v6.8b, v31.8b
+ tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
+ tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull v17.8h, v5.8b, v25.8b
+ umlal v17.8h, v6.8b, v24.8b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.le 9f
+
+ mov v4.8b, v6.8b
+ uqadd v27.8b, v27.8b, v21.8b // base += 2
+ uqadd v28.8b, v28.8b, v21.8b // base += 2
+ b 1b
+
+9:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w5 // dy
+
+ shl v29.8h, v28.8h, #3 // 8*dy
+ mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // This is only executed if we've checked that max_base_y <= 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
+ add v28.8h, v28.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ add v29.8h, v28.8h, v29.8h // ypos + 8*dy
+
+ xtn v24.8b, v28.8h // (uint8_t)ypos
+ xtn2 v24.16b, v29.8h
+ uqshrn v26.8b, v28.8h, #6 // base
+ uqshrn2 v26.16b, v29.8h, #6
+ and v24.16b, v24.16b, v23.16b // frac
+
+ mov v4.16b, v31.16b
+ uqadd v27.16b, v26.16b, v20.16b // base + 1
+ uqadd v28.16b, v26.16b, v21.16b // base + 2
+ sub v25.16b, v22.16b, v24.16b // 64 - frac
+
+ tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
+1:
+ mov v5.16b, v31.16b
+ mov v6.16b, v31.16b
+ tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
+ tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ umull v18.8h, v5.8b, v25.8b
+ umlal v18.8h, v6.8b, v24.8b
+ umull2 v19.8h, v5.16b, v25.16b
+ umlal2 v19.8h, v6.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn2 v16.16b, v17.8h, #6
+ rshrn v17.8b, v18.8h, #6
+ rshrn2 v17.16b, v19.8h, #6
+ st1 {v16.16b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.16b}, [x0], x1
+ b.le 9f
+
+ mov v4.16b, v6.16b
+ uqadd v27.16b, v27.16b, v21.16b // base += 2
+ uqadd v28.16b, v28.16b, v21.16b // base += 2
+ b 1b
+
+9:
+ ret
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ dup v28.8h, w5 // dy
+ mov w12, w3
+
+ add x13, x0, x1
+
+ shl v29.8h, v28.8h, #3 // 8*dy
+ mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw
+ add v30.8h, v28.8h, v30.8h // ypos
+
+ // This is only executed if we've checked that max_base_y <= 64.
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+1:
+ mov v26.16b, v30.16b // reset ypos
+
+2:
+ add v27.8h, v26.8h, v29.8h // ypos + 8*dy
+ uqshrn v16.8b, v26.8h, #6 // base
+ uqshrn2 v16.16b, v27.8h, #6
+ xtn v24.8b, v26.8h // (uint8_t)ypos
+ xtn2 v24.16b, v27.8h
+ umov w14, v16.b[0]
+ and v24.16b, v24.16b, v23.16b // frac
+
+ uqadd v17.16b, v16.16b, v20.16b // base + 1
+ cmp w14, w6 // base >= max_base_y
+ uqadd v18.16b, v16.16b, v21.16b // base + 2
+ sub v25.16b, v22.16b, v24.16b // 64 - frac
+
+ b.ge 4f
+
+ mov v4.16b, v31.16b
+ mov v5.16b, v31.16b
+ mov v6.16b, v31.16b
+ tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
+ tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
+ tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]
+
+ subs w3, w3, #16
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ umull v18.8h, v5.8b, v25.8b
+ umlal v18.8h, v6.8b, v24.8b
+ umull2 v19.8h, v5.16b, v25.16b
+ umlal2 v19.8h, v6.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn2 v16.16b, v17.8h, #6
+ rshrn v17.8b, v18.8h, #6
+ rshrn2 v17.16b, v19.8h, #6
+ st1 {v16.16b}, [x0], #16
+ st1 {v17.16b}, [x13], #16
+ b.le 3f
+ add v26.8h, v27.8h, v29.8h // ypos += 16*dy
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ movi v16.8h, #128
+ add x0, x0, x1
+ add x13, x13, x1
+ add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2
+ mov w3, w12
+ b 1b
+
+4:
+ subs w3, w3, #16
+ st1 {v31.16b}, [x0], #16
+ st1 {v31.16b}, [x13], #16
+ b.gt 4b
+ b 3b
+
+9:
+ ret
+
+L(ipred_z3_fill1_large_h16):
+ // Fallback case for max_base_y > 64; similar to the z1
+ // implementation. This does the filtering vertically, filling out
+ // a 2x pixel column at a time.
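+ // A rough C sketch of the inner interpolation below (a sketch of what
+ // the NEON code computes, not dav1d's reference C): for one output
+ // column, with base = ypos >> 6 and frac = ypos & 0x3e, each of the
+ // 16 rows handled per iteration is
+ //   dst[y*stride] = (left[base + y] * (64 - frac) +
+ //                    left[base + y + 1] * frac + 32) >> 6;
+ // and ypos advances by dy per column.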
+ mov w15, #64
+ add x13, x0, x1
+ lsl x1, x1, #1
+
+ mov w12, w4
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw
+ add x10, x2, w10, uxtw
+ dup v4.16b, w9 // frac
+ dup v5.16b, w11
+ ld1 {v0.16b, v1.16b}, [x8], #32 // left[base]
+ ld1 {v2.16b, v3.16b}, [x10], #32
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.16b, w9 // 64 - frac
+ dup v7.16b, w11
+ add w7, w7, w5 // ypos += dy
+2:
+ ext v16.16b, v0.16b, v1.16b, #1 // left[base+1]
+ ext v17.16b, v2.16b, v3.16b, #1
+ subs w4, w4, #16
+ umull v18.8h, v16.8b, v4.8b // left[base+1]*frac
+ umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac)
+ umull2 v19.8h, v16.16b, v4.16b
+ umlal2 v19.8h, v0.16b, v6.16b
+ umull v20.8h, v17.8b, v5.8b
+ umlal v20.8h, v2.8b, v7.8b
+ umull2 v21.8h, v17.16b, v5.16b
+ umlal2 v21.8h, v2.16b, v7.16b
+ rshrn v16.8b, v18.8h, #6
+ rshrn2 v16.16b, v19.8h, #6
+ rshrn v17.8b, v20.8h, #6
+ rshrn2 v17.16b, v21.8h, #6
+ zip1 v18.16b, v16.16b, v17.16b
+ zip2 v19.16b, v16.16b, v17.16b
+ st1 {v18.h}[0], [x0], x1
+ st1 {v18.h}[1], [x13], x1
+ st1 {v18.h}[2], [x0], x1
+ st1 {v18.h}[3], [x13], x1
+ st1 {v18.h}[4], [x0], x1
+ st1 {v18.h}[5], [x13], x1
+ st1 {v18.h}[6], [x0], x1
+ st1 {v18.h}[7], [x13], x1
+ st1 {v19.h}[0], [x0], x1
+ st1 {v19.h}[1], [x13], x1
+ st1 {v19.h}[2], [x0], x1
+ st1 {v19.h}[3], [x13], x1
+ st1 {v19.h}[4], [x0], x1
+ st1 {v19.h}[5], [x13], x1
+ st1 {v19.h}[6], [x0], x1
+ st1 {v19.h}[7], [x13], x1
+ b.le 3f
+ mov v0.16b, v1.16b
+ ld1 {v1.16b}, [x8], #16 // left[base]
+ mov v2.16b, v3.16b
+ ld1 {v3.16b}, [x10], #16
+ b 2b
+
+3:
+ subs w3, w3, #2
+ b.le 9f
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #2
+ add x13, x13, #2
+ mov w4, w12
+ b 1b
+9:
+ ret
+
+L(ipred_z3_fill1_tbl):
+ .hword L(ipred_z3_fill1_tbl) - 640b
+ .hword L(ipred_z3_fill1_tbl) - 320b
+ .hword L(ipred_z3_fill1_tbl) - 160b
+ .hword L(ipred_z3_fill1_tbl) - 80b
+ .hword L(ipred_z3_fill1_tbl) - 40b
+endfunc
+
+function ipred_z3_fill_padding_neon, export=0
+ cmp w3, #16
+ adr x8, L(ipred_z3_fill_padding_tbl)
+ b.gt L(ipred_z3_fill_padding_wide)
+ // w3 = remaining width, w4 = constant height
+ mov w12, w4
+
+1:
+ // Fill a WxH rectangle with padding. W can be any number;
+ // this fills the exact width by filling in the largest
+ // power of two in the remaining width, and repeating.
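+ // Roughly, in C (a sketch with a hypothetical fill_rect() helper, not
+ // dav1d's actual C code); v31 holds the splatted padding value:
+ //   while (w > 0) {
+ //       int part = 1 << (31 - clz(w)); // largest power of two <= w
+ //       fill_rect(dst, stride, part, h, padding);
+ //       dst += part;
+ //       w   -= part;
+ //   }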
+ clz w9, w3
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ sub x9, x8, w9, uxtw
+ br x9
+
+2:
+ st1 {v31.h}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.h}[0], [x13], x1
+ st1 {v31.h}[0], [x0], x1
+ st1 {v31.h}[0], [x13], x1
+ b.gt 2b
+ subs w3, w3, #2
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #2
+ add x13, x13, #2
+ mov w4, w12
+ b 1b
+
+4:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.s}[0], [x13], x1
+ st1 {v31.s}[0], [x0], x1
+ st1 {v31.s}[0], [x13], x1
+ b.gt 4b
+ subs w3, w3, #4
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+
+8:
+ st1 {v31.8b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8b}, [x13], x1
+ st1 {v31.8b}, [x0], x1
+ st1 {v31.8b}, [x13], x1
+ b.gt 8b
+ subs w3, w3, #8
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #8
+ add x13, x13, #8
+ mov w4, w12
+ b 1b
+
+16:
+32:
+64:
+ st1 {v31.16b}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.16b}, [x13], x1
+ st1 {v31.16b}, [x0], x1
+ st1 {v31.16b}, [x13], x1
+ b.gt 16b
+ subs w3, w3, #16
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w4, w12
+ b 1b
+
+9:
+ ret
+
+L(ipred_z3_fill_padding_tbl):
+ .hword L(ipred_z3_fill_padding_tbl) - 64b
+ .hword L(ipred_z3_fill_padding_tbl) - 32b
+ .hword L(ipred_z3_fill_padding_tbl) - 16b
+ .hword L(ipred_z3_fill_padding_tbl) - 8b
+ .hword L(ipred_z3_fill_padding_tbl) - 4b
+ .hword L(ipred_z3_fill_padding_tbl) - 2b
+
+L(ipred_z3_fill_padding_wide):
+ // Fill a WxH rectangle with padding, with W > 16.
+ lsr x1, x1, #1
+ mov w12, w3
+ sub x1, x1, w3, uxtw
+1:
+ ands w5, w3, #15
+ b.eq 2f
+ // If the width isn't aligned to 16, first do one 16 byte write
+ // and align the start pointer.
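+ // (The head store may overlap the following aligned stores; that is
+ // harmless here since every byte written is the same padding value.)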
+ sub w3, w3, w5
+ st1 {v31.16b}, [x0]
+ add x0, x0, w5, uxtw
+2:
+ // Fill the rest of the line with aligned 16 byte writes.
+ subs w3, w3, #16
+ st1 {v31.16b}, [x0], #16
+ b.gt 2b
+ subs w4, w4, #1
+ add x0, x0, x1
+ b.le 9f
+ mov w3, w12
+ b 1b
+9:
+ ret
+endfunc
+
+function ipred_z3_fill2_8bpc_neon, export=1
+ cmp w3, #8
+ add x10, x2, w6, uxtw // left[max_base_y]
+ movrel x11, increments
+ ld1r {v31.16b}, [x10] // padding
+ ld1 {v30.8h}, [x11] // increments
+ b.eq 80f
+
+40: // w == 4
+ dup v29.4h, w5 // dy
+
+ mul v30.4h, v30.4h, v29.4h // {0,1,2,3}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
+ // so max_base_y <= 32.
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.4h, v29.4h, v30.4h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+ uqadd v29.8b, v27.8b, v21.8b // base + 3
+
+ trn1 v24.2s, v24.2s, v24.2s // frac
+ trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2
+ trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3
+ trn1 v25.2s, v25.2s, v25.2s // 64 - frac
+
+ movi v21.16b, #4
+1:
+ mov v4.8b, v31.8b
+ mov v5.8b, v31.8b
+ tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
+ tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ rshrn v16.8b, v16.8h, #6
+ st1 {v16.s}[0], [x0], x1
+ subs w4, w4, #2
+ st1 {v16.s}[1], [x0], x1
+ b.le 9f
+
+ uqadd v26.8b, v26.8b, v21.8b // base += 4
+ uqadd v27.8b, v27.8b, v21.8b // base += 4
+ b 1b
+
+9:
+ ret
+
+80: // w == 8
+ dup v29.8h, w5 // dy
+
+ mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
+ movi v23.16b, #0x3e
+
+ // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
+ // so max_base_y <= 32.
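+ // (2*16 - 2 = 30, so the 32 bytes loaded below cover every in-range
+ // index, and tbx leaves the v31 padding value in place for any
+ // saturated index beyond the table.)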
+ ld1 {v0.16b, v1.16b}, [x2] // left[]
+ add v30.8h, v29.8h, v30.8h // ypos
+
+ movi v22.16b, #64
+ movi v20.16b, #1
+ movi v21.16b, #2
+
+ xtn v24.8b, v30.8h // (uint8_t)ypos
+ uqshrn v26.8b, v30.8h, #6 // base
+ and v24.8b, v24.8b, v23.8b // frac
+
+ uqadd v27.8b, v26.8b, v20.8b // base + 1
+ uqadd v28.8b, v26.8b, v21.8b // base + 2
+ sub v25.8b, v22.8b, v24.8b // 64 - frac
+ uqadd v29.8b, v27.8b, v21.8b // base + 3
+
+ trn1 v24.2d, v24.2d, v24.2d // frac
+ trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2
+ trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3
+ trn1 v25.2d, v25.2d, v25.2d // 64 - frac
+
+ movi v21.16b, #4
+1:
+ mov v4.16b, v31.16b
+ mov v5.16b, v31.16b
+ tbx v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
+ tbx v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]
+
+ umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
+ umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
+ umull2 v17.8h, v4.16b, v25.16b
+ umlal2 v17.8h, v5.16b, v24.16b
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ st1 {v16.8b}, [x0], x1
+ subs w4, w4, #2
+ st1 {v17.8b}, [x0], x1
+ b.le 9f
+
+ uqadd v26.16b, v26.16b, v21.16b // base += 4
+ uqadd v27.16b, v27.16b, v21.16b // base += 4
+ b 1b
+
+9:
+ ret
+endfunc
+
+
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height);
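+ // A rough per-block sketch of the math below (not dav1d's reference C):
+ // every 4x2 output block is predicted from seven pixels
+ //   p0 = topleft, p1..p4 = top[0..3], p5 = left[0], p6 = left[1]
+ // using the 8 taps per pixel loaded into v16-v22, i.e. for each of the
+ // 8 output positions i:
+ //   acc    = filter0[i]*p0 + filter1[i]*p1 + ... + filter6[i]*p6;
+ //   out[i] = iclip_pixel((acc + 8) >> 4);   // sqrshrun #4
+ // Wider blocks iterate this 4x2 kernel across the row, feeding each
+ // block's outputs back in as the next block's top/left pixels.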
+function ipred_filter_8bpc_neon, export=1
+ and w5, w5, #511
+ movrel x6, X(filter_intra_taps)
+ lsl w5, w5, #6
+ add x6, x6, w5, uxtw
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+ clz w9, w3
+ adr x5, L(ipred_filter_tbl)
+ ld1 {v20.8b, v21.8b, v22.8b}, [x6]
+ sub w9, w9, #26
+ ldrh w9, [x5, w9, uxtw #1]
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ sub x5, x5, w9, uxtw
+ sxtl v18.8h, v18.8b
+ sxtl v19.8h, v19.8b
+ add x6, x0, x1
+ lsl x1, x1, #1
+ sxtl v20.8h, v20.8b
+ sxtl v21.8h, v21.8b
+ sxtl v22.8h, v22.8b
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ldur s0, [x2, #1] // top (0-3)
+ sub x2, x2, #2
+ mov x7, #-2
+ uxtl v0.8h, v0.8b // top (0-3)
+4:
+ ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ sqrshrun v2.8b, v2.8h, #4
+ subs w4, w4, #2
+ st1 {v2.s}[0], [x0], x1
+ uxtl v0.8h, v2.8b
+ st1 {v2.s}[1], [x6], x1
+ ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3]
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ldur d0, [x2, #1] // top (0-7)
+ sub x2, x2, #2
+ mov x7, #-2
+ uxtl v0.8h, v0.8b // top (0-7)
+8:
+ ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v2.8b, v2.8h, #4
+ uxtl v1.8h, v2.8b // first block, in 16 bit
+ mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6)
+ sqrshrun v3.8b, v3.8h, #4
+ subs w4, w4, #2
+ st2 {v2.s, v3.s}[0], [x0], x1
+ zip2 v0.2s, v2.2s, v3.2s
+ st2 {v2.s, v3.s}[1], [x6], x1
+ uxtl v0.8h, v0.8b
+ b.gt 8b
+ ret
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x2, #1
+ sub x2, x2, #2
+ mov x7, #-2
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+
+1:
+ ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2)
+ uxtl v0.8h, v0.8b // left (0-1) + topleft (2)
+2:
+ ld1 {v2.16b}, [x8], #16 // top(0-15)
+ mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ uxtl v1.8h, v2.8b // top(0-7)
+ uxtl2 v2.8h, v2.16b // top(8-15)
+ mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v3.8b, v3.8h, #4
+ uxtl v0.8h, v3.8b // first block, in 16 bit
+ mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ sqrshrun v4.8b, v4.8h, #4
+ uxtl v0.8h, v4.8b // second block, in 16 bit
+ mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v5.8b, v5.8h, #4
+ uxtl v0.8h, v5.8b // third block, in 16 bit
+ mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
+ mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ sqrshrun v6.8b, v6.8h, #4
+
+ st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
+ st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
+ b.le 8f
+ ins v0.h[2], v2.h[7]
+ ins v0.b[0], v6.b[7]
+ ins v0.b[2], v6.b[3]
+ b 2b
+8:
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x6, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_filter_tbl):
+ .hword L(ipred_filter_tbl) - 320b
+ .hword L(ipred_filter_tbl) - 160b
+ .hword L(ipred_filter_tbl) - 80b
+ .hword L(ipred_filter_tbl) - 40b
+endfunc
+
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint16_t *const pal, const uint8_t *idx,
+// const int w, const int h);
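+ // A minimal sketch of what the tbl-based loops below do (not the
+ // reference C): the 8-entry palette is narrowed to bytes and used as a
+ // lookup table, so each output pixel is roughly
+ //   dst[y*stride + x] = (pixel) pal[idx[y*w + x]];
+ // with one palette index per byte of idx in this version.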
+function pal_pred_8bpc_neon, export=1
+ ld1 {v0.8h}, [x2]
+ clz w9, w4
+ adr x6, L(pal_pred_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x6, w9, uxtw #1]
+ xtn v0.8b, v0.8h
+ sub x6, x6, w9, uxtw
+ add x2, x0, x1
+ lsl x1, x1, #1
+ br x6
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b}, [x3], #16
+ subs w5, w5, #4
+ tbl v1.16b, {v0.16b}, v1.16b
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x2], x1
+ st1 {v1.s}[2], [x0], x1
+ st1 {v1.s}[3], [x2], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x3], #32
+ subs w5, w5, #4
+ tbl v1.16b, {v0.16b}, v1.16b
+ st1 {v1.d}[0], [x0], x1
+ tbl v2.16b, {v0.16b}, v2.16b
+ st1 {v1.d}[1], [x2], x1
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x2], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
+ subs w5, w5, #4
+ tbl v1.16b, {v0.16b}, v1.16b
+ tbl v2.16b, {v0.16b}, v2.16b
+ st1 {v1.16b}, [x0], x1
+ tbl v3.16b, {v0.16b}, v3.16b
+ st1 {v2.16b}, [x2], x1
+ tbl v4.16b, {v0.16b}, v4.16b
+ st1 {v3.16b}, [x0], x1
+ st1 {v4.16b}, [x2], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
+ subs w5, w5, #4
+ tbl v16.16b, {v0.16b}, v16.16b
+ tbl v17.16b, {v0.16b}, v17.16b
+ tbl v18.16b, {v0.16b}, v18.16b
+ tbl v19.16b, {v0.16b}, v19.16b
+ tbl v20.16b, {v0.16b}, v20.16b
+ st1 {v16.16b, v17.16b}, [x0], x1
+ tbl v21.16b, {v0.16b}, v21.16b
+ st1 {v18.16b, v19.16b}, [x2], x1
+ tbl v22.16b, {v0.16b}, v22.16b
+ st1 {v20.16b, v21.16b}, [x0], x1
+ tbl v23.16b, {v0.16b}, v23.16b
+ st1 {v22.16b, v23.16b}, [x2], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
+ subs w5, w5, #2
+ tbl v16.16b, {v0.16b}, v16.16b
+ tbl v17.16b, {v0.16b}, v17.16b
+ tbl v18.16b, {v0.16b}, v18.16b
+ tbl v19.16b, {v0.16b}, v19.16b
+ st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+ tbl v20.16b, {v0.16b}, v20.16b
+ tbl v21.16b, {v0.16b}, v21.16b
+ tbl v22.16b, {v0.16b}, v22.16b
+ tbl v23.16b, {v0.16b}, v23.16b
+ st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
+ b.gt 64b
+ ret
+
+L(pal_pred_tbl):
+ .hword L(pal_pred_tbl) - 64b
+ .hword L(pal_pred_tbl) - 32b
+ .hword L(pal_pred_tbl) - 16b
+ .hword L(pal_pred_tbl) - 8b
+ .hword L(pal_pred_tbl) - 4b
+endfunc
+
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
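+ // The splat loops below rely on a small rounding identity instead of
+ // an explicit abs()/apply_sign() pair (a note on the math, not taken
+ // from the reference C): with diff = ac*alpha and
+ // sign = (diff < 0) ? -1 : 0,
+ //   (diff + sign + 32) >> 6 == apply_sign((abs(diff) + 32) >> 6, diff)
+ // for any integer diff, so cmlt + add + srshr #6 reproduces
+ // "round the magnitude, then restore the sign".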
+function ipred_cfl_128_8bpc_neon, export=1
+ clz w9, w3
+ adr x7, L(ipred_cfl_128_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ movi v0.8h, #128 // dc
+ dup v1.8h, w6 // alpha
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+L(ipred_cfl_splat_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ cmlt v4.8h, v2.8h, #0 // sign
+ cmlt v5.8h, v3.8h, #0
+ add v2.8h, v2.8h, v4.8h // diff + sign
+ add v3.8h, v3.8h, v5.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [x0], x1
+ st1 {v2.s}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v3.s}[0], [x0], x1
+ st1 {v3.s}[1], [x6], x1
+ b.gt L(ipred_cfl_splat_w4)
+ ret
+L(ipred_cfl_splat_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ mul v4.8h, v4.8h, v1.8h
+ mul v5.8h, v5.8h, v1.8h
+ cmlt v16.8h, v2.8h, #0 // sign
+ cmlt v17.8h, v3.8h, #0
+ cmlt v18.8h, v4.8h, #0
+ cmlt v19.8h, v5.8h, #0
+ add v2.8h, v2.8h, v16.8h // diff + sign
+ add v3.8h, v3.8h, v17.8h
+ add v4.8h, v4.8h, v18.8h
+ add v5.8h, v5.8h, v19.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ srshr v4.8h, v4.8h, #6
+ srshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ add v4.8h, v4.8h, v0.8h
+ add v5.8h, v5.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x6], x1
+ subs w4, w4, #4
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x6], x1
+ b.gt L(ipred_cfl_splat_w8)
+ ret
+L(ipred_cfl_splat_w16):
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x5, w3, uxtw #1
+ sub x1, x1, w3, uxtw
+ mov w9, w3
+1:
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ ld1 {v4.8h, v5.8h}, [x7], #32
+ mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
+ mul v3.8h, v3.8h, v1.8h
+ mul v4.8h, v4.8h, v1.8h
+ mul v5.8h, v5.8h, v1.8h
+ cmlt v16.8h, v2.8h, #0 // sign
+ cmlt v17.8h, v3.8h, #0
+ cmlt v18.8h, v4.8h, #0
+ cmlt v19.8h, v5.8h, #0
+ add v2.8h, v2.8h, v16.8h // diff + sign
+ add v3.8h, v3.8h, v17.8h
+ add v4.8h, v4.8h, v18.8h
+ add v5.8h, v5.8h, v19.8h
+ srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ srshr v3.8h, v3.8h, #6
+ srshr v4.8h, v4.8h, #6
+ srshr v5.8h, v5.8h, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ add v4.8h, v4.8h, v0.8h
+ add v5.8h, v5.8h, v0.8h
+ sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ sqxtun v5.8b, v5.8h
+ subs w3, w3, #16
+ st1 {v2.8b, v3.8b}, [x0], #16
+ st1 {v4.8b, v5.8b}, [x6], #16
+ b.gt 1b
+ subs w4, w4, #2
+ add x5, x5, w9, uxtw #1
+ add x7, x7, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b.gt 1b
+ ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
+ clz w9, w3
+ adr x7, L(ipred_cfl_top_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ dup v1.8h, w6 // alpha
+ add x2, x2, #1
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2]
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v2.4h, v2.4h, v3.4h
+ urshr v2.4h, v2.4h, #5
+ dup v0.8h, v2.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+ .hword L(ipred_cfl_top_tbl) - 32b
+ .hword L(ipred_cfl_top_tbl) - 16b
+ .hword L(ipred_cfl_top_tbl) - 8b
+ .hword L(ipred_cfl_top_tbl) - 4b
+endfunc
+
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ clz w9, w3
+ clz w8, w4
+ adr x10, L(ipred_cfl_splat_tbl)
+ adr x7, L(ipred_cfl_left_tbl)
+ sub w9, w9, #26
+ sub w8, w8, #26
+ ldrh w9, [x10, w9, uxtw #1]
+ ldrh w8, [x7, w8, uxtw #1]
+ dup v1.8h, w6 // alpha
+ sub x9, x10, w9, uxtw
+ sub x7, x7, w8, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+
+L(ipred_cfl_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2]
+ uaddlv h0, v0.8b
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2]
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add v2.4h, v2.4h, v3.4h
+ urshr v2.4h, v2.4h, #5
+ dup v0.8h, v2.h[0]
+ br x9
+
+L(ipred_cfl_left_tbl):
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha);
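+ // A note on the DC math below (my summary, not the reference C):
+ // dc = (sum_top + sum_left + ((w+h) >> 1)) / (w+h). When w+h is a
+ // power of two, the ushl by -ctz(w+h) is the whole division; otherwise
+ // w+h is 3 or 5 times a power of two, and the remaining /3 or /5 is
+ // approximated with sqdmulh, which computes (2*a*b) >> 16:
+ //   sqdmulh(x, 0x5556/2) ~= x/3   (0x2AAB ~= 2^15/3)
+ //   sqdmulh(x, 0x3334/2) ~= x/5   (0x199A ~= 2^15/5)
+ // which is what the height-dependent constants in the w4/w8/w16/w32
+ // tails below select between.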
+function ipred_cfl_8bpc_neon, export=1
+ sub x2, x2, w4, uxtw
+ add w8, w3, w4 // width + height
+ dup v1.8h, w6 // alpha
+ clz w9, w3
+ clz w6, w4
+ dup v16.8h, w8 // width + height
+ adr x7, L(ipred_cfl_tbl)
+ rbit w8, w8 // rbit(width + height)
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
+ sub w6, w6, #26
+ clz w8, w8 // ctz(width + height)
+ ldrh w9, [x7, w9, uxtw #1]
+ ldrh w6, [x7, w6, uxtw #1]
+ neg w8, w8 // -ctz(width + height)
+ sub x9, x7, w9, uxtw
+ sub x7, x7, w6, uxtw
+ ushr v16.8h, v16.8h, #1 // (width + height) >> 1
+ dup v17.8h, w8 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x7
+
+L(ipred_cfl_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], #4
+ ins v0.s[1], wzr
+ add x2, x2, #1
+ uaddlv h0, v0.8b
+ br x9
+L(ipred_cfl_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.s}[0], [x2]
+ ins v2.s[1], wzr
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.8b
+ cmp w4, #4
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16
+ mov w16, #(0x3334/2)
+ movk w16, #(0x5556/2), lsl #16
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], #8
+ uaddlv h0, v0.8b
+ add x2, x2, #1
+ br x9
+L(ipred_cfl_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.8b
+ cmp w4, #8
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x2], #16
+ uaddlv h0, v0.16b
+ add x2, x2, #1
+ br x9
+L(ipred_cfl_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.16b
+ cmp w4, #16
+ add v0.4h, v0.4h, v2.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 4/8/32
+ cmp w4, #4
+ mov w16, #(0x3334/2)
+ mov w17, #(0x5556/2)
+ csel w16, w16, w17, eq
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2], #32
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ add x2, x2, #1
+ add v0.4h, v2.4h, v3.4h
+ br x9
+L(ipred_cfl_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b, v3.16b}, [x2]
+ add v0.4h, v0.4h, v16.4h
+ uaddlv h2, v2.16b
+ uaddlv h3, v3.16b
+ cmp w4, #32
+ add v0.4h, v0.4h, v2.4h
+ add v0.4h, v0.4h, v3.4h
+ ushl v0.4h, v0.4h, v17.4h
+ b.eq 1f
+ // h = 8/16
+ mov w16, #(0x5556/2)
+ movk w16, #(0x3334/2), lsl #16
+ add w17, w4, w4 // w17 = 2*h = 16 or 32
+ lsr w16, w16, w17
+ dup v16.4h, w16
+ sqdmulh v0.4h, v0.4h, v16.4h
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
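+ // A rough sketch of the 4:2:0 AC generation below (a sketch, not the
+ // reference C): each chroma position sums a 2x2 luma block, scaled so
+ // that all cfl_ac layouts share the same fixed-point range:
+ //   ac[x] = (y0[2*x] + y0[2*x + 1] + y1[2*x] + y1[2*x + 1]) << 1;
+ // right/bottom padding replicates the last column/row, and finally the
+ // rounded average of the whole buffer is subtracted:
+ //   ac[i] -= (sum + (1 << (log2sz - 1))) >> log2sz;  // log2sz = log2(w*h)
+ // (the 4:2:2 and 4:4:4 variants below differ only in the subsampling
+ // and in scaling by << 2 and << 3 respectively).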
+function ipred_cfl_ac_420_8bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_420_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_420_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x10], x2
+ ld1 {v0.d}[1], [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h}, [x0], #16
+ add v16.8h, v16.8h, v0.8h
+ b.gt 1b
+ trn2 v1.2d, v0.2d, v0.2d
+ trn2 v0.2d, v0.2d, v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ b.gt 2b
+3:
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
+ uaddlv s0, v0.8h // sum
+ sub x0, x0, w6, uxtw #3
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x10], x2
+ ld1 {v2.16b}, [x1], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v3.16b}, [x10], x2
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ add v0.8h, v0.8h, v1.8h
+ add v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v2.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ b.gt 1b
+ mov v0.16b, v1.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x10], x2
+ ld1 {v0.d}[1], [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.4h, v0.h[3]
+ dup v3.4h, v0.h[7]
+ trn2 v2.2d, v0.2d, v0.2d
+ subs w8, w8, #2
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+ add v16.4h, v16.4h, v0.4h
+ add v17.4h, v17.4h, v1.4h
+ add v18.4h, v18.4h, v2.4h
+ add v19.4h, v19.4h, v3.4h
+ b.gt 1b
+ trn1 v0.2d, v2.2d, v3.2d
+ trn1 v1.2d, v2.2d, v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ add v18.8h, v18.8h, v0.8h
+ add v19.8h, v19.8h, v1.8h
+ b.gt 2b
+3:
+
+L(ipred_cfl_ac_420_w8_calc_subtract_dc):
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
+ add v2.8h, v18.8h, v19.8h
+ uaddlp v0.4s, v0.8h
+ uaddlp v2.4s, v2.8h
+ add v0.4s, v0.4s, v2.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #4
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+L(ipred_cfl_ac_420_w8_subtract_dc):
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ sub v2.8h, v2.8h, v4.8h
+ sub v3.8h, v3.8h, v4.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_420_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ ld1 {v2.16b, v3.16b}, [x10], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v4.16b, v5.16b}, [x1], x2
+ uaddlp v1.8h, v1.16b
+ ld1 {v6.16b, v7.16b}, [x10], x2
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ uaddlp v4.8h, v4.16b
+ uaddlp v5.8h, v5.16b
+ uaddlp v6.8h, v6.16b
+ uaddlp v7.8h, v7.16b
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ add v4.8h, v4.8h, v6.8h
+ add v5.8h, v5.8h, v7.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v1.8h, #1
+ shl v2.8h, v4.8h, #1
+ shl v3.8h, v5.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr d1, [x1, #16]
+ ld1 {v0.16b}, [x1], x2
+ ldr d3, [x10, #16]
+ ld1 {v2.16b}, [x10], x2
+ uaddlp v1.4h, v1.8b
+ ldr d5, [x1, #16]
+ uaddlp v0.8h, v0.16b
+ ld1 {v4.16b}, [x1], x2
+ uaddlp v3.4h, v3.8b
+ ldr d7, [x10, #16]
+ uaddlp v2.8h, v2.16b
+ ld1 {v6.16b}, [x10], x2
+ uaddlp v5.4h, v5.8b
+ uaddlp v4.8h, v4.16b
+ uaddlp v7.4h, v7.8b
+ uaddlp v6.8h, v6.16b
+ add v1.4h, v1.4h, v3.4h
+ add v0.8h, v0.8h, v2.8h
+ add v5.4h, v5.4h, v7.4h
+ add v4.8h, v4.8h, v6.8h
+ shl v1.4h, v1.4h, #1
+ shl v0.8h, v0.8h, #1
+ shl v3.4h, v5.4h, #1
+ shl v2.8h, v4.8h, #1
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v2.16b}, [x10], x2
+ ld1 {v4.16b}, [x1], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v6.16b}, [x10], x2
+ uaddlp v2.8h, v2.16b
+ uaddlp v4.8h, v4.16b
+ uaddlp v6.8h, v6.16b
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v2.8h, v4.8h, #1
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v2.8b}, [x10], x2
+ ld1 {v4.8b}, [x1], x2
+ uaddlp v0.4h, v0.8b
+ ld1 {v6.8b}, [x10], x2
+ uaddlp v2.4h, v2.8b
+ uaddlp v4.4h, v4.8b
+ uaddlp v6.4h, v6.8b
+ add v0.4h, v0.4h, v2.4h
+ add v4.4h, v4.4h, v6.4h
+ shl v0.4h, v0.4h, #1
+ shl v2.4h, v4.4h, #1
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v2.h[3]
+ trn1 v0.2d, v0.2d, v1.2d
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+
+L(ipred_cfl_ac_420_w16_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 2b
+3:
+
+ // Double the height and reuse the w8 summing/subtracting
+ lsl w6, w6, #1
+ b L(ipred_cfl_ac_420_w8_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+ .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_422_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_422_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v0.d}[1], [x10], x2
+ ld1 {v1.8b}, [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v1.8h, #2
+ subs w8, w8, #4
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x10], x2
+ ld1 {v2.16b}, [x1], x2
+ uaddlp v0.8h, v0.16b
+ ld1 {v3.16b}, [x10], x2
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v1.8h, #2
+ shl v2.8h, v2.8h, #2
+ shl v3.8h, v3.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v0.d}[1], [x10], x2
+ ld1 {v2.8b}, [x1], x2
+ ld1 {v2.d}[1], [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v2.8h, v2.16b
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v0.h[3]
+ dup v5.8h, v0.h[7]
+ dup v6.4h, v2.h[3]
+ dup v7.8h, v2.h[7]
+ trn2 v1.2d, v0.2d, v5.2d
+ trn1 v0.2d, v0.2d, v4.2d
+ trn2 v3.2d, v2.2d, v7.2d
+ trn1 v2.2d, v2.2d, v6.2d
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_422_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ ld1 {v2.16b, v3.16b}, [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v1.8h, #2
+ shl v2.8h, v2.8h, #2
+ shl v3.8h, v3.8h, #2
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr d1, [x1, #16]
+ ld1 {v0.16b}, [x1], x2
+ ldr d3, [x10, #16]
+ ld1 {v2.16b}, [x10], x2
+ uaddlp v1.4h, v1.8b
+ uaddlp v0.8h, v0.16b
+ uaddlp v3.4h, v3.8b
+ uaddlp v2.8h, v2.16b
+ shl v1.4h, v1.4h, #2
+ shl v0.8h, v0.8h, #2
+ shl v3.4h, v3.4h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v2.16b}, [x10], x2
+ uaddlp v0.8h, v0.16b
+ uaddlp v2.8h, v2.16b
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v2.8b}, [x10], x2
+ uaddlp v0.4h, v0.8b
+ uaddlp v2.4h, v2.8b
+ shl v0.4h, v0.4h, #2
+ shl v2.4h, v2.4h, #2
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v2.h[3]
+ trn1 v0.2d, v0.2d, v1.2d
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+ .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_8bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_444_tbl)
+ sub w8, w8, #26
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.s}[0], [x1], x2
+ ld1 {v0.s}[1], [x10], x2
+ ld1 {v1.s}[0], [x1], x2
+ ld1 {v1.s}[1], [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v1.8h, v1.8b, #3
+ subs w8, w8, #4
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x10], x2
+ ld1 {v2.8b}, [x1], x2
+ ushll v0.8h, v0.8b, #3
+ ld1 {v3.8b}, [x10], x2
+ ushll v1.8h, v1.8b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll v3.8h, v3.8b, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v2.16b}, [x10], x2
+ ld1 {v4.16b}, [x1], x2
+ ushll2 v1.8h, v0.16b, #3
+ ushll v0.8h, v0.8b, #3
+ ld1 {v6.16b}, [x10], x2
+ ushll2 v3.8h, v2.16b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll2 v5.8h, v4.16b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll2 v7.8h, v6.16b, #3
+ ushll v6.8h, v6.8b, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v2.8b}, [x10], x2
+ ld1 {v4.8b}, [x1], x2
+ ld1 {v6.8b}, [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v2.8h, v2.8b, #3
+ ushll v4.8h, v4.8b, #3
+ ushll v6.8h, v6.8b, #3
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ dup v5.8h, v4.h[7]
+ dup v7.8h, v6.h[7]
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ mov v0.16b, v6.16b
+ mov v1.16b, v7.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_444_w32_tbl)
+ ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, without padding
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ ld1 {v6.16b, v7.16b}, [x10], x2
+ ushll v0.8h, v2.8b, #3
+ ushll2 v1.8h, v2.16b, #3
+ ushll v2.8h, v3.8b, #3
+ ushll2 v3.8h, v3.16b, #3
+ ushll v4.8h, v6.8b, #3
+ ushll2 v5.8h, v6.16b, #3
+ ushll v6.8h, v7.8b, #3
+ ushll2 v7.8h, v7.16b, #3
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 8
+ ldr d2, [x1, #16]
+ ld1 {v1.16b}, [x1], x2
+ ldr d6, [x10, #16]
+ ld1 {v5.16b}, [x10], x2
+ ushll v2.8h, v2.8b, #3
+ ushll v0.8h, v1.8b, #3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v6.8h, v6.8b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v3.8h, v2.h[7]
+ dup v7.8h, v6.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 16
+ ld1 {v1.16b}, [x1], x2
+ ld1 {v5.16b}, [x10], x2
+ ushll v0.8h, v1.8b, #3
+ ushll2 v1.8h, v1.16b, #3
+ ushll v4.8h, v5.8b, #3
+ ushll2 v5.8h, v5.16b, #3
+ dup v2.8h, v1.h[7]
+ dup v3.8h, v1.h[7]
+ dup v6.8h, v5.h[7]
+ dup v7.8h, v5.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 24
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v4.8b}, [x10], x2
+ ushll v0.8h, v0.8b, #3
+ ushll v4.8h, v4.8b, #3
+ dup v1.8h, v0.h[7]
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ dup v5.8h, v4.h[7]
+ dup v6.8h, v4.h[7]
+ dup v7.8h, v4.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ add v16.8h, v16.8h, v4.8h
+ add v17.8h, v17.8h, v5.8h
+ add v18.8h, v18.8h, v6.8h
+ add v19.8h, v19.8h, v7.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the w8 subtracting
+ lsl w6, w6, #2
+ // Aggregate the sums, with wider intermediates earlier than in
+ // ipred_cfl_ac_420_w8_calc_subtract_dc.
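+ // (With w == 32 and the << 3 scaling the .8h accumulators are already
+ // close to the 16-bit limit, so they are widened with uaddlp before
+ // being added to each other.)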
+ uaddlp v0.4s, v16.8h
+ uaddlp v1.4s, v17.8h
+ uaddlp v2.4s, v18.8h
+ uaddlp v3.4s, v19.8h
+ add v0.4s, v0.4s, v1.4s
+ add v2.4s, v2.4s, v3.4s
+ add v0.4s, v0.4s, v2.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #4
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+ b L(ipred_cfl_ac_420_w8_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S
new file mode 100644
index 0000000000..c48c48583c
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/ipred16.S
@@ -0,0 +1,4204 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
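+ // (A small worked example, not from the C code: the neutral DC is
+ // (bitdepth_max + 1) >> 1, i.e. 512 at 10 bpc and 2048 at 12 bpc,
+ // which the urshr #1 below computes from the bitdepth_max argument.)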
+function ipred_dc_128_16bpc_neon, export=1
+ ldr w8, [sp]
+ clz w3, w3
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ dup v0.8h, w8
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ urshr v0.8h, v0.8h, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ sub x1, x1, #64
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 160b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ sub x1, x1, #64
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ sub x2, x2, #8
+ sub x5, x5, w3, uxtw
+ mov x7, #-8
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.4h}, [x0], x1
+ st1 {v2.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ stp q3, q3, [x0, #64]
+ stp q2, q2, [x6, #64]
+ stp q3, q3, [x0, #96]
+ stp q2, q2, [x6, #96]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ stp q1, q1, [x0, #64]
+ stp q0, q0, [x6, #64]
+ stp q1, q1, [x0, #96]
+ stp q0, q0, [x6, #96]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.4h, v0.h[0]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #5
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #6
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
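+//
+// Same shape as the ipred_dc_top sketch above, except the rounded average is
+// taken over the left column instead of the top row:
+//
+//   dc = (sum(topleft[-1 .. -height]) + (height >> 1)) / height;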
+function ipred_dc_left_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w7, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w4):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w8):
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+ br x3
+L(ipred_dc_left_w16):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlp v0.4s, v0.8h
+ addv s0, v0.4s
+ rshrn v4.4h, v0.4s, #5
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w32):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #6
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w64):
+ AARCH64_VALID_JUMP_TARGET
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ sub x1, x1, #64
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
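+//
+// Rough outline (illustrative, not the dav1d C reference): the DC value is
+// the rounded average of the top row and the left column combined,
+//
+//   dc = (sum(top[0 .. width-1]) + sum(left[0 .. height-1]) +
+//         ((width + height) >> 1)) / (width + height);
+//
+// When width != height, width + height is not a power of two, so the code
+// below shifts right by ctz(width + height) and then multiplies by a
+// fixed-point reciprocal of the remaining factor (3 or 5) - see the
+// 0xAAAB/0x6667 constants.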
+function ipred_dc_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.4s, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w6, uxtw
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w7 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h
+ add x2, x2, #2
+ br x3
+L(ipred_dc_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #16
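+ // 0x6667 ~= (1 << 17) / 5 and 0xAAAB ~= (1 << 17) / 3: multiplying the
+ // power-of-two-shifted sum by one of these and shifting right by 17
+ // divides by the remaining factor (5 or 3) of width + height.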
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.4h, v0.h[0]
+2:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h
+ add x2, x2, #2
+ br x3
+L(ipred_dc_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+2:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h}, [x2], #32
+ addp v0.8h, v0.8h, v1.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ uaddlv s1, v1.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ addp v3.8h, v3.8h, v4.8h
+ addp v1.8h, v1.8h, v3.8h
+ uaddlv s1, v1.8h
+ cmp w4, #32
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w64):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
+ addp v3.8h, v3.8h, v4.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ addp v1.8h, v1.8h, v3.8h
+ addp v20.8h, v20.8h, v22.8h
+ addp v1.8h, v1.8h, v20.8h
+ uaddlv s1, v1.8h
+ cmp w4, #64
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 16/32
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
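+//
+// Rough C-style sketch of the Paeth predictor applied to every pixel
+// (illustrative only; abs() as in <stdlib.h>):
+//
+//   static uint16_t paeth_sketch(uint16_t left, uint16_t top, uint16_t topleft)
+//   {
+//       const int base   = left + top - topleft;
+//       const int ldiff  = abs(base - left);
+//       const int tdiff  = abs(base - top);
+//       const int tldiff = abs(base - topleft);
+//       // pick whichever of the three neighbours is closest to base
+//       return ldiff <= tdiff && ldiff <= tldiff ? left
+//                                : tdiff <= tldiff ? top : topleft;
+//   }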
+function ipred_paeth_16bpc_neon, export=1
+ clz w9, w3
+ adr x5, L(ipred_paeth_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x2]
+ add x8, x2, #2
+ sub x2, x2, #8
+ sub x5, x5, w9, uxtw
+ mov x7, #-8
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v5.2d}, [x8]
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v2.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v4.8h, v16.8h // tldiff
+ sabd v23.8h, v4.8h, v17.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v2.8h, v17.8h
+ umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff)
+ umin v19.8h, v21.8h, v23.8h
+ cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v23.8h, v21.8h
+ cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v19.8h, v17.8h
+ bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v20.16b, v5.16b, v4.16b
+ bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v21.d}[1], [x0], x1
+ st1 {v21.d}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.d}[1], [x0], x1
+ st1 {v20.d}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v5.8h}, [x8], #16
+ mov w9, w3
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+1:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+2:
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v1.8h
+ add v18.8h, v6.8h, v2.8h
+ add v19.8h, v6.8h, v3.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v5.8h, v18.8h
+ sabd v23.8h, v5.8h, v19.8h
+ sabd v24.8h, v4.8h, v16.8h // tldiff
+ sabd v25.8h, v4.8h, v17.8h
+ sabd v26.8h, v4.8h, v18.8h
+ sabd v27.8h, v4.8h, v19.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v1.8h, v17.8h
+ sabd v18.8h, v2.8h, v18.8h
+ sabd v19.8h, v3.8h, v19.8h
+ umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff)
+ umin v29.8h, v21.8h, v25.8h
+ umin v30.8h, v22.8h, v26.8h
+ umin v31.8h, v23.8h, v27.8h
+ cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v25.8h, v21.8h
+ cmge v22.8h, v26.8h, v22.8h
+ cmge v23.8h, v27.8h, v23.8h
+ cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v29.8h, v17.8h
+ cmge v18.8h, v30.8h, v18.8h
+ cmge v19.8h, v31.8h, v19.8h
+ bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v22.16b, v5.16b, v4.16b
+ bsl v21.16b, v5.16b, v4.16b
+ bsl v20.16b, v5.16b, v4.16b
+ bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
+ bit v22.16b, v2.16b, v18.16b
+ bit v21.16b, v1.16b, v17.16b
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v23.8h}, [x0], #16
+ st1 {v22.8h}, [x6], #16
+ subs w3, w3, #8
+ st1 {v21.8h}, [x5], #16
+ st1 {v20.8h}, [x10], #16
+ b.le 8f
+ ld1 {v5.8h}, [x8], #16
+ b 2b
+8:
+ subs w4, w4, #4
+ b.le 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub x8, x8, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ // Load the top row as early as possible
+ ld1 {v5.8h}, [x8], #16
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_paeth_tbl):
+ .hword L(ipred_paeth_tbl) - 640b
+ .hword L(ipred_paeth_tbl) - 320b
+ .hword L(ipred_paeth_tbl) - 160b
+ .hword L(ipred_paeth_tbl) - 80b
+ .hword L(ipred_paeth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
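+//
+// Rough formula sketch (illustrative; w_hor[]/w_ver[] are the sm_weights
+// entries for the block width/height, "right" is the last top pixel and
+// "bottom" the last left pixel):
+//
+//   pred[y][x] = (top[x] * w_ver[y] + bottom * (256 - w_ver[y]) +
+//                 left[y] * w_hor[x] + right * (256 - w_hor[x]) + 256) >> 9;
+//
+// The code below accumulates the equivalent rearranged form
+// (bottom + right) * 256 + (left[y] - right) * w_hor[x]
+// + (top[x] - bottom) * w_ver[y], which needs only two multiply-accumulate
+// chains per pixel.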
+function ipred_smooth_16bpc_neon, export=1
+ movrel x10, X(sm_weights)
+ add x11, x10, w4, uxtw
+ add x10, x10, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_tbl)
+ sub x12, x2, w4, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x12] // bottom
+ add x8, x2, #2
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2d}, [x8] // top
+ ld1r {v7.2s}, [x10] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ dup v5.8h, v6.h[3] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v0.8h, v7.8h
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v18.4h
+ smlal2 v23.4s, v6.8h, v18.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn v21.4h, v21.4s, #9
+ rshrn v22.4h, v22.4s, #9
+ rshrn v23.4h, v23.4s, #9
+ st1 {v20.4h}, [x0], x1
+ st1 {v21.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.4h}, [x0], x1
+ st1 {v23.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8h}, [x8] // top
+ ld1 {v7.8b}, [x10] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ dup v5.8h, v6.h[7] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v3.8h, v7.8h // (left flipped)
+ smlal v22.4s, v2.4h, v7.4h
+ smlal2 v23.4s, v2.8h, v7.8h
+ smlal v24.4s, v1.4h, v7.4h
+ smlal2 v25.4s, v1.8h, v7.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v17.4h
+ smlal2 v23.4s, v6.8h, v17.8h
+ smlal v24.4s, v6.4h, v18.4h
+ smlal2 v25.4s, v6.8h, v18.8h
+ smlal v26.4s, v6.4h, v19.4h
+ smlal2 v27.4s, v6.8h, v19.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x12, x2, w3, uxtw #1
+ sub x1, x1, w3, uxtw #1
+ ld1r {v5.8h}, [x12] // right
+ sub x2, x2, #4
+ mov x7, #-4
+ mov w9, w3
+ add v31.4h, v4.4h, v5.4h // bottom+right
+
+1:
+ ld2r {v0.8h, v1.8h}, [x2], x7 // left
+ ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+2:
+ ld1 {v7.16b}, [x10], #16 // weights_hor
+ ld1 {v2.8h, v3.8h}, [x8], #32 // top
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v1.8h, v6.8h // (left flipped)
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v24.4s, v0.4h, v6.4h
+ smlal2 v25.4s, v0.8h, v6.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v2.8h, v16.8h
+ smlal v22.4s, v3.4h, v16.4h
+ smlal2 v23.4s, v3.8h, v16.8h
+ smlal v24.4s, v2.4h, v17.4h
+ smlal2 v25.4s, v2.8h, v17.8h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal2 v27.4s, v3.8h, v17.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x8, w9, uxtw #1
+ sub x10, x10, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_tbl):
+ .hword L(ipred_smooth_tbl) - 640b
+ .hword L(ipred_smooth_tbl) - 320b
+ .hword L(ipred_smooth_tbl) - 160b
+ .hword L(ipred_smooth_tbl) - 80b
+ .hword L(ipred_smooth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
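+//
+// Rough formula sketch (illustrative): only the vertical blend is applied,
+//
+//   pred[y][x] = (top[x] * w_ver[y] + bottom * (256 - w_ver[y]) + 128) >> 8;
+//
+// evaluated below as bottom + sqrdmulh(top[x] - bottom, w_ver[y] << 7),
+// since sqrdmulh(a, w << 7) == (a * w + 128) >> 8 for these value ranges.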
+function ipred_smooth_v_16bpc_neon, export=1
+ movrel x7, X(sm_weights)
+ add x7, x7, w4, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_v_tbl)
+ sub x8, x2, w4, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x8] // bottom
+ add x2, x2, #2
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v6.2d}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+4:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v18.8h, v18.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v18.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v6.8h}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+8:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v17.8h
+ sqrdmulh v22.8h, v6.8h, v18.8h
+ sqrdmulh v23.8h, v6.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ // Set up pointers for four rows in parallel; x0, x6, x5, x8
+ add x5, x0, x1
+ add x8, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+2:
+ ld1 {v2.8h, v3.8h}, [x2], #32 // top
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v16.8h
+ sqrdmulh v22.8h, v2.8h, v17.8h
+ sqrdmulh v23.8h, v3.8h, v17.8h
+ sqrdmulh v24.8h, v2.8h, v18.8h
+ sqrdmulh v25.8h, v3.8h, v18.8h
+ sqrdmulh v26.8h, v2.8h, v19.8h
+ sqrdmulh v27.8h, v3.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ add v24.8h, v24.8h, v4.8h
+ add v25.8h, v25.8h, v4.8h
+ add v26.8h, v26.8h, v4.8h
+ add v27.8h, v27.8h, v4.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x8], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x2, x2, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x8, x8, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_v_tbl):
+ .hword L(ipred_smooth_v_tbl) - 640b
+ .hword L(ipred_smooth_v_tbl) - 320b
+ .hword L(ipred_smooth_v_tbl) - 160b
+ .hword L(ipred_smooth_v_tbl) - 80b
+ .hword L(ipred_smooth_v_tbl) - 40b
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
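+//
+// Rough formula sketch (illustrative): the horizontal-only counterpart of
+// ipred_smooth_v above,
+//
+//   pred[y][x] = (left[y] * w_hor[x] + right * (256 - w_hor[x]) + 128) >> 8;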
+function ipred_smooth_h_16bpc_neon, export=1
+ movrel x8, X(sm_weights)
+ add x8, x8, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_h_tbl)
+ add x12, x2, w3, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v5.8h}, [x12] // right
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v7.2s}, [x8] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v1.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v7.8b}, [x8] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v3.8h, v3.8h, v5.8h // left-right
+ sub v2.8h, v2.8h, v5.8h
+ sub v1.8h, v1.8h, v5.8h
+ sub v0.8h, v0.8h, v5.8h
+ sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v1.8h, v7.8h
+ sqrdmulh v23.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ sub x2, x2, #8
+ mov x7, #-8
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+2:
+ ld1 {v7.16b}, [x8], #16 // weights_hor
+ ushll v6.8h, v7.8b, #7 // weights_hor << 7
+ ushll2 v7.8h, v7.16b, #7
+ sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v2.8h, v6.8h
+ sqrdmulh v23.8h, v2.8h, v7.8h
+ sqrdmulh v24.8h, v1.8h, v6.8h
+ sqrdmulh v25.8h, v1.8h, v7.8h
+ sqrdmulh v26.8h, v0.8h, v6.8h
+ sqrdmulh v27.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ add v24.8h, v24.8h, v5.8h
+ add v25.8h, v25.8h, v5.8h
+ add v26.8h, v26.8h, v5.8h
+ add v27.8h, v27.8h, v5.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x10], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x8, x8, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_h_tbl):
+ .hword L(ipred_smooth_h_tbl) - 640b
+ .hword L(ipred_smooth_h_tbl) - 320b
+ .hword L(ipred_smooth_h_tbl) - 160b
+ .hword L(ipred_smooth_h_tbl) - 80b
+ .hword L(ipred_smooth_h_tbl) - 40b
+endfunc
+
+const padding_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+padding_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
+// const pixel *const in, const int end,
+// const int bitdepth_max);
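+//
+// Rough sketch of the upsampling (illustrative; the clamping of out-of-range
+// taps to the last valid input pixel is omitted): every input pixel is kept
+// and an extra pixel is interpolated between each pair with a
+// (-1, 9, 9, -1)/16 kernel,
+//
+//   out[2*i]     = in[i];
+//   out[2*i + 1] = clip((-in[i-1] + 9*in[i] + 9*in[i+1] - in[i+2] + 8) >> 4,
+//                       0, bitdepth_max);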
+function ipred_z1_upsample_edge_16bpc_neon, export=1
+ dup v30.8h, w4 // bitdepth_max
+ movrel x4, padding_mask
+ ld1 {v0.8h, v1.8h}, [x2] // in[]
+ add x5, x2, w3, uxtw #1 // in[end]
+ sub x4, x4, w3, uxtw #1
+
+ ld1r {v2.8h}, [x5] // padding
+ ld1 {v3.8h, v4.8h}, [x4] // padding_mask
+
+ movi v31.8h, #9
+
+ bit v0.16b, v2.16b, v3.16b // padded in[]
+ bit v1.16b, v2.16b, v4.16b
+
+ ext v4.16b, v0.16b, v1.16b, #2
+ ext v5.16b, v1.16b, v2.16b, #2
+ ext v6.16b, v0.16b, v1.16b, #4
+ ext v7.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+
+ add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2]
+ add v19.8h, v5.8h, v7.8h
+ add v20.8h, v0.8h, v16.8h
+ add v21.8h, v1.8h, v17.8h
+ umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2])
+ umull2 v23.4s, v18.8h, v31.8h
+ umull v24.4s, v19.4h, v31.4h
+ umull2 v25.4s, v19.8h, v31.8h
+ usubw v22.4s, v22.4s, v20.4h
+ usubw2 v23.4s, v23.4s, v20.8h
+ usubw v24.4s, v24.4s, v21.4h
+ usubw2 v25.4s, v25.4s, v21.8h
+
+ sqrshrun v16.4h, v22.4s, #4
+ sqrshrun2 v16.8h, v23.4s, #4
+ sqrshrun v17.4h, v24.4s, #4
+ sqrshrun2 v17.8h, v25.4s, #4
+
+ smin v16.8h, v16.8h, v30.8h
+ smin v17.8h, v17.8h, v30.8h
+
+ zip1 v0.8h, v4.8h, v16.8h
+ zip2 v1.8h, v4.8h, v16.8h
+ zip1 v2.8h, v5.8h, v17.8h
+ zip2 v3.8h, v5.8h, v17.8h
+
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+
+ ret
+endfunc
+
+const edge_filter
+ .short 0, 4, 8, 0
+ .short 0, 5, 6, 0
+// Leaving out the coeffs for strength=3
+// .short 2, 4, 4, 0
+endconst
+
+// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
+// const pixel *const in, const int end,
+// const int strength);
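+//
+// Rough sketch of the smoothing (illustrative; clamping of the tap indices
+// to [0, end] is omitted): each output pixel is a symmetric 5-tap FIR over
+// the edge, with the kernel k = kernels[strength - 1],
+//
+//   static const uint16_t kernels[3][5] = {
+//       { 0, 4, 8, 4, 0 },   // strength 1
+//       { 0, 5, 6, 5, 0 },   // strength 2
+//       { 2, 4, 4, 4, 2 },   // strength 3 (the L(fivetap) path below)
+//   };
+//   out[i] = (in[i-2] * k[0] + in[i-1] * k[1] + in[i] * k[2] +
+//             in[i+1] * k[3] + in[i+2] * k[4] + 8) >> 4;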
+function ipred_z1_filter_edge_16bpc_neon, export=1
+ cmp w4, #3
+ b.eq L(fivetap) // if (strength == 3) goto fivetap
+
+ movrel x5, edge_filter, -6
+ add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1)
+
+ ld1 {v31.s}[0], [x5] // kernel[1-2]
+
+ ld1 {v0.8h}, [x2], #16
+
+ dup v30.8h, v31.h[0]
+ dup v31.8h, v31.h[1]
+1:
+ // in[end] is the last valid pixel. We produce 16 pixels out by
+ // using 18 pixels in - the last pixel used is [17] of the ones
+ // read/buffered.
+ cmp w3, #17
+ ld1 {v1.8h, v2.8h}, [x2], #32
+ b.lt 2f
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ mul v16.8h, v0.8h, v30.8h
+ mla v16.8h, v3.8h, v31.8h
+ mla v16.8h, v5.8h, v30.8h
+ mul v17.8h, v1.8h, v30.8h
+ mla v17.8h, v4.8h, v31.8h
+ mla v17.8h, v6.8h, v30.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ urshr v16.8h, v16.8h, #4
+ urshr v17.8h, v17.8h, #4
+ sub w3, w3, #16
+ st1 {v16.8h, v17.8h}, [x0], #32
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
+ movrel x5, padding_mask
+ sub w6, w3, #24
+ sub x5, x5, w3, uxtw #1
+ add x6, x2, w6, sxtw #1
+
+ ld1 {v3.8h, v4.8h}, [x5] // padding_mask
+
+ ld1r {v2.8h}, [x6]
+ bit v0.16b, v2.16b, v3.16b // Pad v0-v1
+ bit v1.16b, v2.16b, v4.16b
+
+ // Filter one block
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ mul v16.8h, v0.8h, v30.8h
+ mla v16.8h, v3.8h, v31.8h
+ mla v16.8h, v5.8h, v30.8h
+ mul v17.8h, v1.8h, v30.8h
+ mla v17.8h, v4.8h, v31.8h
+ mla v17.8h, v6.8h, v30.8h
+ subs w1, w1, #16
+ urshr v16.8h, v16.8h, #4
+ urshr v17.8h, v17.8h, #4
+ st1 {v16.8h, v17.8h}, [x0], #32
+ b.le 9f
+5:
+ // After one block, any remaining output would only be filtering
+ // padding - thus just store the padding.
+ subs w1, w1, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 5b
+9:
+ ret
+
+L(fivetap):
+ sub x2, x2, #2 // topleft -= 1 pixel
+ movi v29.8h, #2
+ ld1 {v0.8h}, [x2], #16
+ movi v30.8h, #4
+ movi v31.8h, #4
+ ins v0.h[0], v0.h[1]
+1:
+ // in[end+1] is the last valid pixel. We produce 16 pixels out by
+ // using 20 pixels in - the last pixel used is [19] of the ones
+ // read/buffered.
+ cmp w3, #18
+ ld1 {v1.8h, v2.8h}, [x2], #32
+ b.lt 2f // if (end + 1 < 19)
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+ ext v18.16b, v0.16b, v1.16b, #8
+ ext v19.16b, v1.16b, v2.16b, #8
+ mul v20.8h, v0.8h, v29.8h
+ mla v20.8h, v3.8h, v30.8h
+ mla v20.8h, v5.8h, v31.8h
+ mla v20.8h, v16.8h, v30.8h
+ mla v20.8h, v18.8h, v29.8h
+ mul v21.8h, v1.8h, v29.8h
+ mla v21.8h, v4.8h, v30.8h
+ mla v21.8h, v6.8h, v31.8h
+ mla v21.8h, v17.8h, v30.8h
+ mla v21.8h, v19.8h, v29.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ urshr v20.8h, v20.8h, #4
+ urshr v21.8h, v21.8h, #4
+ sub w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ b.gt 1b
+ ret
+2:
+ // Right padding
+
+ // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
+ movrel x5, padding_mask, -2
+ sub w6, w3, #23
+ sub x5, x5, w3, uxtw #1
+ add x6, x2, w6, sxtw #1
+
+ ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask
+
+ ld1r {v28.8h}, [x6]
+ bit v0.16b, v28.16b, v3.16b // Pad v0-v2
+ bit v1.16b, v28.16b, v4.16b
+ bit v2.16b, v28.16b, v5.16b
+4:
+ // Filter one block
+ ext v3.16b, v0.16b, v1.16b, #2
+ ext v4.16b, v1.16b, v2.16b, #2
+ ext v5.16b, v0.16b, v1.16b, #4
+ ext v6.16b, v1.16b, v2.16b, #4
+ ext v16.16b, v0.16b, v1.16b, #6
+ ext v17.16b, v1.16b, v2.16b, #6
+ ext v18.16b, v0.16b, v1.16b, #8
+ ext v19.16b, v1.16b, v2.16b, #8
+ mul v20.8h, v0.8h, v29.8h
+ mla v20.8h, v3.8h, v30.8h
+ mla v20.8h, v5.8h, v31.8h
+ mla v20.8h, v16.8h, v30.8h
+ mla v20.8h, v18.8h, v29.8h
+ mul v21.8h, v1.8h, v29.8h
+ mla v21.8h, v4.8h, v30.8h
+ mla v21.8h, v6.8h, v31.8h
+ mla v21.8h, v17.8h, v30.8h
+ mla v21.8h, v19.8h, v29.8h
+ subs w1, w1, #16
+ mov v0.16b, v2.16b
+ mov v1.16b, v28.16b
+ mov v2.16b, v28.16b
+ urshr v20.8h, v20.8h, #4
+ urshr v21.8h, v21.8h, #4
+ sub w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ b.le 9f
+ // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
+ // filter properly once more - aka (w3 >= 0).
+ cmp w3, #0
+ b.ge 4b
+5:
+ // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
+ // last valid pixel - thus just output that without filtering.
+ subs w1, w1, #8
+ st1 {v28.8h}, [x0], #16
+ b.gt 5b
+9:
+ ret
+endfunc
+
+// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
+// const int n);
+function ipred_pixel_set_16bpc_neon, export=1
+ dup v0.8h, w1
+1:
+ subs w2, w2, #8
+ st1 {v0.8h}, [x0], #16
+ b.gt 1b
+ ret
+endfunc
+
+// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const top,
+// const int width, const int height,
+// const int dx, const int max_base_x);
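+//
+// Rough C-style sketch of the directional fill (illustrative; the padding
+// rows emitted once base >= max_base_x are omitted, and stride is a pixel
+// count here):
+//
+//   int xpos = dx;
+//   for (int y = 0; y < height; y++, xpos += dx, dst += stride) {
+//       const int base = xpos >> 6;
+//       const int frac = xpos & 0x3e;
+//       for (int x = 0; x < width; x++)
+//           dst[x] = (top[base + x] * (64 - frac) +
+//                     top[base + x + 1] * frac + 32) >> 6;
+//   }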
+function ipred_z1_fill1_16bpc_neon, export=1
+ clz w9, w3
+ adr x8, L(ipred_z1_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw #1 // top[max_base_x]
+ sub x8, x8, w9, uxtw
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ br x8
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ ext v1.16b, v0.16b, v0.16b, #2 // top[base+1]
+ ext v3.16b, v2.16b, v2.16b, #2
+ sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // top[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ st1 {v16.4h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.4h}, [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.4h}, [x0], x1
+ b.gt 49b
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h}, [x8] // top[base]
+ ld1 {v2.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ ldr h1, [x8, #16]
+ ldr h3, [x10, #16]
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ ext v1.16b, v0.16b, v1.16b, #2 // top[base+1]
+ ext v3.16b, v2.16b, v3.16b, #2
+ umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
+ umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v1.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v3.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v3.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ st1 {v16.8h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8h}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8h}, [x0], x1
+ b.gt 89b
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+
+ mov w12, w3
+
+ add x13, x0, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 169f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v6.8h, w9 // frac
+ dup v7.8h, w11
+ ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base]
+ ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v16.8h, w9 // 64 - frac
+ dup v17.8h, w11
+ add w7, w7, w5 // xpos += dx
+2:
+ ext v18.16b, v0.16b, v1.16b, #2 // top[base+1]
+ ext v19.16b, v1.16b, v2.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #2
+ ext v21.16b, v4.16b, v5.16b, #2
+ subs w3, w3, #16
+ umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac)
+ umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac
+ umull2 v23.4s, v0.8h, v16.8h
+ umlal2 v23.4s, v18.8h, v6.8h
+ umull v24.4s, v1.4h, v16.4h
+ umlal v24.4s, v19.4h, v6.4h
+ umull2 v25.4s, v1.8h, v16.8h
+ umlal2 v25.4s, v19.8h, v6.8h
+ umull v26.4s, v3.4h, v17.4h
+ umlal v26.4s, v20.4h, v7.4h
+ umull2 v27.4s, v3.8h, v17.8h
+ umlal2 v27.4s, v20.8h, v7.8h
+ umull v28.4s, v4.4h, v17.4h
+ umlal v28.4s, v21.4h, v7.4h
+ umull2 v29.4s, v4.8h, v17.8h
+ umlal2 v29.4s, v21.8h, v7.8h
+ rshrn v22.4h, v22.4s, #6
+ rshrn2 v22.8h, v23.4s, #6
+ rshrn v23.4h, v24.4s, #6
+ rshrn2 v23.8h, v25.4s, #6
+ rshrn v24.4h, v26.4s, #6
+ rshrn2 v24.8h, v27.4s, #6
+ rshrn v25.4h, v28.4s, #6
+ rshrn2 v25.8h, v29.4s, #6
+ st1 {v22.8h, v23.8h}, [x0], #32
+ st1 {v24.8h, v25.8h}, [x13], #32
+ b.le 3f
+ mov v0.16b, v2.16b
+ ld1 {v1.8h, v2.8h}, [x8], #32 // top[base]
+ mov v3.16b, v5.16b
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ b 2b
+
+3:
+ subs w4, w4, #2
+ b.le 9f
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 1b
+9:
+ ret
+
+169:
+ st1 {v31.8h}, [x0], #16
+ subs w3, w3, #8
+ st1 {v31.8h}, [x13], #16
+ b.gt 169b
+ subs w4, w4, #2
+ b.le 9b
+ add x0, x0, x1
+ add x13, x13, x1
+ mov w3, w12
+ b 169b
+
+L(ipred_z1_fill1_tbl):
+ .hword L(ipred_z1_fill1_tbl) - 640b
+ .hword L(ipred_z1_fill1_tbl) - 320b
+ .hword L(ipred_z1_fill1_tbl) - 160b
+ .hword L(ipred_z1_fill1_tbl) - 80b
+ .hword L(ipred_z1_fill1_tbl) - 40b
+endfunc
+
+function ipred_z1_fill2_16bpc_neon, export=1
+ cmp w3, #8
+ add x10, x2, w6, uxtw // top[max_base_x]
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ b.eq 8f
+
+4: // w == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 49f
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // top[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8h, v0.8h, v0.8h // top[base+1]
+ uzp1 v0.8h, v0.8h, v0.8h // top[base]
+ uzp2 v3.8h, v2.8h, v2.8h
+ uzp1 v2.8h, v2.8h, v2.8h
+ sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // top[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ st1 {v16.4h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.4h}, [x0], x1
+ b.gt 4b
+ ret
+
+49:
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.4h}, [x0], x1
+ b.gt 49b
+ ret
+
+8: // w == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // xpos += dx
+ cmp w8, w6 // base >= max_base_x
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge 89f
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h, v1.8h}, [x8] // top[base]
+ ld1 {v2.8h, v3.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ uzp2 v20.8h, v0.8h, v1.8h // top[base+1]
+ uzp1 v0.8h, v0.8h, v1.8h // top[base]
+ uzp2 v21.8h, v2.8h, v3.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+ umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
+ umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v20.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v21.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v21.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ st1 {v16.8h}, [x0], x1
+ add w7, w7, w5 // xpos += dx
+ subs w4, w4, #2
+ st1 {v17.8h}, [x0], x1
+ b.gt 8b
+ ret
+
+89:
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #2
+ st1 {v31.8h}, [x0], x1
+ b.gt 89b
+ ret
+endfunc
+
+// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
+// const int n);
+function ipred_reverse_16bpc_neon, export=1
+ sub x1, x1, #16
+ add x3, x0, #8
+ mov x4, #16
+1:
+ ld1 {v0.8h}, [x1]
+ subs w2, w2, #8
+ rev64 v0.8h, v0.8h
+ sub x1, x1, #16
+ st1 {v0.d}[1], [x0], x4
+ st1 {v0.d}[0], [x3], x4
+ b.gt 1b
+ ret
+endfunc
+
+// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const left,
+// const int width, const int height,
+// const int dy, const int max_base_y);
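+//
+// Same interpolation as in the sketch before ipred_z1_fill1 above, but
+// stepping by dy through the left edge and writing the block column by
+// column (which is why the stores below scatter single lanes down x0/x13).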
+function ipred_z3_fill1_16bpc_neon, export=1
+ clz w9, w4
+ adr x8, L(ipred_z3_fill1_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ add x10, x2, w6, uxtw #1 // left[max_base_y]
+ sub x8, x8, w9, uxtw
+ ld1r {v31.8h}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ add x13, x0, x1
+ lsl x1, x1, #1
+ br x8
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+4:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // left[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ext v1.16b, v0.16b, v0.16b, #2 // left[base+1]
+ ext v3.16b, v2.16b, v2.16b, #2
+ sub v6.4h, v1.4h, v0.4h // left[base+1]-left[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // left[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + left[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[2], [x0]
+ st1 {v18.s}[3], [x13]
+ b.le 9f
+ sub x0, x0, x1 // ptr -= 2 * stride
+ sub x13, x13, x1
+ add x0, x0, #4
+ add x13, x13, #4
+ b 4b
+9:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+8:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h}, [x8] // left[base]
+ ld1 {v2.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ ldr h1, [x8, #16]
+ ldr h3, [x10, #16]
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ ext v1.16b, v0.16b, v1.16b, #2 // left[base+1]
+ ext v3.16b, v2.16b, v3.16b, #2
+ umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac)
+ umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v1.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v3.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v3.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ zip2 v19.8h, v16.8h, v17.8h
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ b.le 9f
+ sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride)
+ sub x13, x13, x1, lsl #2
+ add x0, x0, #4
+ add x13, x13, #4
+ b 8b
+9:
+ ret
+
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET
+ mov w12, w4
+1:
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v6.8h, w9 // frac
+ dup v7.8h, w11
+ ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base]
+ ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v16.8h, w9 // 64 - frac
+ dup v17.8h, w11
+ add w7, w7, w5 // ypos += dy
+2:
+ ext v18.16b, v0.16b, v1.16b, #2 // left[base+1]
+ ext v19.16b, v1.16b, v2.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #2
+ ext v21.16b, v4.16b, v5.16b, #2
+ subs w4, w4, #16
+ umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac)
+ umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac
+ umull2 v23.4s, v0.8h, v16.8h
+ umlal2 v23.4s, v18.8h, v6.8h
+ umull v24.4s, v1.4h, v16.4h
+ umlal v24.4s, v19.4h, v6.4h
+ umull2 v25.4s, v1.8h, v16.8h
+ umlal2 v25.4s, v19.8h, v6.8h
+ umull v26.4s, v3.4h, v17.4h
+ umlal v26.4s, v20.4h, v7.4h
+ umull2 v27.4s, v3.8h, v17.8h
+ umlal2 v27.4s, v20.8h, v7.8h
+ umull v28.4s, v4.4h, v17.4h
+ umlal v28.4s, v21.4h, v7.4h
+ umull2 v29.4s, v4.8h, v17.8h
+ umlal2 v29.4s, v21.8h, v7.8h
+ rshrn v22.4h, v22.4s, #6
+ rshrn2 v22.8h, v23.4s, #6
+ rshrn v23.4h, v24.4s, #6
+ rshrn2 v23.8h, v25.4s, #6
+ rshrn v24.4h, v26.4s, #6
+ rshrn2 v24.8h, v27.4s, #6
+ rshrn v25.4h, v28.4s, #6
+ rshrn2 v25.8h, v29.4s, #6
+ zip1 v18.8h, v22.8h, v24.8h
+ zip2 v19.8h, v22.8h, v24.8h
+ zip1 v20.8h, v23.8h, v25.8h
+ zip2 v21.8h, v23.8h, v25.8h
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x13], x1
+ st1 {v20.s}[2], [x0], x1
+ st1 {v20.s}[3], [x13], x1
+ st1 {v21.s}[0], [x0], x1
+ st1 {v21.s}[1], [x13], x1
+ st1 {v21.s}[2], [x0], x1
+ st1 {v21.s}[3], [x13], x1
+ b.le 3f
+ mov v0.16b, v2.16b
+ ld1 {v1.8h, v2.8h}, [x8], #32 // left[base]
+ mov v3.16b, v5.16b
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ b 2b
+
+3:
+ subs w3, w3, #2
+ b.le 9f
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+9:
+ ret
+
+L(ipred_z3_fill1_tbl):
+ .hword L(ipred_z3_fill1_tbl) - 640b
+ .hword L(ipred_z3_fill1_tbl) - 320b
+ .hword L(ipred_z3_fill1_tbl) - 160b
+ .hword L(ipred_z3_fill1_tbl) - 80b
+ .hword L(ipred_z3_fill1_tbl) - 40b
+endfunc
+
+function ipred_z3_fill_padding_neon, export=0
+ cmp w3, #8
+ adr x8, L(ipred_z3_fill_padding_tbl)
+ b.gt L(ipred_z3_fill_padding_wide)
+ // w3 = remaining width, w4 = constant height
+ mov w12, w4
+
+1:
+ // Fill a WxH rectangle with padding. W can be any number;
+ // this fills the exact width by filling in the largest
+ // power of two in the remaining width, and repeating.
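+ // e.g. a remaining width of 6 is handled as a 4-wide fill over the full
+ // height followed by a 2-wide fill.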
+ clz w9, w3
+ sub w9, w9, #25
+ ldrh w9, [x8, w9, uxtw #1]
+ sub x9, x8, w9, uxtw
+ br x9
+
+2:
+ st1 {v31.s}[0], [x0], x1
+ subs w4, w4, #4
+ st1 {v31.s}[0], [x13], x1
+ st1 {v31.s}[0], [x0], x1
+ st1 {v31.s}[0], [x13], x1
+ b.gt 2b
+ subs w3, w3, #2
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #4
+ add x13, x13, #4
+ mov w4, w12
+ b 1b
+
+4:
+ st1 {v31.4h}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.4h}, [x13], x1
+ st1 {v31.4h}, [x0], x1
+ st1 {v31.4h}, [x13], x1
+ b.gt 4b
+ subs w3, w3, #4
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #8
+ add x13, x13, #8
+ mov w4, w12
+ b 1b
+
+8:
+16:
+32:
+64:
+ st1 {v31.8h}, [x0], x1
+ subs w4, w4, #4
+ st1 {v31.8h}, [x13], x1
+ st1 {v31.8h}, [x0], x1
+ st1 {v31.8h}, [x13], x1
+ b.gt 4b
+ subs w3, w3, #8
+ lsr x1, x1, #1
+ msub x0, x1, x12, x0 // ptr -= h * stride
+ msub x13, x1, x12, x13
+ b.le 9f
+ lsl x1, x1, #1
+ add x0, x0, #16
+ add x13, x13, #16
+ mov w4, w12
+ b 1b
+
+9:
+ ret
+
+L(ipred_z3_fill_padding_tbl):
+ .hword L(ipred_z3_fill_padding_tbl) - 64b
+ .hword L(ipred_z3_fill_padding_tbl) - 32b
+ .hword L(ipred_z3_fill_padding_tbl) - 16b
+ .hword L(ipred_z3_fill_padding_tbl) - 8b
+ .hword L(ipred_z3_fill_padding_tbl) - 4b
+ .hword L(ipred_z3_fill_padding_tbl) - 2b
+
+L(ipred_z3_fill_padding_wide):
+ // Fill a WxH rectangle with padding, with W > 8.
+ lsr x1, x1, #1
+ mov w12, w3
+ sub x1, x1, w3, uxtw #1
+1:
+ ands w5, w3, #7
+ b.eq 2f
+ // If the width isn't aligned to 8, first do one 8 pixel write
+ // and align the start pointer.
+ sub w3, w3, w5
+ st1 {v31.8h}, [x0]
+ add x0, x0, w5, uxtw #1
+2:
+ // Fill the rest of the line with aligned 8 pixel writes.
+ subs w3, w3, #8
+ st1 {v31.8h}, [x0], #16
+ b.gt 2b
+ subs w4, w4, #1
+ add x0, x0, x1
+ b.le 9f
+ mov w3, w12
+ b 1b
+9:
+ ret
+endfunc
+
+function ipred_z3_fill2_16bpc_neon, export=1
+ cmp w4, #8
+ add x10, x2, w6, uxtw // left[max_base_y]
+ ld1r {v31.16b}, [x10] // padding
+ mov w7, w5
+ mov w15, #64
+ add x13, x0, x1
+ lsl x1, x1, #1
+ b.eq 8f
+
+4: // h == 4
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ lsl w8, w8, #1
+ lsl w10, w10, #1
+ ldr q0, [x2, w8, uxtw] // left[base]
+ ldr q2, [x2, w10, uxtw]
+ dup v4.4h, w9 // frac
+ dup v5.4h, w11
+ uzp2 v1.8h, v0.8h, v0.8h // left[base+1]
+ uzp1 v0.8h, v0.8h, v0.8h // left[base]
+ uzp2 v3.8h, v2.8h, v2.8h
+ uzp1 v2.8h, v2.8h, v2.8h
+ sub v6.4h, v1.4h, v0.4h // left[base+1]-left[base]
+ sub v7.4h, v3.4h, v2.4h
+ ushll v16.4s, v0.4h, #6 // left[base]*64
+ ushll v17.4s, v2.4h, #6
+ smlal v16.4s, v6.4h, v4.4h // + left[base+1]*frac
+ smlal v17.4s, v7.4h, v5.4h
+ rshrn v16.4h, v16.4s, #6
+ rshrn v17.4h, v17.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[2], [x0]
+ st1 {v18.s}[3], [x13]
+ b.le 9f
+ sub x0, x0, x1 // ptr -= 2 * stride
+ sub x13, x13, x1
+ add x0, x0, #4
+ add x13, x13, #4
+ b 4b
+9:
+ ret
+
+8: // h == 8
+ lsr w8, w7, #6 // base
+ and w9, w7, #0x3e // frac
+ add w7, w7, w5 // ypos += dy
+ cmp w8, w6 // base >= max_base_y
+ lsr w10, w7, #6 // base
+ and w11, w7, #0x3e // frac
+ b.ge ipred_z3_fill_padding_neon
+ add x8, x2, w8, uxtw #1
+ add x10, x2, w10, uxtw #1
+ dup v4.8h, w9 // frac
+ dup v5.8h, w11
+ ld1 {v0.8h, v1.8h}, [x8] // left[base]
+ ld1 {v2.8h, v3.8h}, [x10]
+ sub w9, w15, w9 // 64 - frac
+ sub w11, w15, w11
+ dup v6.8h, w9 // 64 - frac
+ dup v7.8h, w11
+ uzp2 v20.8h, v0.8h, v1.8h // left[base+1]
+ uzp1 v0.8h, v0.8h, v1.8h // left[base]
+ uzp2 v21.8h, v2.8h, v3.8h
+ uzp1 v2.8h, v2.8h, v3.8h
+ umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac)
+ umlal v16.4s, v20.4h, v4.4h // + left[base+1]*frac
+ umull2 v17.4s, v0.8h, v6.8h
+ umlal2 v17.4s, v20.8h, v4.8h
+ umull v18.4s, v2.4h, v7.4h
+ umlal v18.4s, v21.4h, v5.4h
+ umull2 v19.4s, v2.8h, v7.8h
+ umlal2 v19.4s, v21.8h, v5.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ subs w3, w3, #2
+ zip1 v18.8h, v16.8h, v17.8h
+ zip2 v19.8h, v16.8h, v17.8h
+ add w7, w7, w5 // ypos += dy
+ st1 {v18.s}[0], [x0], x1
+ st1 {v18.s}[1], [x13], x1
+ st1 {v18.s}[2], [x0], x1
+ st1 {v18.s}[3], [x13], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v19.s}[1], [x13], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v19.s}[3], [x13], x1
+ b.le 9f
+ sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride)
+ sub x13, x13, x1, lsl #2
+ add x0, x0, #4
+ add x13, x13, #4
+ b 8b
+9:
+ ret
+endfunc
+
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
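+//
+// Rough sketch of the filter-intra mode (illustrative, not the dav1d C
+// reference; the per-tap coefficient layout of filter_intra_taps is
+// interleaved, see the v16-v22 loads below): the block is built from 4x2
+// sub-blocks, and each output pixel is a 7-tap combination of the
+// sub-block's top-left neighbour (p0), the four pixels above it (p1-p4) and
+// the two pixels to its left (p5, p6),
+//
+//   int acc = 0;
+//   for (int j = 0; j < 7; j++)
+//       acc += taps_for_this_output[j] * p[j];
+//   out = clip((acc + 8) >> 4, 0, bitdepth_max);
+//
+// The outputs of each 4x2 sub-block then become the top/left inputs of the
+// sub-blocks below and to the right of it.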
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon
+ and w5, w5, #511
+ movrel x6, X(filter_intra_taps)
+ lsl w5, w5, #6
+ add x6, x6, w5, uxtw
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+ clz w9, w3
+ adr x5, L(ipred_filter\bpc\()_tbl)
+ ld1 {v20.8b, v21.8b, v22.8b}, [x6]
+ sub w9, w9, #26
+ ldrh w9, [x5, w9, uxtw #1]
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ sub x5, x5, w9, uxtw
+ sxtl v18.8h, v18.8b
+ sxtl v19.8h, v19.8b
+ add x6, x0, x1
+ lsl x1, x1, #1
+ sxtl v20.8h, v20.8b
+ sxtl v21.8h, v21.8b
+ sxtl v22.8h, v22.8b
+ dup v31.8h, w8
+.if \bpc == 10
+ movi v30.8h, #0
+.endif
+ br x5
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ldur d0, [x2, #2] // top (0-3)
+ sub x2, x2, #4
+ mov x7, #-4
+4:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+.endif
+ smin v2.8h, v2.8h, v31.8h
+ subs w4, w4, #2
+ st1 {v2.d}[0], [x0], x1
+ ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
+ st1 {v2.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ldur q0, [x2, #2] // top (0-7)
+ sub x2, x2, #4
+ mov x7, #-4
+8:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+ smin v2.8h, v2.8h, v31.8h
+ smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0)
+ smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6)
+ smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ sqrshrun v3.4h, v4.4s, #4
+ sqrshrun2 v3.8h, v5.4s, #4
+.endif
+ smin v3.8h, v3.8h, v31.8h
+ subs w4, w4, #2
+ st2 {v2.d, v3.d}[0], [x0], x1
+ zip2 v0.2d, v2.2d, v3.2d
+ st2 {v2.d, v3.d}[1], [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x2, #2
+ sub x2, x2, #4
+ mov x7, #-4
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2)
+2:
+ ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15)
+.if \bpc == 10
+ mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+ smin v3.8h, v3.8h, v31.8h
+ mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ srshr v4.8h, v4.8h, #4
+ smax v4.8h, v4.8h, v30.8h
+ smin v4.8h, v4.8h, v31.8h
+ mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ srshr v5.8h, v5.8h, #4
+ smax v5.8h, v5.8h, v30.8h
+ smin v5.8h, v5.8h, v31.8h
+ mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ srshr v6.8h, v6.8h, #4
+ smax v6.8h, v6.8h, v30.8h
+.else
+ smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0)
+ smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4)
+ smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v3.4h, v3.4s, #4
+ sqrshrun2 v3.8h, v4.4s, #4
+ smin v3.8h, v3.8h, v31.8h
+ smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0)
+ smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6)
+ smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
+ sqrshrun v4.4h, v5.4s, #4
+ sqrshrun2 v4.8h, v6.4s, #4
+ smin v4.8h, v4.8h, v31.8h
+ smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
+ smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
+ smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v5.4h, v24.4s, #4
+ sqrshrun2 v5.8h, v25.4s, #4
+ smin v5.8h, v5.8h, v31.8h
+ smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
+ smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
+ smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ sqrshrun v6.4h, v26.4s, #4
+ sqrshrun2 v6.8h, v27.4s, #4
+.endif
+ smin v6.8h, v6.8h, v31.8h
+
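+        // Carry state over to the next 16-pixel strip: the last top pixel
+        // becomes the new topleft, and the rightmost output column becomes
+        // the new left pixels.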
+ ins v0.h[2], v2.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
+ ins v0.h[0], v6.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
+ ins v0.h[1], v6.h[3]
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x6, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_filter\bpc\()_tbl):
+ .hword L(ipred_filter\bpc\()_tbl) - 320b
+ .hword L(ipred_filter\bpc\()_tbl) - 160b
+ .hword L(ipred_filter\bpc\()_tbl) - 80b
+ .hword L(ipred_filter\bpc\()_tbl) - 40b
+endfunc
+.endm
+
+filter_fn 10
+filter_fn 12
+
+function ipred_filter_16bpc_neon, export=1
+ ldr w8, [sp]
+ cmp w8, 0x3ff
+ b.le ipred_filter_10bpc_neon
+ b ipred_filter_12bpc_neon
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint16_t *const pal, const uint8_t *idx,
+// const int w, const int h);
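+// The palette (up to 8 uint16_t entries) is loaded as a 16-byte table into
+// v30. Each 8-bit index i is expanded to the byte index pair 2*i, 2*i+1
+// (double, zip with itself, add 0x0100 per halfword) so that a single tbl
+// lookup produces the 16-bit output pixels.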
+function pal_pred_16bpc_neon, export=1
+ ld1 {v30.8h}, [x2]
+ clz w9, w4
+ adr x6, L(pal_pred_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x6, w9, uxtw #1]
+ movi v31.8h, #1, lsl #8
+ sub x6, x6, w9, uxtw
+ br x6
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+4:
+ ld1 {v1.16b}, [x3], #16
+ subs w5, w5, #4
+ // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ add v1.16b, v1.16b, v1.16b
+ zip1 v0.16b, v1.16b, v1.16b
+ zip2 v1.16b, v1.16b, v1.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ st1 {v0.d}[0], [x0], x1
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.d}[1], [x2], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x2], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+8:
+ ld1 {v2.16b, v3.16b}, [x3], #32
+ subs w5, w5, #4
+ add v2.16b, v2.16b, v2.16b
+ add v3.16b, v3.16b, v3.16b
+ zip1 v0.16b, v2.16b, v2.16b
+ zip2 v1.16b, v2.16b, v2.16b
+ zip1 v2.16b, v3.16b, v3.16b
+ zip2 v3.16b, v3.16b, v3.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.8h}, [x0], x1
+ tbl v2.16b, {v30.16b}, v2.16b
+ st1 {v1.8h}, [x2], x1
+ tbl v3.16b, {v30.16b}, v3.16b
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x2], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+16:
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+ subs w5, w5, #4
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ st1 {v2.8h, v3.8h}, [x2], x1
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h}, [x0], x1
+ st1 {v6.8h, v7.8h}, [x2], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, x1
+ lsl x1, x1, #1
+32:
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+ subs w5, w5, #2
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x2, x0, #64
+64:
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+ subs w5, w5, #1
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 64b
+ ret
+
+L(pal_pred_tbl):
+ .hword L(pal_pred_tbl) - 640b
+ .hword L(pal_pred_tbl) - 320b
+ .hword L(pal_pred_tbl) - 160b
+ .hword L(pal_pred_tbl) - 80b
+ .hword L(pal_pred_tbl) - 40b
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
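+// The shared splat loops below compute, per pixel,
+//   dst = clamp(dc + ((ac * alpha + sign + 32) >> 6), 0, bitdepth_max)
+// where sign is -1 for negative ac * alpha products, matching the inline
+// comments. For the 128 variant, dc is the mid-grey (bitdepth_max + 1) >> 1.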
+function ipred_cfl_128_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_128_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ urshr v0.8h, v31.8h, #1
+ dup v1.8h, w6 // alpha
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+L(ipred_cfl_splat_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x5], #32
+ subs w4, w4, #4
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v16.4s, v2.4s, #0 // sign
+ cmlt v17.4s, v3.4s, #0
+ cmlt v18.4s, v4.4s, #0
+ cmlt v19.4s, v5.4s, #0
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x6], x1
+ st1 {v3.d}[0], [x0], x1
+ st1 {v3.d}[1], [x6], x1
+ b.gt L(ipred_cfl_splat_w4)
+ ret
+L(ipred_cfl_splat_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x5], #32
+ subs w4, w4, #2
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v16.4s, v2.4s, #0 // sign
+ cmlt v17.4s, v3.4s, #0
+ cmlt v18.4s, v4.4s, #0
+ cmlt v19.4s, v5.4s, #0
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x6], x1
+ b.gt L(ipred_cfl_splat_w8)
+ ret
+L(ipred_cfl_splat_w16):
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x5, w3, uxtw #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+1:
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ ld1 {v4.8h, v5.8h}, [x7], #32
+ subs w3, w3, #16
+ smull v16.4s, v2.4h, v1.4h // diff = ac * alpha
+ smull2 v17.4s, v2.8h, v1.8h
+ smull v18.4s, v3.4h, v1.4h
+ smull2 v19.4s, v3.8h, v1.8h
+ smull v2.4s, v4.4h, v1.4h
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ cmlt v20.4s, v16.4s, #0 // sign
+ cmlt v21.4s, v17.4s, #0
+ cmlt v22.4s, v18.4s, #0
+ cmlt v23.4s, v19.4s, #0
+ cmlt v24.4s, v2.4s, #0
+ cmlt v25.4s, v3.4s, #0
+ cmlt v26.4s, v4.4s, #0
+ cmlt v27.4s, v5.4s, #0
+ add v16.4s, v16.4s, v20.4s // diff + sign
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+ add v19.4s, v19.4s, v23.4s
+ add v2.4s, v2.4s, v24.4s
+ add v3.4s, v3.4s, v25.4s
+ add v4.4s, v4.4s, v26.4s
+ add v5.4s, v5.4s, v27.4s
+ rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ rshrn v6.4h, v2.4s, #6
+ rshrn2 v6.8h, v3.4s, #6
+ rshrn v7.4h, v4.4s, #6
+ rshrn2 v7.8h, v5.4s, #6
+ add v2.8h, v16.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v17.8h, v0.8h
+ add v4.8h, v6.8h, v0.8h
+ add v5.8h, v7.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smax v4.8h, v4.8h, v30.8h
+ smax v5.8h, v5.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], #32
+ st1 {v4.8h, v5.8h}, [x6], #32
+ b.gt 1b
+ subs w4, w4, #2
+ add x5, x5, w9, uxtw #1
+ add x7, x7, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b.gt 1b
+ ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
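+// dc is the rounded average of the top row only: sum the row, round-shift by
+// log2(width), splat it and fall through to the shared splat loops.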
+function ipred_cfl_top_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_top_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ dup v1.8h, w6 // alpha
+ add x2, x2, #2
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v0.4h, v0.4s, #5
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+ .hword L(ipred_cfl_top_tbl) - 32b
+ .hword L(ipred_cfl_top_tbl) - 16b
+ .hword L(ipred_cfl_top_tbl) - 8b
+ .hword L(ipred_cfl_top_tbl) - 4b
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_left_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1
+ clz w9, w3
+ clz w8, w4
+ adr x10, L(ipred_cfl_splat_tbl)
+ adr x7, L(ipred_cfl_left_tbl)
+ sub w9, w9, #26
+ sub w8, w8, #26
+ ldrh w9, [x10, w9, uxtw #1]
+ ldrh w8, [x7, w8, uxtw #1]
+ dup v1.8h, w6 // alpha
+ sub x9, x10, w9, uxtw
+ sub x7, x7, w8, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_left_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v0.4h, v0.4s, #5
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_tbl):
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1
+ add w8, w3, w4 // width + height
+ dup v1.8h, w6 // alpha
+ clz w9, w3
+ clz w6, w4
+ dup v16.4s, w8 // width + height
+ adr x7, L(ipred_cfl_tbl)
+ rbit w8, w8 // rbit(width + height)
+ sub w9, w9, #22 // 26 leading bits, minus table offset 4
+ sub w6, w6, #26
+ clz w8, w8 // ctz(width + height)
+ ldrh w9, [x7, w9, uxtw #1]
+ ldrh w6, [x7, w6, uxtw #1]
+ neg w8, w8 // -ctz(width + height)
+ sub x9, x7, w9, uxtw
+ sub x7, x7, w6, uxtw
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w8 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_h4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h
+ add x2, x2, #2
+ br x9
+L(ipred_cfl_w4):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s2, v2.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
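+        // w + h is 12 or 20 here; the ushl above only divided by the
+        // power-of-two factor, so finish the division by 3 or 5 with a
+        // reciprocal multiply: 0xAAAB ~= 2^17/3, 0x6667 ~= 2^17/5.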
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h
+ add x2, x2, #2
+ br x9
+L(ipred_cfl_w8):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s2, v2.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ addp v0.8h, v2.8h, v3.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x9
+L(ipred_cfl_w16):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v2.8h, v2.8h, v3.8h
+ uaddlv s2, v2.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/8/32
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ add x2, x2, #2
+ uaddlv s0, v0.8h
+ br x9
+L(ipred_cfl_w32):
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ add v0.4s, v0.4s, v16.4s
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v2.8h, v2.8h, v4.8h
+ cmp w4, #32
+ uaddlv s2, v2.8h
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
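+// The cfl_ac functions copy the (sub)sampled luma into the ac buffer, scaled
+// so that every layout stores luma * 8: 4:2:0 sums 2x2 pixels and shifts by
+// 1, 4:2:2 sums horizontal pairs and shifts by 2, 4:4:4 shifts single pixels
+// by 3. Running sums are accumulated in v24-v27, and the rounded average
+// (right shift by log2sz) is subtracted from the whole buffer at the end.
+// The 422/444 variants branch back into the shared 420 padding and
+// subtract-dc tails.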
+function ipred_cfl_ac_420_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_420_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_420_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h}, [x0], #16
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ b.gt 1b
+ trn2 v1.2d, v0.2d, v0.2d
+ trn2 v0.2d, v0.2d, v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ add v24.4s, v24.4s, v25.4s
+ add v26.4s, v26.4s, v27.4s
+ add v0.4s, v24.4s, v26.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #3
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v4.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ mov v0.16b, v1.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.4h, v0.h[3]
+ dup v3.4h, v0.h[7]
+ trn2 v2.2d, v0.2d, v0.2d
+ subs w8, w8, #2
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw v25.4s, v25.4s, v1.4h
+ uaddw v26.4s, v26.4s, v2.4h
+ uaddw v27.4s, v27.4s, v3.4h
+ b.gt 1b
+ trn1 v0.2d, v2.2d, v3.2d
+ trn1 v1.2d, v2.2d, v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_420_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2
+ add v0.8h, v0.8h, v4.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
+ add v2.8h, v2.8h, v6.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v18.8h, v18.8h, v19.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ add v16.8h, v16.8h, v20.8h
+ add v18.8h, v18.8h, v22.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v2.8h, #1
+ shl v2.8h, v16.8h, #1
+ shl v3.8h, v18.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32]
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ldr q5, [x10, #32]
+ ld1 {v3.8h, v4.8h}, [x10], x2
+ addp v2.8h, v2.8h, v2.8h
+ addp v0.8h, v0.8h, v1.8h
+ addp v5.8h, v5.8h, v5.8h
+ addp v3.8h, v3.8h, v4.8h
+ ldr q18, [x1, #32]
+ add v2.4h, v2.4h, v5.4h
+ ld1 {v16.8h, v17.8h}, [x1], x2
+ add v0.8h, v0.8h, v3.8h
+ ldr q21, [x10, #32]
+ ld1 {v19.8h, v20.8h}, [x10], x2
+ addp v18.8h, v18.8h, v18.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v21.8h, v21.8h, v21.8h
+ addp v19.8h, v19.8h, v20.8h
+ add v18.4h, v18.4h, v21.4h
+ add v16.8h, v16.8h, v19.8h
+ shl v1.4h, v2.4h, #1
+ shl v0.8h, v0.8h, #1
+ shl v3.4h, v18.4h, #1
+ shl v2.8h, v16.8h, #1
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v2.8h, v4.8h, #1
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ ld1 {v4.8h}, [x1], x2
+ ld1 {v6.8h}, [x10], x2
+ addp v0.8h, v0.8h, v4.8h
+ addp v2.8h, v2.8h, v6.8h
+ add v0.8h, v0.8h, v2.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v0.h[7]
+ trn2 v2.2d, v0.2d, v3.2d
+ trn1 v0.2d, v0.2d, v1.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+
+L(ipred_cfl_ac_420_w16_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+ .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_422_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_422_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v0.h[3]
+ dup v5.8h, v0.h[7]
+ dup v6.4h, v2.h[3]
+ dup v7.8h, v2.h[7]
+ trn2 v1.2d, v0.2d, v5.2d
+ trn1 v0.2d, v0.2d, v4.2d
+ trn2 v3.2d, v2.2d, v7.2d
+ trn1 v2.2d, v2.2d, v6.2d
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_422_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32]
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ldr q6, [x10, #32]
+ ld1 {v4.8h, v5.8h}, [x10], x2
+ addp v2.8h, v2.8h, v2.8h
+ addp v0.8h, v0.8h, v1.8h
+ addp v6.8h, v6.8h, v6.8h
+ addp v4.8h, v4.8h, v5.8h
+ shl v1.4h, v2.4h, #2
+ shl v0.8h, v0.8h, #2
+ shl v3.4h, v6.4h, #2
+ shl v2.8h, v4.8h, #2
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ addp v0.8h, v0.8h, v0.8h
+ addp v2.8h, v2.8h, v2.8h
+ shl v0.4h, v0.4h, #2
+ shl v2.4h, v2.4h, #2
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v2.h[3]
+ trn1 v0.2d, v0.2d, v1.2d
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+ .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
+
+// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_444_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_444_tbl)
+ sub w8, w8, #26
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_444_w4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.4h}, [x1], x2
+ ld1 {v0.d}[1], [x10], x2
+ ld1 {v1.4h}, [x1], x2
+ ld1 {v1.d}[1], [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_444_w8):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ ld1 {v3.8h}, [x10], x2
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_444_w16):
+ AARCH64_VALID_JUMP_TARGET
+ cbnz w3, L(ipred_cfl_ac_444_w16_wpad)
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w16_wpad):
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ shl v0.8h, v0.8h, #3
+ shl v2.8h, v2.8h, #3
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_444_w32):
+ AARCH64_VALID_JUMP_TARGET
+ adr x7, L(ipred_cfl_ac_444_w32_tbl)
+ ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ lsr x2, x2, #1 // Restore the stride to one line increments
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_444_w32_wpad0):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ shl v2.8h, v2.8h, #3
+ shl v3.8h, v3.8h, #3
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad2):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 8
+ ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2
+ shl v2.8h, v2.8h, #3
+ shl v0.8h, v0.8h, #3
+ shl v1.8h, v1.8h, #3
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad4):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 16
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ shl v1.8h, v1.8h, #3
+ shl v0.8h, v0.8h, #3
+ dup v2.8h, v1.h[7]
+ dup v3.8h, v1.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ b L(ipred_cfl_ac_444_w32_hpad)
+
+L(ipred_cfl_ac_444_w32_wpad6):
+ AARCH64_VALID_JUMP_TARGET
+1: // Copy and expand input, padding 24
+ ld1 {v0.8h}, [x1], x2
+ shl v0.8h, v0.8h, #3
+ dup v1.8h, v0.h[7]
+ dup v2.8h, v0.h[7]
+ dup v3.8h, v0.h[7]
+ subs w8, w8, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+
+L(ipred_cfl_ac_444_w32_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Multiply the height by eight and reuse the w4 subtracting
+ lsl w6, w6, #3
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_444_tbl):
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
+ .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+
+L(ipred_cfl_ac_444_w32_tbl):
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
+ .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/itx.S b/third_party/dav1d/src/arm/64/itx.S
new file mode 100644
index 0000000000..b1b2f8fe65
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -0,0 +1,3270 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
+
+// Potential further optimizations that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+// transform functions. (The register layout is designed to potentially
+// allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+// we know a significant number of inputs are zero. E.g. if the eob value
+// indicates only a quarter of input values are set, for idct16 and up,
+// a significant amount of calculation can be skipped, at the cost of more
+// code duplication and special casing.
+
+const idct_coeffs, align=4
+ // idct4
+ .short 2896, 2896*8, 1567, 3784
+ // idct8
+ .short 799, 4017, 3406, 2276
+ // idct16
+ .short 401, 4076, 3166, 2598
+ .short 1931, 3612, 3920, 1189
+ // idct32
+ .short 201, 4091, 3035, 2751
+ .short 1751, 3703, 3857, 1380
+ .short 995, 3973, 3513, 2106
+ .short 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .short 101*8, 4095*8, 2967*8, -2824*8
+ .short 1660*8, 3745*8, 3822*8, -1474*8
+ .short 4076, 401, 4017, 799
+ .short 0, 0, 0, 0
+
+ .short 4036*8, -700*8, 2359*8, 3349*8
+ .short 3461*8, -2191*8, 897*8, 3996*8
+ .short -3166, -2598, -799, -4017
+ .short 0, 0, 0, 0
+
+ .short 501*8, 4065*8, 3229*8, -2520*8
+ .short 2019*8, 3564*8, 3948*8, -1092*8
+ .short 3612, 1931, 2276, 3406
+ .short 0, 0, 0, 0
+
+ .short 4085*8, -301*8, 2675*8, 3102*8
+ .short 3659*8, -1842*8, 1285*8, 3889*8
+ .short -3920, -1189, -3406, -2276
+ .short 0, 0, 0, 0
+endconst
+
+const iadst4_coeffs, align=4
+ // .h[4-5] can be interpreted as .s[2]
+ .short 1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4
+ .short 4076, 401, 3612, 1931
+ .short 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .short 2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4
+ .short 4091, 201, 3973, 995
+ .short 3703, 1751, 3290, 2440
+ .short 2751, 3035, 2106, 3513
+ .short 1380, 3857, 601, 4052
+endconst
+
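+// Butterfly helpers: d0 = s0*c0 +/- s1*c1 with the products widened to
+// 32 bits; for the .8h size, d1 gets the same combination of the high
+// halves of the inputs.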
+.macro smull_smlal d0, d1, s0, s1, c0, c1, sz
+ smull \d0\().4s, \s0\().4h, \c0
+ smlal \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+ smull2 \d1\().4s, \s0\().8h, \c0
+ smlal2 \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
+.macro smull_smlsl d0, d1, s0, s1, c0, c1, sz
+ smull \d0\().4s, \s0\().4h, \c0
+ smlsl \d0\().4s, \s1\().4h, \c1
+.ifc \sz, .8h
+ smull2 \d1\().4s, \s0\().8h, \c0
+ smlsl2 \d1\().4s, \s1\().8h, \c1
+.endif
+.endm
+
+.macro sqrshrn_sz d0, s0, s1, shift, sz
+ sqrshrn \d0\().4h, \s0\().4s, \shift
+.ifc \sz, .8h
+ sqrshrn2 \d0\().8h, \s1\().4s, \shift
+.endif
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+ sqrdmulh \r0\sz, \r0\sz, \c
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
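+// load_add_store runs a short software pipeline: each invocation loads one
+// destination row, shifts one coefficient row, adds a previously loaded row,
+// narrows an earlier sum and stores an even earlier one, so that loads and
+// stores overlap with the arithmetic.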
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4
+.ifnb \load
+ ld1 {\load}, [\src], x1
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ uaddw \adddst, \adddst, \addsrc
+.endif
+.ifnb \narrowsrc
+ sqxtun \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+ mov \src, \dst
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src
+ load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src
+ load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src
+ load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src
+ load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src
+ load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src
+ load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src
+ load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store , , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store , , , , , , v5.8b, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits
+ load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+ load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits
+ load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits
+ load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits
+ load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits
+ load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits
+ load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits
+ load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src
+ mov \src, \dst
+ load_add_store v2.8b, v16.8h, , , , , , \dst, \src
+ load_add_store v3.8b, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src
+ load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src
+ load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src
+ load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src
+ load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src
+ load_add_store , , , , , , v5.8b, \dst, \src
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src
+.ifnb \load
+ ld1 {\load}[0], [\src], x1
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0]
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1
+.endif
+.ifnb \addsrc
+ uaddw \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1
+.endif
+.ifnb \narrowsrc
+ sqxtun \narrowdst, \narrowsrc
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src
+ load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src
+ load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src
+ load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+ load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+ load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src
+ load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src
+ load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src
+ load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src
+ load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src
+ load_add_store4 , , , , , , , , v7.s, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src
+ load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src
+ load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src
+ load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src
+ load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src
+ load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src
+ load_add_store4 , , , , , , , , v3.s, \dst, \src
+.endm
+
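+ // DC-only fast path: if eob (w3) is zero, only the DC coefficient is
+ // present. Scale it by 2896/4096 once per 1-D pass (plus once more for
+ // 2:1 rectangular blocks), apply the intermediate and final rounding
+ // shifts, and branch to the width-specific add loop below.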
+.macro idct_dc w, h, shift
+ cbnz w3, 1f
+ mov w16, #2896*8
+ ld1r {v16.8h}, [x2]
+ dup v0.4h, w16
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+ strh wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+.endif
+.if \shift > 0
+ srshr v16.8h, v16.8h, #\shift
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[0]
+ srshr v16.8h, v16.8h, #4
+ mov w4, #\h
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+1:
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ subs w4, w4, #4
+ sub x0, x0, x1, lsl #2
+ uaddw v0.8h, v16.8h, v0.8b
+ sqxtun v0.8b, v0.8h
+ uaddw v1.8h, v16.8h, v1.8b
+ st1 {v0.s}[0], [x0], x1
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[1], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w8_neon
+1:
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ uaddw v20.8h, v16.8h, v0.8b
+ ld1 {v3.8b}, [x0], x1
+ sub x0, x0, x1, lsl #2
+ subs w4, w4, #4
+ uaddw v21.8h, v16.8h, v1.8b
+ sqxtun v0.8b, v20.8h
+ uaddw v22.8h, v16.8h, v2.8b
+ sqxtun v1.8b, v21.8h
+ uaddw v23.8h, v16.8h, v3.8b
+ st1 {v0.8b}, [x0], x1
+ sqxtun v2.8b, v22.8h
+ st1 {v1.8b}, [x0], x1
+ sqxtun v3.8b, v23.8h
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w16_neon
+1:
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x0], x1
+ ld1 {v2.16b}, [x0], x1
+ subs w4, w4, #4
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ ld1 {v3.16b}, [x0], x1
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ sub x0, x0, x1, lsl #2
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ st1 {v0.16b}, [x0], x1
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v1.16b}, [x0], x1
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w32_neon
+1:
+ ld1 {v0.16b, v1.16b}, [x0], x1
+ subs w4, w4, #2
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ ld1 {v2.16b, v3.16b}, [x0]
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ sub x0, x0, x1
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ st1 {v0.16b, v1.16b}, [x0], x1
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v2.16b, v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w64_neon
+1:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+ subs w4, w4, #1
+ uaddw v20.8h, v16.8h, v0.8b
+ uaddw2 v21.8h, v16.8h, v0.16b
+ uaddw v22.8h, v16.8h, v1.8b
+ uaddw2 v23.8h, v16.8h, v1.16b
+ uaddw v24.8h, v16.8h, v2.8b
+ uaddw2 v25.8h, v16.8h, v2.16b
+ sqxtun v0.8b, v20.8h
+ sqxtun2 v0.16b, v21.8h
+ uaddw v26.8h, v16.8h, v3.8b
+ uaddw2 v27.8h, v16.8h, v3.16b
+ sqxtun v1.8b, v22.8h
+ sqxtun2 v1.16b, v23.8h
+ sqxtun v2.8b, v24.8h
+ sqxtun2 v2.16b, v25.8h
+ sqxtun v3.8b, v26.8h
+ sqxtun2 v3.16b, v27.8h
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
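+ // 4-point inverse Walsh-Hadamard transform (lossless path), operating in
+ // place on v16-v19.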
+.macro iwht4
+ add v16.4h, v16.4h, v17.4h
+ sub v21.4h, v18.4h, v19.4h
+ sub v20.4h, v16.4h, v21.4h
+ sshr v20.4h, v20.4h, #1
+ sub v18.4h, v20.4h, v17.4h
+ sub v17.4h, v20.4h, v19.4h
+ add v19.4h, v21.4h, v18.4h
+ sub v16.4h, v16.4h, v17.4h
+.endm
+
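+ // 4-point inverse DCT over \r0-\r3. smull_smlal/smull_smlsl and
+ // sqrshrn_sz are helper macros defined earlier in this file that do the
+ // paired 32-bit multiply-accumulates and the rounding narrow by 12 bits
+ // for either 4 or 8 lanes, selected by \sz.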
+.macro idct_4 r0, r1, r2, r3, sz
+ smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz
+ smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz
+ smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz
+ sqrshrn_sz v6, v6, v7, #12, \sz
+ sqrshrn_sz v7, v4, v5, #12, \sz
+ smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz
+ sqrshrn_sz v2, v2, v3, #12, \sz
+ sqrshrn_sz v3, v4, v5, #12, \sz
+ sqadd \r0\sz, v2\sz, v6\sz
+ sqsub \r3\sz, v2\sz, v6\sz
+ sqadd \r1\sz, v3\sz, v7\sz
+ sqsub \r2\sz, v3\sz, v7\sz
+.endm
+
+function inv_dct_4h_x4_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16]
+ idct_4 v16, v17, v18, v19, .4h
+ ret
+endfunc
+
+function inv_dct_8h_x4_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16]
+ idct_4 v16, v17, v18, v19, .8h
+ ret
+endfunc
+
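+ // 4-point inverse ADST producing \o0-\o3; the flipadst variants below
+ // pass the output registers in reverse order.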
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.8h}, [x16]
+
+ ssubl v3.4s, v16.4h, v18.4h
+ smull v4.4s, v16.4h, v0.h[0]
+ smlal v4.4s, v18.4h, v0.h[1]
+ smlal v4.4s, v19.4h, v0.h[2]
+ smull v7.4s, v17.4h, v0.h[3]
+ saddw v3.4s, v3.4s, v19.4h
+ smull v5.4s, v16.4h, v0.h[2]
+ smlsl v5.4s, v18.4h, v0.h[0]
+ smlsl v5.4s, v19.4h, v0.h[1]
+
+ add \o3\().4s, v4.4s, v5.4s
+ mul \o2\().4s, v3.4s, v0.s[2]
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ sqrshrn \o0\().4h, \o0\().4s, #12
+ sqrshrn \o2\().4h, \o2\().4s, #12
+ sqrshrn \o1\().4h, \o1\().4s, #12
+ sqrshrn \o3\().4h, \o3\().4s, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1
+ iadst_4x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1
+ iadst_4x4 v19, v18, v17, v16
+ ret
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.8h}, [x16]
+
+ ssubl v2.4s, v16.4h, v18.4h
+ ssubl2 v3.4s, v16.8h, v18.8h
+ smull v4.4s, v16.4h, v0.h[0]
+ smlal v4.4s, v18.4h, v0.h[1]
+ smlal v4.4s, v19.4h, v0.h[2]
+ smull2 v5.4s, v16.8h, v0.h[0]
+ smlal2 v5.4s, v18.8h, v0.h[1]
+ smlal2 v5.4s, v19.8h, v0.h[2]
+ saddw v2.4s, v2.4s, v19.4h
+ saddw2 v3.4s, v3.4s, v19.8h
+ smull v6.4s, v16.4h, v0.h[2]
+ smlsl v6.4s, v18.4h, v0.h[0]
+ smlsl v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v16.8h, v0.h[2]
+ smlsl2 v7.4s, v18.8h, v0.h[0]
+ smlsl2 v7.4s, v19.8h, v0.h[1]
+
+ mul v18.4s, v2.4s, v0.s[2]
+ mul v19.4s, v3.4s, v0.s[2]
+
+ smull v2.4s, v17.4h, v0.h[3]
+ smull2 v3.4s, v17.8h, v0.h[3]
+
+ add v16.4s, v4.4s, v2.4s // out0
+ add v17.4s, v5.4s, v3.4s
+
+ add v4.4s, v4.4s, v6.4s // out3
+ add v5.4s, v5.4s, v7.4s
+
+ add v6.4s, v6.4s, v2.4s // out1
+ add v7.4s, v7.4s, v3.4s
+
+ sub v4.4s, v4.4s, v2.4s // out3
+ sub v5.4s, v5.4s, v3.4s
+
+ sqrshrn v18.4h, v18.4s, #12
+ sqrshrn2 v18.8h, v19.4s, #12
+
+ sqrshrn \o0\().4h, v16.4s, #12
+ sqrshrn2 \o0\().8h, v17.4s, #12
+
+.ifc \o2, v17
+ mov v17.16b, v18.16b
+.endif
+
+ sqrshrn \o1\().4h, v6.4s, #12
+ sqrshrn2 \o1\().8h, v7.4s, #12
+
+ sqrshrn \o3\().4h, v4.4s, #12
+ sqrshrn2 \o3\().8h, v5.4s, #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1
+ iadst_8x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1
+ iadst_8x4 v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4h_x4_neon, export=1
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ sqrdmulh v4.4h, v16.4h, v0.h[0]
+ sqrdmulh v5.4h, v17.4h, v0.h[0]
+ sqrdmulh v6.4h, v18.4h, v0.h[0]
+ sqrdmulh v7.4h, v19.4h, v0.h[0]
+ sqadd v16.4h, v16.4h, v4.4h
+ sqadd v17.4h, v17.4h, v5.4h
+ sqadd v18.4h, v18.4h, v6.4h
+ sqadd v19.4h, v19.4h, v7.4h
+ ret
+endfunc
+
+function inv_identity_8h_x4_neon, export=1
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ sqrdmulh v4.8h, v16.8h, v0.h[0]
+ sqrdmulh v5.8h, v17.8h, v0.h[0]
+ sqrdmulh v6.8h, v18.8h, v0.h[0]
+ sqrdmulh v7.8h, v19.8h, v0.h[0]
+ sqadd v16.8h, v16.8h, v4.8h
+ sqadd v17.8h, v17.8h, v5.8h
+ sqadd v18.8h, v18.8h, v6.8h
+ sqadd v19.8h, v19.8h, v7.8h
+ ret
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c
+.irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h
+ sqrdmulh v2.8h, \i, \c
+ srhadd \i, \i, v2.8h
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
+ mov x15, x30
+ movi v31.8h, #0
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v31.8h}, [x2], #16
+
+ sshr v16.4h, v16.4h, #2
+ sshr v17.4h, v17.4h, #2
+ sshr v18.4h, v18.4h, #2
+ sshr v19.4h, v19.4h, #2
+
+ iwht4
+
+ st1 {v31.8h}, [x2], #16
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4
+
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+
+ b L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+ movi v31.8h, #0
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v31.8h}, [x2], #16
+
+ blr x4
+
+ st1 {v31.8h}, [x2], #16
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5
+
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end):
+ sub x0, x0, x1, lsl #2
+ uaddw v16.8h, v16.8h, v0.8b
+ sqxtun v0.8b, v16.8h
+ uaddw v18.8h, v18.8h, v1.8b
+ st1 {v0.s}[0], [x0], x1
+ sqxtun v1.8b, v18.8h
+ st1 {v0.s}[1], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+
+ ret x15
+endfunc
+
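+ // 4x4 entry points: x4 and x5 hold the first- and second-pass 1-D
+ // transform functions; the dct_dct case takes a DC-only shortcut when
+ // eob (w3) is zero.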
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f
+ mov w16, #2896*8
+ ld1r {v16.8h}, [x2]
+ dup v4.8h, w16
+ strh wzr, [x2]
+ sqrdmulh v16.8h, v16.8h, v4.h[0]
+ ld1 {v0.s}[0], [x0], x1
+ sqrdmulh v20.8h, v16.8h, v4.h[0]
+ ld1 {v0.s}[1], [x0], x1
+ srshr v16.8h, v20.8h, #4
+ ld1 {v1.s}[0], [x0], x1
+ srshr v18.8h, v20.8h, #4
+ ld1 {v1.s}[1], [x0], x1
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4h_x4_neon
+ adr x5, inv_\txfm2\()_4h_x4_neon
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
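+ // 8-point inverse DCT: idct_4 on the even-indexed inputs plus a
+ // butterfly stage for the odd-indexed inputs (t4a-t7a).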
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb
+ idct_4 \r0, \r2, \r4, \r6, \sz
+
+ smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a
+ smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a
+ smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a
+ sqrshrn_sz \r1, v2, v3, #12, \sz // t4a
+ sqrshrn_sz \r7, v4, v5, #12, \sz // t7a
+ smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a
+ sqrshrn_sz \r3, v6, v7, #12, \sz // t5a
+ sqrshrn_sz \r5, v2, v3, #12, \sz // t6a
+
+ sqadd v2\sz, \r1\sz, \r3\sz // t4
+ sqsub \r1\sz, \r1\sz, \r3\sz // t5a
+ sqadd v3\sz, \r7\sz, \r5\sz // t7
+ sqsub \r3\sz, \r7\sz, \r5\sz // t6a
+
+ smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
+ smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
+ sqrshrn_sz v4, v4, v5, #12, \sz // t5
+ sqrshrn_sz v5, v6, v7, #12, \sz // t6
+
+ sqsub \r7\sz, \r0\sz, v3\sz // out7
+ sqadd \r0\sz, \r0\sz, v3\sz // out0
+ sqadd \r1\sz, \r2\sz, v5\sz // out1
+ sqsub v6\sz, \r2\sz, v5\sz // out6
+ sqadd \r2\sz, \r4\sz, v4\sz // out2
+ sqsub \r5\sz, \r4\sz, v4\sz // out5
+ sqadd \r3\sz, \r6\sz, v2\sz // out3
+ sqsub \r4\sz, \r6\sz, v2\sz // out4
+ mov \r6\szb, v6\szb // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
+ ret
+endfunc
+
+function inv_dct_4h_x8_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
+ ret
+endfunc
+
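+ // 8-point inverse ADST producing \o0-\o7 (reversed for flipadst).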
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz
+ movrel x16, iadst8_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+
+ smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz
+ smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz
+ smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t1a
+ smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz
+ smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2a
+ sqrshrn_sz v21, v2, v3, #12, \sz // t3a
+ smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz
+ smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v19, v6, v7, #12, \sz // t5a
+ smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v17, v4, v5, #12, \sz // t7a
+
+ sqadd v2\sz, v16\sz, v20\sz // t0
+ sqsub v3\sz, v16\sz, v20\sz // t4
+ sqadd v4\sz, v23\sz, v19\sz // t1
+ sqsub v5\sz, v23\sz, v19\sz // t5
+ sqadd v6\sz, v18\sz, v22\sz // t2
+ sqsub v7\sz, v18\sz, v22\sz // t6
+ sqadd v18\sz, v21\sz, v17\sz // t3
+ sqsub v19\sz, v21\sz, v17\sz // t7
+
+ smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz
+ smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz
+ smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz
+
+ sqrshrn_sz v3, v16, v17, #12, \sz // t4a
+ sqrshrn_sz v5, v20, v21, #12, \sz // t5a
+
+ smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz
+
+ sqrshrn_sz v7, v22, v23, #12, \sz // t6a
+ sqrshrn_sz v19, v16, v17, #12, \sz // t7a
+
+ sqadd \o0\()\sz, v2\sz, v6\sz // out0
+ sqsub v2\sz, v2\sz, v6\sz // t2
+ sqadd \o7\()\sz, v4\sz, v18\sz // out7
+ sqsub v4\sz, v4\sz, v18\sz // t3
+ sqneg \o7\()\sz, \o7\()\sz // out7
+
+ sqadd \o1\()\sz, v3\sz, v7\sz // out1
+ sqsub v3\sz, v3\sz, v7\sz // t6
+ sqadd \o6\()\sz, v5\sz, v19\sz // out6
+ sqsub v5\sz, v5\sz, v19\sz // t7
+ sqneg \o1\()\sz, \o1\()\sz // out1
+
+ smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
+ smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
+ smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
+ sqrshrn_sz v2, v18, v19, #12, \sz // out3
+ smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
+ sqrshrn_sz v3, v20, v21, #12, \sz // out5
+ sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21)
+ sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
+
+ sqneg \o3\()\sz, v2\sz // out3
+ sqneg \o5\()\sz, v3\sz // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h
+ ret
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h
+ ret
+endfunc
+
+function inv_adst_4h_x8_neon, export=1
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h
+ ret
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h
+ ret
+endfunc
+
+function inv_identity_8h_x8_neon, export=1
+ sqshl v16.8h, v16.8h, #1
+ sqshl v17.8h, v17.8h, #1
+ sqshl v18.8h, v18.8h, #1
+ sqshl v19.8h, v19.8h, #1
+ sqshl v20.8h, v20.8h, #1
+ sqshl v21.8h, v21.8h, #1
+ sqshl v22.8h, v22.8h, #1
+ sqshl v23.8h, v23.8h, #1
+ ret
+endfunc
+
+function inv_identity_4h_x8_neon, export=1
+ sqshl v16.4h, v16.4h, #1
+ sqshl v17.4h, v17.4h, #1
+ sqshl v18.4h, v18.4h, #1
+ sqshl v19.4h, v19.4h, #1
+ sqshl v20.4h, v20.4h, #1
+ sqshl v21.4h, v21.4h, #1
+ sqshl v22.4h, v22.4h, #1
+ sqshl v23.4h, v23.4h, #1
+ ret
+endfunc
+
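+ // 8x8 two-pass add: run the first pass through x4, round the
+ // intermediates down by 1, transpose, run the second pass through x5 and
+ // add to the destination. The identity_ variant skips the first pass
+ // because the identity's shl #1 cancels the srshr #1.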
+.macro def_fn_8x8_base variant
+function inv_txfm_\variant\()add_8x8_neon
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64
+ ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2]
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+ blr x4
+
+ srshr v16.8h, v16.8h, #1
+ srshr v17.8h, v17.8h, #1
+ srshr v18.8h, v18.8h, #1
+ srshr v19.8h, v19.8h, #1
+ srshr v20.8h, v20.8h, #1
+ srshr v21.8h, v21.8h, #1
+ srshr v22.8h, v22.8h, #1
+ srshr v23.8h, v23.8h, #1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+ blr x5
+
+ load_add_store_8x8 x0, x7
+ ret x15
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ adr x5, inv_\txfm2\()_8h_x8_neon
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_8x8_neon
+.else
+ adr x4, inv_\txfm1\()_8h_x8_neon
+ b inv_txfm_add_8x8_neon
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
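+ // 8x4 and 4x8: rectangular blocks pre-scale the coefficients by
+ // 2896/4096 (1/sqrt(2)) before the first pass; no intermediate rounding
+ // shift is needed at these sizes.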
+function inv_txfm_add_8x4_neon
+ movi v30.8h, #0
+ movi v31.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+ ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
+ st1 {v30.8h,v31.8h}, [x2], #32
+ ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2]
+ st1 {v30.8h,v31.8h}, [x2]
+
+ scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x4
+
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5
+
+ load_add_store_8x4 x0, x7
+ ret x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+ movi v28.8h, #0
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+ ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2]
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2]
+
+ scale_input .8h, v0.h[0], v16, v17, v18, v19
+
+ blr x4
+
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ ins v20.d[0], v16.d[1]
+ ins v21.d[0], v17.d[1]
+ ins v22.d[0], v18.d[1]
+ ins v23.d[0], v19.d[1]
+
+ blr x5
+
+ load_add_store_4x8 x0, x7
+ ret x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon
+ adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
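+ // 16-point inverse DCT: idct_8 on the even-indexed inputs followed by
+ // the butterflies for the odd half (t8a-t15a).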
+.macro idct_16 sz, szb
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb
+
+ smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a
+ smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a
+ smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a
+ sqrshrn_sz v17, v2, v3, #12, \sz // t8a
+ sqrshrn_sz v31, v4, v5, #12, \sz // t15a
+ smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a
+ smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a
+ sqrshrn_sz v23, v6, v7, #12, \sz // t9a
+ sqrshrn_sz v25, v2, v3, #12, \sz // t14a
+ smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a
+ smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t13a
+ smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a
+ sqrshrn_sz v19, v2, v3, #12, \sz // t11a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t12a
+
+ sqsub v2\sz, v17\sz, v23\sz // t9
+ sqadd v17\sz, v17\sz, v23\sz // t8
+ sqsub v3\sz, v31\sz, v25\sz // t14
+ sqadd v31\sz, v31\sz, v25\sz // t15
+ sqsub v23\sz, v19\sz, v21\sz // t10
+ sqadd v19\sz, v19\sz, v21\sz // t11
+ sqadd v25\sz, v29\sz, v27\sz // t12
+ sqsub v29\sz, v29\sz, v27\sz // t13
+
+ smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a
+ smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a
+ sqrshrn_sz v21, v4, v5, #12, \sz // t9a
+ sqrshrn_sz v27, v6, v7, #12, \sz // t14a
+
+ smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a
+ smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a
+ sqrshrn_sz v29, v4, v5, #12, \sz // t13a
+ neg v6.4s, v6.4s
+.ifc \sz, .8h
+ neg v7.4s, v7.4s
+.endif
+ sqrshrn_sz v23, v6, v7, #12, \sz // t10a
+
+ sqsub v2\sz, v17\sz, v19\sz // t11a
+ sqadd v17\sz, v17\sz, v19\sz // t8a
+ sqsub v3\sz, v31\sz, v25\sz // t12a
+ sqadd v31\sz, v31\sz, v25\sz // t15a
+ sqadd v19\sz, v21\sz, v23\sz // t9
+ sqsub v21\sz, v21\sz, v23\sz // t10
+ sqsub v25\sz, v27\sz, v29\sz // t13
+ sqadd v27\sz, v27\sz, v29\sz // t14
+
+ smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11
+ smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12
+ smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
+
+ sqrshrn_sz v4, v4, v5, #12, \sz // t11
+ sqrshrn_sz v5, v6, v7, #12, \sz // t12
+ smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a
+ sqrshrn_sz v2, v2, v3, #12, \sz // t10a
+ sqrshrn_sz v3, v6, v7, #12, \sz // t13a
+
+ sqadd v6\sz, v16\sz, v31\sz // out0
+ sqsub v31\sz, v16\sz, v31\sz // out15
+ mov v16\szb, v6\szb
+ sqadd v23\sz, v30\sz, v17\sz // out7
+ sqsub v7\sz, v30\sz, v17\sz // out8
+ sqadd v17\sz, v18\sz, v27\sz // out1
+ sqsub v30\sz, v18\sz, v27\sz // out14
+ sqadd v18\sz, v20\sz, v3\sz // out2
+ sqsub v29\sz, v20\sz, v3\sz // out13
+ sqadd v3\sz, v28\sz, v19\sz // out6
+ sqsub v25\sz, v28\sz, v19\sz // out9
+ sqadd v19\sz, v22\sz, v5\sz // out3
+ sqsub v28\sz, v22\sz, v5\sz // out12
+ sqadd v20\sz, v24\sz, v4\sz // out4
+ sqsub v27\sz, v24\sz, v4\sz // out11
+ sqadd v21\sz, v26\sz, v2\sz // out5
+ sqsub v26\sz, v26\sz, v2\sz // out10
+ mov v24\szb, v7\szb
+ mov v22\szb, v3\szb
+.endm
+
+function inv_dct_8h_x16_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ idct_16 .8h, .16b
+ ret
+endfunc
+
+function inv_dct_4h_x16_neon, export=1
+ movrel x16, idct_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ idct_16 .4h, .8b
+ ret
+endfunc
+
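+ // 16-point inverse ADST producing \o0-\o15 (reversed for flipadst); the
+ // later stages reload idct_coeffs for the shared rotation constants.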
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb
+ movrel x16, iadst16_coeffs
+ ld1 {v0.8h, v1.8h}, [x16]
+ movrel x16, idct_coeffs
+
+ smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0
+ smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1
+ smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2
+ sqrshrn_sz v16, v2, v3, #12, \sz // t0
+ sqrshrn_sz v31, v4, v5, #12, \sz // t1
+ smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3
+ smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4
+ sqrshrn_sz v18, v6, v7, #12, \sz // t2
+ sqrshrn_sz v29, v2, v3, #12, \sz // t3
+ smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5
+ smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6
+ sqrshrn_sz v20, v4, v5, #12, \sz // t4
+ sqrshrn_sz v27, v6, v7, #12, \sz // t5
+ smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7
+ smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8
+ sqrshrn_sz v22, v2, v3, #12, \sz // t6
+ sqrshrn_sz v25, v4, v5, #12, \sz // t7
+ smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9
+ smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10
+ sqrshrn_sz v23, v6, v7, #12, \sz // t8
+ sqrshrn_sz v24, v2, v3, #12, \sz // t9
+ smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11
+ smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12
+ sqrshrn_sz v21, v4, v5, #12, \sz // t10
+ sqrshrn_sz v26, v6, v7, #12, \sz // t11
+ smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13
+ smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14
+ sqrshrn_sz v19, v2, v3, #12, \sz // t12
+ sqrshrn_sz v28, v4, v5, #12, \sz // t13
+ smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15
+ sqrshrn_sz v17, v6, v7, #12, \sz // t14
+ sqrshrn_sz v30, v2, v3, #12, \sz // t15
+
+ ld1 {v0.8h}, [x16]
+
+ sqsub v2\sz, v16\sz, v23\sz // t8a
+ sqadd v16\sz, v16\sz, v23\sz // t0a
+ sqsub v3\sz, v31\sz, v24\sz // t9a
+ sqadd v31\sz, v31\sz, v24\sz // t1a
+ sqadd v23\sz, v18\sz, v21\sz // t2a
+ sqsub v18\sz, v18\sz, v21\sz // t10a
+ sqadd v24\sz, v29\sz, v26\sz // t3a
+ sqsub v29\sz, v29\sz, v26\sz // t11a
+ sqadd v21\sz, v20\sz, v19\sz // t4a
+ sqsub v20\sz, v20\sz, v19\sz // t12a
+ sqadd v26\sz, v27\sz, v28\sz // t5a
+ sqsub v27\sz, v27\sz, v28\sz // t13a
+ sqadd v19\sz, v22\sz, v17\sz // t6a
+ sqsub v22\sz, v22\sz, v17\sz // t14a
+ sqadd v28\sz, v25\sz, v30\sz // t7a
+ sqsub v25\sz, v25\sz, v30\sz // t15a
+
+ smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8
+ smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9
+ smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10
+ sqrshrn_sz v17, v4, v5, #12, \sz // t8
+ sqrshrn_sz v30, v6, v7, #12, \sz // t9
+ smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11
+ smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12
+ sqrshrn_sz v18, v2, v3, #12, \sz // t10
+ sqrshrn_sz v29, v4, v5, #12, \sz // t11
+ smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13
+ smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14
+ sqrshrn_sz v27, v6, v7, #12, \sz // t12
+ sqrshrn_sz v20, v2, v3, #12, \sz // t13
+ smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15
+ sqrshrn_sz v25, v4, v5, #12, \sz // t14
+ sqrshrn_sz v22, v6, v7, #12, \sz // t15
+
+ sqsub v2\sz, v16\sz, v21\sz // t4
+ sqadd v16\sz, v16\sz, v21\sz // t0
+ sqsub v3\sz, v31\sz, v26\sz // t5
+ sqadd v31\sz, v31\sz, v26\sz // t1
+ sqadd v21\sz, v23\sz, v19\sz // t2
+ sqsub v23\sz, v23\sz, v19\sz // t6
+ sqadd v26\sz, v24\sz, v28\sz // t3
+ sqsub v24\sz, v24\sz, v28\sz // t7
+ sqadd v19\sz, v17\sz, v27\sz // t8a
+ sqsub v17\sz, v17\sz, v27\sz // t12a
+ sqadd v28\sz, v30\sz, v20\sz // t9a
+ sqsub v30\sz, v30\sz, v20\sz // t13a
+ sqadd v27\sz, v18\sz, v25\sz // t10a
+ sqsub v18\sz, v18\sz, v25\sz // t14a
+ sqadd v20\sz, v29\sz, v22\sz // t11a
+ sqsub v29\sz, v29\sz, v22\sz // t15a
+
+ smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a
+ smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a
+ smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a
+ sqrshrn_sz v22, v4, v5, #12, \sz // t4a
+ sqrshrn_sz v25, v6, v7, #12, \sz // t5a
+ smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a
+ smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12
+ sqrshrn_sz v24, v2, v3, #12, \sz // t6a
+ sqrshrn_sz v23, v4, v5, #12, \sz // t7a
+ smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13
+ smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14
+ sqrshrn_sz v17, v6, v7, #12, \sz // t12
+ smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15
+ sqrshrn_sz v29, v2, v3, #12, \sz // t13
+ sqrshrn_sz v30, v4, v5, #12, \sz // t14
+ sqrshrn_sz v18, v6, v7, #12, \sz // t15
+
+ sqsub v2\sz, v16\sz, v21\sz // t2a
+.ifc \o0, v16
+ sqadd \o0\sz, v16\sz, v21\sz // out0
+ sqsub v21\sz, v31\sz, v26\sz // t3a
+ sqadd \o15\sz, v31\sz, v26\sz // out15
+.else
+ sqadd v4\sz, v16\sz, v21\sz // out0
+ sqsub v21\sz, v31\sz, v26\sz // t3a
+ sqadd \o15\sz, v31\sz, v26\sz // out15
+ mov \o0\szb, v4\szb
+.endif
+ sqneg \o15\sz, \o15\sz // out15
+
+ sqsub v3\sz, v29\sz, v18\sz // t15a
+ sqadd \o13\sz, v29\sz, v18\sz // out13
+ sqadd \o2\sz, v17\sz, v30\sz // out2
+ sqsub v26\sz, v17\sz, v30\sz // t14a
+ sqneg \o13\sz, \o13\sz // out13
+
+ sqadd \o1\sz, v19\sz, v27\sz // out1
+ sqsub v27\sz, v19\sz, v27\sz // t10
+ sqadd \o14\sz, v28\sz, v20\sz // out14
+ sqsub v20\sz, v28\sz, v20\sz // t11
+ sqneg \o1\sz, \o1\sz // out1
+
+ sqadd \o3\sz, v22\sz, v24\sz // out3
+ sqsub v22\sz, v22\sz, v24\sz // t6
+ sqadd \o12\sz, v25\sz, v23\sz // out12
+ sqsub v23\sz, v25\sz, v23\sz // t7
+ sqneg \o3\sz, \o3\sz // out3
+
+ smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
+ smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
+ smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
+
+ sqrshrn_sz v24, v24, v25, #12, \sz // out8
+ sqrshrn_sz v4, v4, v5, #12, \sz // out7
+ sqrshrn_sz v5, v6, v7, #12, \sz // out5
+ smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
+ smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
+ sqrshrn_sz v26, v6, v7, #12, \sz // out10
+
+ smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
+ smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
+ smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
+
+ sqrshrn_sz \o4, v2, v3, #12, \sz // out4
+ sqrshrn_sz v6, v6, v7, #12, \sz // out11
+ sqrshrn_sz v7, v21, v25, #12, \sz // out9
+ sqrshrn_sz \o6, v22, v23, #12, \sz // out6
+
+.ifc \o8, v23
+ mov \o8\szb, v24\szb
+ mov \o10\szb, v26\szb
+.endif
+
+ sqneg \o7\sz, v4\sz // out7
+ sqneg \o5\sz, v5\sz // out5
+ sqneg \o11\sz, v6\sz // out11
+ sqneg \o9\sz, v7\sz // out9
+.endm
+
+function inv_adst_8h_x16_neon, export=1
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
+ ret
+endfunc
+
+function inv_flipadst_8h_x16_neon, export=1
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
+ ret
+endfunc
+
+function inv_adst_4h_x16_neon, export=1
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
+ ret
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
+ ret
+endfunc
+
+function inv_identity_8h_x16_neon, export=1
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.8h, v\i\().8h, v0.h[0]
+ sqadd v\i\().8h, v\i\().8h, v\i\().8h
+ sqadd v\i\().8h, v\i\().8h, v2.8h
+.endr
+ ret
+endfunc
+
+function inv_identity_4h_x16_neon, export=1
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.4h, v\i\().4h, v0.h[0]
+ sqadd v\i\().4h, v\i\().4h, v\i\().4h
+ sqadd v\i\().4h, v\i\().4h, v2.4h
+.endr
+ ret
+endfunc
+
+.macro identity_8x16_shift2 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ sshr v2.8h, v2.8h, #1
+ srhadd \i, \i, v2.8h
+.endr
+.endm
+
+.macro identity_8x16_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ sqrdmulh v2.8h, \i, \c
+ srshr v2.8h, v2.8h, #1
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
+.macro identity_8x8_shift1 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c
+ srshr v2.8h, v2.8h, #1
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
+.macro identity_8x8 c
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ sqrdmulh v2.8h, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v2.8h
+.endr
+.endm
+
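+ // Horizontal (first-pass) helper for a 16x8 slice: load 16 vectors from
+ // the coefficient buffer at x7 (clearing it as it goes), optionally
+ // scale by 2896/4096 or apply the identity transform, round by \shift,
+ // transpose, and store the result contiguously at x6.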
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x8_neon
+ AARCH64_VALID_CALL_TARGET
+ mov x14, x30
+ movi v7.8h, #0
+.if \identity
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.elseif \scale
+ mov w16, #2896*8
+ dup v0.4h, w16
+.endif
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+.if \scale
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+.if \identity
+ identity_8x16_shift2 v0.h[0]
+.else
+ blr x4
+.endif
+.if \shift > 0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ srshr \i, \i, #\shift
+.endr
+.endif
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ ret x14
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ ret x14
+endfunc
+
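+ // 16x16 driver: the horizontal passes (via x9) fill a 512-byte stack
+ // buffer; the second half is zeroed instead of computed when eob (w3) is
+ // below the threshold in w13. Two vertical 8x16 passes then add the
+ // result to the destination.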
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+.irp i, 0, 8
+ add x6, sp, #(\i*16*2)
+.if \i == 8
+ cmp w3, w13
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #16*2
+ blr x9
+.endr
+ b 2f
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+2:
+.irp i, 0, 8
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
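+ // \eob_half is the eob threshold below which only the first half of the
+ // horizontal pass needs to be computed.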
+.macro def_fn_16x16 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+.ifc \txfm1, identity
+ adr x9, inv_txfm_horz_identity_16x8_neon
+.else
+ adr x9, inv_txfm_horz_16x8_neon
+ adr x4, inv_\txfm1\()_8h_x16_neon
+.endif
+ adr x5, inv_\txfm2\()_8h_x16_neon
+ mov x13, #\eob_half
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+ mov x15, x30
+ movi v4.8h, #0
+
+.ifc \variant, identity_
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v16.d, v17.d, v18.d, v19.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+.irp i, v20.d, v21.d, v22.d, v23.d
+ ld1 {\i}[1], [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+
+ identity_8x16_shift1 v0.h[0]
+.else
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ ld1 {\i}, [x2]
+ st1 {v4.4h}, [x2], #8
+.endr
+
+ blr x4
+
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+.ifc \variant, identity_
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+.else
+ ins v24.d[1], v28.d[0]
+ ins v25.d[1], v29.d[0]
+ ins v26.d[1], v30.d[0]
+ ins v27.d[1], v31.d[0]
+ srshr v16.8h, v24.8h, #1
+ srshr v17.8h, v25.8h, #1
+ srshr v18.8h, v26.8h, #1
+ srshr v19.8h, v27.8h, #1
+.endif
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ add x6, x0, #8
+ load_add_store_8x4 x6, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon
+ mov x15, x30
+ movi v2.8h, #0
+
+ mov x11, #32
+ cmp w3, w13
+ b.lt 1f
+
+ add x6, x2, #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ ld1 {\i}, [x6]
+ st1 {v2.8h}, [x6], x11
+.endr
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v24, v25, v26, v27, v0.h[0]
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x6]
+ st1 {v2.8h}, [x6], x11
+.endr
+ blr x4
+ srshr v24.8h, v16.8h, #1
+ srshr v25.8h, v17.8h, #1
+ srshr v26.8h, v18.8h, #1
+ srshr v27.8h, v19.8h, #1
+.endif
+ transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7
+ ins v28.d[0], v24.d[1]
+ ins v29.d[0], v25.d[1]
+ ins v30.d[0], v26.d[1]
+ ins v31.d[0], v27.d[1]
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ movi v2.8h, #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x2]
+ st1 {v2.8h}, [x2], x11
+.endr
+.ifc \variant, identity_
+ mov w16, #(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+.else
+ blr x4
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ ins v20.d[0], v16.d[1]
+ ins v21.d[0], v17.d[1]
+ ins v22.d[0], v18.d[1]
+ ins v23.d[0], v19.d[1]
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ ret x15
+endfunc
+.endm
+
+def_fn_416_base
+def_fn_416_base identity_
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_4h_x\h\()_neon
+ mov w13, #\eob_half
+.else
+ adr x4, inv_\txfm1\()_4h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_16x8_neon
+ mov x15, x30
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x2]
+ st1 {v4.8h}, [x2], #16
+.endr
+
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.ifc \variant, identity_
+ mov w16, #2*(5793-4096)*8
+ dup v0.4h, w16
+ identity_8x16_shift1 v0.h[0]
+.else
+ blr x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
+.endr
+.endif
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+.ifc \variant, identity_
+ mov v16.16b, v24.16b
+ mov v17.16b, v25.16b
+ mov v18.16b, v26.16b
+ mov v19.16b, v27.16b
+ mov v20.16b, v28.16b
+ mov v21.16b, v29.16b
+ mov v22.16b, v30.16b
+ mov v23.16b, v31.16b
+.else
+ srshr v16.8h, v24.8h, #1
+ srshr v17.8h, v25.8h, #1
+ srshr v18.8h, v26.8h, #1
+ srshr v19.8h, v27.8h, #1
+ srshr v20.8h, v28.8h, #1
+ srshr v21.8h, v29.8h, #1
+ srshr v22.8h, v30.8h, #1
+ srshr v23.8h, v31.8h, #1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ add x0, x0, #8
+ load_add_store_8x8 x0, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+ mov x15, x30
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+ mov x11, #32
+
+ cmp w3, w13
+ b.lt 1f
+
+ add x6, x2, #16
+.ifc \variant, identity_
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x6]
+ st1 {v4.8h}, [x6], x11
+.endr
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x6]
+ st1 {v4.8h}, [x6], x11
+.endr
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ srshr v24.8h, v16.8h, #1
+ srshr v25.8h, v17.8h, #1
+ srshr v26.8h, v18.8h, #1
+ srshr v27.8h, v19.8h, #1
+ srshr v28.8h, v20.8h, #1
+ srshr v29.8h, v21.8h, #1
+ srshr v30.8h, v22.8h, #1
+ srshr v31.8h, v23.8h, #1
+.endif
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ movi v4.8h, #0
+ mov w16, #2896*8
+ dup v0.4h, w16
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v4.8h}, [x2], x11
+.endr
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+.ifc \variant, identity_
+ // The identity shl #1 and downshift srshr #1 cancel out
+.else
+ blr x4
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
+.endr
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ret x15
+endfunc
+.endm
+
+def_fn_816_base
+def_fn_816_base identity_
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
+.if \w == 8
+ mov x13, #\eob_half
+.endif
+.ifc \txfm1, identity
+ b inv_txfm_identity_add_\w\()x\h\()_neon
+.else
+ b inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43
+def_fn_816 \w, \h, identity, identity, 43
+def_fn_816 \w, \h, dct, adst, 43
+def_fn_816 \w, \h, dct, flipadst, 43
+def_fn_816 \w, \h, dct, identity, 8
+def_fn_816 \w, \h, adst, dct, 43
+def_fn_816 \w, \h, adst, adst, 43
+def_fn_816 \w, \h, adst, flipadst, 43
+def_fn_816 \w, \h, flipadst, dct, 43
+def_fn_816 \w, \h, flipadst, adst, 43
+def_fn_816 \w, \h, flipadst, flipadst, 43
+def_fn_816 \w, \h, identity, dct, 64
+def_fn_816 \w, \h, adst, identity, 8
+def_fn_816 \w, \h, flipadst, identity, 8
+def_fn_816 \w, \h, identity, adst, 64
+def_fn_816 \w, \h, identity, flipadst, 64
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
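+ // Odd half of the 32-point inverse DCT: transforms the 16 odd-indexed
+ // inputs in v16-v31; the callers combine the result with the even half
+ // produced by inv_dct_8h_x16_neon.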
+function inv_dct32_odd_8h_x16_neon, export=1
+ movrel x16, idct_coeffs, 2*16
+ ld1 {v0.8h, v1.8h}, [x16]
+ sub x16, x16, #2*16
+
+ smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a
+ smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a
+ smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a
+ sqrshrn_sz v16, v2, v3, #12, .8h // t16a
+ sqrshrn_sz v31, v4, v5, #12, .8h // t31a
+ smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a
+ smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t17a
+ sqrshrn_sz v23, v2, v3, #12, .8h // t30a
+ smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a
+ smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t29a
+ smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a
+ smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t19a
+ sqrshrn_sz v19, v4, v5, #12, .8h // t28a
+ smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a
+ smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a
+ sqrshrn_sz v18, v6, v7, #12, .8h // t20a
+ sqrshrn_sz v29, v2, v3, #12, .8h // t27a
+ smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a
+ smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a
+ sqrshrn_sz v26, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v21, v6, v7, #12, .8h // t26a
+ smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a
+ smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a
+ sqrshrn_sz v22, v2, v3, #12, .8h // t22a
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25a
+ smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a
+ sqrshrn_sz v30, v6, v7, #12, .8h // t23a
+ sqrshrn_sz v17, v2, v3, #12, .8h // t24a
+
+ ld1 {v0.8h}, [x16]
+
+ sqsub v2.8h, v16.8h, v24.8h // t17
+ sqadd v16.8h, v16.8h, v24.8h // t16
+ sqsub v3.8h, v31.8h, v23.8h // t30
+ sqadd v31.8h, v31.8h, v23.8h // t31
+ sqsub v24.8h, v28.8h, v20.8h // t18
+ sqadd v28.8h, v28.8h, v20.8h // t19
+ sqadd v23.8h, v18.8h, v26.8h // t20
+ sqsub v18.8h, v18.8h, v26.8h // t21
+ sqsub v20.8h, v30.8h, v22.8h // t22
+ sqadd v30.8h, v30.8h, v22.8h // t23
+ sqadd v26.8h, v17.8h, v25.8h // t24
+ sqsub v17.8h, v17.8h, v25.8h // t25
+ sqsub v22.8h, v29.8h, v21.8h // t26
+ sqadd v29.8h, v29.8h, v21.8h // t27
+ sqadd v25.8h, v19.8h, v27.8h // t28
+ sqsub v19.8h, v19.8h, v27.8h // t29
+
+ smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a
+ smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a
+ smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t17a
+ sqrshrn_sz v27, v6, v7, #12, .8h // t30a
+ neg v2.4s, v2.4s // -> t18a
+ neg v3.4s, v3.4s // -> t18a
+ smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a
+ smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a
+ sqrshrn_sz v19, v2, v3, #12, .8h // t18a
+ sqrshrn_sz v24, v4, v5, #12, .8h // t29a
+ smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a
+ smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a
+ sqrshrn_sz v22, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v18, v2, v3, #12, .8h // t26a
+ neg v4.4s, v4.4s // -> t22a
+ neg v5.4s, v5.4s // -> t22a
+ smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a
+ sqrshrn_sz v17, v4, v5, #12, .8h // t22a
+ sqrshrn_sz v20, v6, v7, #12, .8h // t25a
+
+ sqsub v2.8h, v27.8h, v24.8h // t29
+ sqadd v27.8h, v27.8h, v24.8h // t30
+ sqsub v3.8h, v21.8h, v19.8h // t18
+ sqadd v21.8h, v21.8h, v19.8h // t17
+ sqsub v24.8h, v16.8h, v28.8h // t19a
+ sqadd v16.8h, v16.8h, v28.8h // t16a
+ sqsub v19.8h, v30.8h, v23.8h // t20a
+ sqadd v30.8h, v30.8h, v23.8h // t23a
+ sqsub v28.8h, v17.8h, v22.8h // t21
+ sqadd v17.8h, v17.8h, v22.8h // t22
+ sqadd v23.8h, v26.8h, v29.8h // t24a
+ sqsub v26.8h, v26.8h, v29.8h // t27a
+ sqadd v22.8h, v20.8h, v18.8h // t25
+ sqsub v20.8h, v20.8h, v18.8h // t26
+ sqsub v29.8h, v31.8h, v25.8h // t28a
+ sqadd v31.8h, v31.8h, v25.8h // t31a
+
+ smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a
+ smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a
+ smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19
+ sqrshrn_sz v18, v4, v5, #12, .8h // t18a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t29a
+ smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28
+ smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20
+ sqrshrn_sz v29, v2, v3, #12, .8h // t19
+ sqrshrn_sz v24, v4, v5, #12, .8h // t28
+ neg v6.4s, v6.4s // -> t20
+ neg v7.4s, v7.4s // -> t20
+ smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27
+ smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a
+ sqrshrn_sz v26, v6, v7, #12, .8h // t20
+ sqrshrn_sz v19, v2, v3, #12, .8h // t27
+ neg v4.4s, v4.4s // -> t21a
+ neg v5.4s, v5.4s // -> t21a
+ smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a
+ sqrshrn_sz v20, v4, v5, #12, .8h // t21a
+ sqrshrn_sz v28, v6, v7, #12, .8h // t26a
+
+ sqsub v2.8h, v16.8h, v30.8h // t23
+ sqadd v16.8h, v16.8h, v30.8h // t16 = out16
+ sqsub v3.8h, v31.8h, v23.8h // t24
+ sqadd v31.8h, v31.8h, v23.8h // t31 = out31
+ sqsub v23.8h, v21.8h, v17.8h // t22a
+ sqadd v17.8h, v21.8h, v17.8h // t17a = out17
+ sqadd v30.8h, v27.8h, v22.8h // t30a = out30
+ sqsub v21.8h, v27.8h, v22.8h // t25a
+ sqsub v27.8h, v18.8h, v20.8h // t21
+ sqadd v18.8h, v18.8h, v20.8h // t18 = out18
+ sqadd v4.8h, v29.8h, v26.8h // t19a = out19
+ sqsub v26.8h, v29.8h, v26.8h // t20a
+ sqadd v29.8h, v25.8h, v28.8h // t29 = out29
+ sqsub v25.8h, v25.8h, v28.8h // t26
+ sqadd v28.8h, v24.8h, v19.8h // t28a = out28
+ sqsub v24.8h, v24.8h, v19.8h // t27a
+ mov v19.16b, v4.16b // out19
+
+ smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20
+ smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27
+ sqrshrn_sz v20, v4, v5, #12, .8h // t20
+ sqrshrn_sz v22, v6, v7, #12, .8h // t27
+
+ smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
+ smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
+ mov v27.16b, v22.16b // t27
+ sqrshrn_sz v26, v4, v5, #12, .8h // t26a
+
+ smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
+ smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25
+ sqrshrn_sz v21, v6, v7, #12, .8h // t21a
+ sqrshrn_sz v22, v24, v25, #12, .8h // t22
+ sqrshrn_sz v25, v4, v5, #12, .8h // t25
+
+ smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a
+ smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a
+ sqrshrn_sz v23, v4, v5, #12, .8h // t23a
+ sqrshrn_sz v24, v6, v7, #12, .8h // t24a
+
+ ret
+endfunc
+
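+ // Horizontal 32-point DCT pass over a 32x8 slice: the even and odd
+ // halves are transformed separately and recombined by the store2
+ // butterflies, which also mirror the odd outputs.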
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x8_neon
+ mov x14, x30
+ movi v7.8h, #0
+ lsl x8, x8, #1
+.if \scale
+ mov w16, #2896*8
+ dup v0.4h, w16
+.endif
+
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_8h_x16_neon
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
+
+.macro store1 r0, r1
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ add x6, x6, #32
+.endm
+ store1 v16.8h, v24.8h
+ store1 v17.8h, v25.8h
+ store1 v18.8h, v26.8h
+ store1 v19.8h, v27.8h
+ store1 v20.8h, v28.8h
+ store1 v21.8h, v29.8h
+ store1 v22.8h, v30.8h
+ store1 v23.8h, v31.8h
+.purgem store1
+ sub x6, x6, #64*8
+
+ movi v7.8h, #0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x7]
+ st1 {v7.8h}, [x7], x8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in v0.h[1]
+ scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_8h_x16_neon
+ transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+ transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
+.macro store2 r0, r1, shift
+ ld1 {v4.8h, v5.8h}, [x6]
+ sqsub v7.8h, v4.8h, \r0
+ sqsub v6.8h, v5.8h, \r1
+ sqadd v4.8h, v4.8h, \r0
+ sqadd v5.8h, v5.8h, \r1
+ rev64 v6.8h, v6.8h
+ rev64 v7.8h, v7.8h
+ srshr v4.8h, v4.8h, #\shift
+ srshr v5.8h, v5.8h, #\shift
+ srshr v6.8h, v6.8h, #\shift
+ srshr v7.8h, v7.8h, #\shift
+ ext v6.16b, v6.16b, v6.16b, #8
+ st1 {v4.8h, v5.8h}, [x6], #32
+ ext v7.16b, v7.16b, v7.16b, #8
+ st1 {v6.8h, v7.8h}, [x6], #32
+.endm
+
+ store2 v31.8h, v23.8h, \shift
+ store2 v30.8h, v22.8h, \shift
+ store2 v29.8h, v21.8h, \shift
+ store2 v28.8h, v20.8h, \shift
+ store2 v27.8h, v19.8h, \shift
+ store2 v26.8h, v18.8h, \shift
+ store2 v25.8h, v17.8h, \shift
+ store2 v24.8h, v16.8h, \shift
+.purgem store2
+ ret x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl inv_dct_8h_x16_neon
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1
+ bl inv_dct32_odd_8h_x16_neon
+
+ neg x9, x8
+ mov x10, x6
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8b}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8b}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8b}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ uaddw v5.8h, v5.8h, v2.8b
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ sqxtun v2.8b, v5.8h
+ ld1 {v5.8h}, [x7], \stride
+ uaddw v6.8h, v6.8h, v3.8b
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8b}, [x6], x1
+ ld1 {v2.8b}, [x10], x1
+ sqxtun v3.8b, v6.8h
+ uaddw v7.8h, v7.8h, v4.8b
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8b}, [x6], x1
+ sqxtun v4.8b, v7.8h
+ uaddw v5.8h, v5.8h, v2.8b
+ st1 {v4.8b}, [x6], x1
+ sqxtun v2.8b, v5.8h
+ st1 {v2.8b}, [x6], x1
+.endm
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ sub x7, x7, x8
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ ret x14
+endfunc
+
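+ // eob thresholds used to decide how many 8-coefficient slabs of a block
+ // actually contain non-zero coefficients and need processing.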
+const eob_32x32
+ .short 36, 136, 300, 1024
+endconst
+
+const eob_16x32
+ .short 36, 151, 279, 512
+endconst
+
+const eob_16x32_shortside
+ .short 36, 512
+endconst
+
+const eob_8x32
+ .short 43, 107, 171, 256
+endconst
+
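+ // identity_identity needs no butterflies, so it is handled 8x8 at a time
+ // straight from the coefficient buffer, with the loop bounds driven by
+ // the eob tables above.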
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
+ movi v0.8h, #0
+ movrel x13, eob_32x32
+
+ mov x8, #2*32
+1:
+ mov w9, #0
+ movrel x12, eob_32x32
+2:
+ add w9, w9, #8
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2
+ ldrh w11, [x12], #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #2
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #2*8
+ b 1b
+9:
+ ret
+endfunc
+
+.macro shift_8_regs op, shift
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ \op \i, \i, #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ mov w16, #2896*8
+ mov w17, #2*(5793-4096)*8
+ dup v1.4h, w16
+ movi v0.8h, #0
+ mov v1.h[1], w17
+ movrel x13, eob_16x32\hshort
+
+ mov x8, #2*\h
+1:
+ mov w9, #0
+ movrel x12, eob_16x32\wshort
+2:
+ add w9, w9, #8
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+ scale_input .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+.if \w == 16
+ // 16x32
+ identity_8x8_shift1 v1.h[1]
+.else
+ // 32x16
+ shift_8_regs sqshl, 1
+ identity_8x8 v1.h[1]
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #2
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #2*8
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+ movi v0.8h, #0
+ movrel x13, eob_8x32
+
+ mov w8, #2*\h
+1:
+ ldrh w12, [x13], #2
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
+ st1 {v0.8h}, [x2], x8
+.endr
+
+.if \w == 8
+ // 8x32
+ shift_8_regs srshr, 1
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ cmp w3, w12
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f
+.if \w == 8
+ sub x2, x2, x8, lsl #3
+ add x2, x2, #2*8
+.else
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #2048
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 8, 16, 24
+ add x6, sp, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+ adr x4, inv_dct_8h_x16_neon
+
+.irp i, 0, 8, 16, 24
+ add x6, sp, #(\i*16*2)
+ add x7, x2, #(\i*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #2*32
+ bl inv_txfm_horz_scale_16x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #8
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+
+ adr x5, inv_dct_8h_x16_neon
+
+.irp i, 0, 8
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, #36
+ b.lt 1f
+.endif
+ mov x8, #2*16
+ bl inv_txfm_horz_scale_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ movrel x13, eob_8x32
+
+ movi v28.8h, #0
+ mov x8, #2*32
+ mov w9, #32
+ mov x6, sp
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v28.8h}, [x2], x8
+.endr
+ ldrh w12, [x13], #2
+ sub x2, x2, x8, lsl #3
+ sub w9, w9, #8
+ add x2, x2, #2*8
+
+ bl inv_dct_8h_x8_neon
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ srshr v\i\().8h, v\i\().8h, #2
+.endr
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+ cmp w3, w12
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+ b.ge 1b
+ cbz w9, 3f
+
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+2:
+ subs w9, w9, #8
+.rept 2
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ mov x6, sp
+ mov x7, x2
+ mov x8, #8*2
+ bl inv_txfm_horz_dct_32x8_neon
+
+ mov x8, #2*32
+ mov w9, #0
+1:
+ add x6, x0, x9
+ add x7, sp, x9, lsl #1 // x9*2, i.e. the column offset in the 16-bit temp buffer
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl inv_dct_8h_x8_neon
+
+ cmp w9, #32
+
+ load_add_store_8x8 x6, x7
+
+ b.lt 1b
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ ld1 {v0.8h, v1.8h}, [x17], #32
+
+ sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a
+ sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a
+ sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a
+ sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a
+ sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a
+ sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a
+ sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a
+ sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a
+
+ sqadd v24.8h, v16.8h, v17.8h // t32
+ sqsub v25.8h, v16.8h, v17.8h // t33
+ sqsub v26.8h, v19.8h, v18.8h // t34
+ sqadd v27.8h, v19.8h, v18.8h // t35
+ sqadd v28.8h, v20.8h, v21.8h // t60
+ sqsub v29.8h, v20.8h, v21.8h // t61
+ sqsub v30.8h, v23.8h, v22.8h // t62
+ sqadd v31.8h, v23.8h, v22.8h // t63
+
+ smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a
+ smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a
+ neg v2.4s, v2.4s // t34a
+ neg v3.4s, v3.4s // t34a
+ smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a
+ sqrshrn_sz v26, v2, v3, #12, .8h // t34a
+ smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a
+ sqrshrn_sz v29, v4, v5, #12, .8h // t61a
+ sqrshrn_sz v25, v6, v7, #12, .8h // t33a
+ sqrshrn_sz v30, v2, v3, #12, .8h // t62a
+
+ sqadd v16.8h, v24.8h, v27.8h // t32a
+ sqsub v19.8h, v24.8h, v27.8h // t35a
+ sqadd v17.8h, v25.8h, v26.8h // t33
+ sqsub v18.8h, v25.8h, v26.8h // t34
+ sqsub v20.8h, v31.8h, v28.8h // t60a
+ sqadd v23.8h, v31.8h, v28.8h // t63a
+ sqsub v21.8h, v30.8h, v29.8h // t61
+ sqadd v22.8h, v30.8h, v29.8h // t62
+
+ smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a
+ smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a
+ smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60
+ sqrshrn_sz v21, v2, v3, #12, .8h // t61a
+ sqrshrn_sz v18, v4, v5, #12, .8h // t34a
+ smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35
+ sqrshrn_sz v20, v6, v7, #12, .8h // t60
+ sqrshrn_sz v19, v2, v3, #12, .8h // t35
+
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
+
+ ret
+endfunc
+
+function inv_dct64_step2_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4h}, [x16]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #2*8*0] // t32a
+ ldr q17, [x9, #2*8*8] // t39a
+ ldr q18, [x9, #2*8*0] // t63a
+ ldr q19, [x6, #2*8*8] // t56a
+ ldr q20, [x6, #2*8*16] // t40a
+ ldr q21, [x9, #2*8*24] // t47a
+ ldr q22, [x9, #2*8*16] // t55a
+ ldr q23, [x6, #2*8*24] // t48a
+
+ sqadd v24.8h, v16.8h, v17.8h // t32
+ sqsub v25.8h, v16.8h, v17.8h // t39
+ sqadd v26.8h, v18.8h, v19.8h // t63
+ sqsub v27.8h, v18.8h, v19.8h // t56
+ sqsub v28.8h, v21.8h, v20.8h // t40
+ sqadd v29.8h, v21.8h, v20.8h // t47
+ sqadd v30.8h, v23.8h, v22.8h // t48
+ sqsub v31.8h, v23.8h, v22.8h // t55
+
+ smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a
+ smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a
+ smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a
+ sqrshrn_sz v25, v2, v3, #12, .8h // t56a
+ sqrshrn_sz v27, v4, v5, #12, .8h // t39a
+ neg v6.4s, v6.4s // t40a
+ neg v7.4s, v7.4s // t40a
+ smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a
+ sqrshrn_sz v31, v6, v7, #12, .8h // t40a
+ sqrshrn_sz v28, v2, v3, #12, .8h // t55a
+
+ sqadd v16.8h, v24.8h, v29.8h // t32a
+ sqsub v19.8h, v24.8h, v29.8h // t47a
+ sqadd v17.8h, v27.8h, v31.8h // t39
+ sqsub v18.8h, v27.8h, v31.8h // t40
+ sqsub v20.8h, v26.8h, v30.8h // t48a
+ sqadd v23.8h, v26.8h, v30.8h // t63a
+ sqsub v21.8h, v25.8h, v28.8h // t55
+ sqadd v22.8h, v25.8h, v28.8h // t56
+
+ smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a
+ smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a
+ smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47
+ sqrshrn_sz v18, v2, v3, #12, .8h // t40a
+ sqrshrn_sz v21, v4, v5, #12, .8h // t55a
+ smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48
+ sqrshrn_sz v19, v6, v7, #12, .8h // t47
+ sqrshrn_sz v20, v2, v3, #12, .8h // t48
+
+ str q16, [x6, #2*8*0] // t32a
+ str q17, [x9, #2*8*0] // t39
+ str q18, [x6, #2*8*8] // t40a
+ str q19, [x9, #2*8*8] // t47
+ str q20, [x6, #2*8*16] // t48
+ str q21, [x9, #2*8*16] // t55a
+ str q22, [x6, #2*8*24] // t56
+ str q23, [x9, #2*8*24] // t63a
+
+ add x6, x6, #2*8
+ sub x9, x9, #2*8
+ cmp x6, x9
+ b.lt 1b
+ ret
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.if \clear
+ ld1 {\i}, [\src]
+ st1 {\zero}, [\src], \strd
+.else
+ ld1 {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ st1 {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond
+.if \cond
+ mov \gpr, \val
+ dup \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
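+// Builds the 64-point DCT from smaller pieces: inv_dct_8h_x16_neon for the
+// even quarter, inv_dct32_odd_8h_x16_neon for the odd 16, and four
+// inv_dct64_step1_neon calls feeding inv_dct64_step2_neon for the remaining
+// 32 odd inputs, with optional clearing of consumed coefficients (clear=1)
+// and optional input scaling (scale=1).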
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
+ mov x14, x30
+ mov x6, sp
+ lsl x8, x8, #2
+
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.8h, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ add x7, x7, x8, lsr #1
+ scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_8h_x16_neon
+
+ store16 x6
+
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.8h, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ lsr x8, x8, #1
+ sub x7, x7, x8, lsr #1
+ scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_8h_x16_neon
+
+ add x10, x6, #16*15
+ sub x6, x6, #16*16
+
+ mov x9, #-16
+
+.macro store_addsub r0, r1, r2, r3
+ ld1 {v2.8h}, [x6], #16
+ ld1 {v3.8h}, [x6], #16
+ sqadd v6.8h, v2.8h, \r0
+ sqsub \r0, v2.8h, \r0
+ ld1 {v4.8h}, [x6], #16
+ sqadd v7.8h, v3.8h, \r1
+ sqsub \r1, v3.8h, \r1
+ ld1 {v5.8h}, [x6], #16
+ sqadd v2.8h, v4.8h, \r2
+ sub x6, x6, #16*4
+ sqsub \r2, v4.8h, \r2
+ st1 {v6.8h}, [x6], #16
+ st1 {\r0}, [x10], x9
+ sqadd v3.8h, v5.8h, \r3
+ sqsub \r3, v5.8h, \r3
+ st1 {v7.8h}, [x6], #16
+ st1 {\r1}, [x10], x9
+ st1 {v2.8h}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.8h}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.8h, v30.8h, v29.8h, v28.8h
+ store_addsub v27.8h, v26.8h, v25.8h, v24.8h
+ store_addsub v23.8h, v22.8h, v21.8h, v20.8h
+ store_addsub v19.8h, v18.8h, v17.8h, v16.8h
+.purgem store_addsub
+
+ add x6, x6, #2*8*16
+
+ movrel x17, idct64_coeffs
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.8h}, [x7] // in1 (offset 0)
+ ld1 {v17.8h}, [x9] // in31 (offset 15)
+ ld1 {v18.8h}, [x10] // in17 (offset 8)
+ ld1 {v19.8h}, [x11] // in15 (offset 7)
+ st1_if {v7.8h}, [x7], \clear
+ st1_if {v7.8h}, [x9], \clear
+ st1_if {v7.8h}, [x10], \clear
+ st1_if {v7.8h}, [x11], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.8h}, [x10] // in7 (offset 3)
+ ld1 {v17.8h}, [x11] // in25 (offset 12)
+ ld1 {v18.8h}, [x9] // in23 (offset 11)
+ ld1 {v19.8h}, [x7] // in9 (offset 4)
+ st1_if {v7.8h}, [x7], \clear
+ st1_if {v7.8h}, [x9], \clear
+ st1_if {v7.8h}, [x10], \clear
+ st1_if {v7.8h}, [x11], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movdup_if v0.4h, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.h[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ sub x6, x6, #2*8*32
+ add x9, x6, #2*8*7
+
+ bl inv_dct64_step2_neon
+
+ ret x14
+endfunc
+.endm
+
+def_dct64_func
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+
+function inv_txfm_horz_dct_64x8_neon
+ mov x14, x30
+
+ mov x7, sp
+ add x8, sp, #2*8*(64 - 4)
+ add x9, x6, #2*56
+ mov x10, #2*64
+ mov x11, #-2*8*4
+
+ dup v7.8h, w12
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+ transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
+
+.macro store_addsub src0, src1, src2, src3
+ sqsub v1.8h, \src0, \src1
+ sqadd v0.8h, \src0, \src1
+ sqsub v3.8h, \src2, \src3
+ srshl v1.8h, v1.8h, v7.8h
+ sqadd v2.8h, \src2, \src3
+ srshl v0.8h, v0.8h, v7.8h
+ srshl v3.8h, v3.8h, v7.8h
+ rev64 v1.8h, v1.8h
+ srshl v2.8h, v2.8h, v7.8h
+ rev64 v3.8h, v3.8h
+ ext v1.16b, v1.16b, v1.16b, #8
+ st1 {v0.8h}, [x6], x10
+ ext v3.16b, v3.16b, v3.16b, #8
+ st1 {v1.8h}, [x9], x10
+ st1 {v2.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ store_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ store_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ store_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem store_addsub
+ sub x6, x6, x10, lsl #3
+ sub x9, x9, x10, lsl #3
+ add x6, x6, #16
+ sub x9, x9, #16
+
+ cmp x7, x8
+ b.lt 1b
+ ret x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+ mov x7, sp
+ add x8, sp, #2*8*(64 - 4)
+ add x9, x6, x1, lsl #6
+ sub x9, x9, x1
+ neg x10, x1
+ mov x11, #-2*8*4
+
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+.macro add_dest_addsub src0, src1, src2, src3
+ ld1 {v0.8b}, [x6], x1
+ ld1 {v1.8b}, [x9], x10
+ sqadd v4.8h, \src0, \src1
+ ld1 {v2.8b}, [x6]
+ sqsub v5.8h, \src0, \src1
+ ld1 {v3.8b}, [x9]
+ sqadd v6.8h, \src2, \src3
+ sqsub v7.8h, \src2, \src3
+ sub x6, x6, x1
+ sub x9, x9, x10
+ srshr v4.8h, v4.8h, #4
+ srshr v5.8h, v5.8h, #4
+ srshr v6.8h, v6.8h, #4
+ uaddw v4.8h, v4.8h, v0.8b
+ srshr v7.8h, v7.8h, #4
+ uaddw v5.8h, v5.8h, v1.8b
+ uaddw v6.8h, v6.8h, v2.8b
+ sqxtun v0.8b, v4.8h
+ uaddw v7.8h, v7.8h, v3.8b
+ sqxtun v1.8b, v5.8h
+ st1 {v0.8b}, [x6], x1
+ sqxtun v2.8b, v6.8h
+ st1 {v1.8b}, [x9], x10
+ sqxtun v3.8b, v7.8h
+ st1 {v2.8b}, [x6], x1
+ st1 {v3.8b}, [x9], x10
+.endm
+ add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem add_dest_addsub
+ cmp x7, x8
+ b.lt 1b
+
+ ret x14
+endfunc
+
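+// The large dct_dct entry points below all follow the same two-pass scheme:
+// the horizontal transforms write 16-bit intermediates into a scratch buffer
+// on the stack (zero-filling 8-line slices past the eob threshold), then the
+// vertical transforms read that buffer back and add into the destination.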
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_8h_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ mov x12, #-1 // shift
+ bl inv_txfm_dct_clear_scale_8h_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i)
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ mov x15, x30
+
+ sub_sp 32*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_scale_dct_32x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x7, x5, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #32*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ mov x15, x30
+
+ sub_sp 64*16*2+64*8*2
+ add x4, sp, #64*8*2
+
+ movrel x13, eob_16x32
+
+.irp i, 0, 8
+ add x6, x4, #(\i*64*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #16*2
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_8h_x64_neon
+ add x6, x4, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x8_neon
+.if \i < 8
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ adr x5, inv_dct_8h_x16_neon
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i)
+ add x7, x4, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, x4, #64*16*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ mov x15, x30
+
+ sub_sp 16*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+
+ adr x4, inv_dct_8h_x16_neon
+.irp i, 0, 8, 16, 24
+ add x6, x5, #(\i*16*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 24
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_horz_16x8_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #8
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x7, x5, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_dct_8h_x64_neon
+ add x6, x0, #(\i)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #16*32*2
+ ret x15
+endfunc
diff --git a/third_party/dav1d/src/arm/64/itx16.S b/third_party/dav1d/src/arm/64/itx16.S
new file mode 100644
index 0000000000..eee3a9636d
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/itx16.S
@@ -0,0 +1,3648 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+// int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
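+//
+// In this 16 bpc version the coefficients are handled as 32 bit lanes (.4s)
+// while destination pixels are 16 bit lanes (.8h), unlike the 8 bpc itx.S
+// where both fit in 16 bit lanes.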
+
+const idct_coeffs, align=4
+ // idct4
+ .int 2896, 2896*8*(1<<16), 1567, 3784
+ // idct8
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+ .int 1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs
+ .int 2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4
+ .int 4091, 201, 3973, 995
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
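+// 32 bit butterfly multiply helpers: mul_mla computes d = s0*c0 + s1*c1 and
+// mul_mls computes d = s0*c0 - s1*c1, leaving .4s products that are later
+// rounded down by 12 bits with srshr.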
+.macro mul_mla d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0
+ mla \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0
+ mls \d\().4s, \s1\().4s, \c1
+.endm
+
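+// Multiply 4 or 8 registers by the fixed-point constant c with a saturating
+// rounding doubling high-half multiply (sqrdmulh), used to pre-scale
+// coefficients (e.g. by 2896, roughly 1/sqrt(2)) before a transform pass.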
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+ sqrdmulh \r0\sz, \r0\sz, \c
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
+.macro smin_4s r0, r1, r2
+ smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2
+ smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
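+// Staggered add/store pipeline towards the destination: each invocation
+// advances one row through the load -> shift (srshr) -> saturating add
+// (usqadd) -> clamp against the bitdepth max in v7 (smin) -> store stages,
+// overlapping memory accesses with the arithmetic of neighbouring rows.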
+.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
+.ifnb \load
+ ld1 {\load}, [\src], x1
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits
+.endif
+.ifnb \addsrc
+ usqadd \adddst, \addsrc
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src
+ load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src
+ load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src
+ load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src
+ load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src
+ load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
+ load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+ load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+ load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+ load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+ load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+ load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+ load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+ load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
+ load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+ load_add_store , , , , v27.8h, v26.8h, \dst, \src
+ load_add_store , , , , , v27.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
+ load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
+ load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
+ load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits
+ load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits
+ load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+ load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
+ load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
+ load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v5.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
+.ifnb \load
+ ld1 {\load}[0], [\src], x1
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0]
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1
+.endif
+.ifnb \addsrc
+ usqadd \adddst, \addsrc
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
+ load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
+ load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
+ load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
+ load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src
+ load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src
+ load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
+ load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
+ load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src
+ load_add_store4 , , , , , , , v23.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+ load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
+ load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
+ load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
+ load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
+ load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
+ load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src
+ load_add_store4 , , , , , , , v3.d, \dst, \src
+.endm
+
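+// Fast path for eob == 0: only the DC coefficient is present, so scale it
+// once (twice for rectangular sizes, which carry an extra 1/sqrt(2) factor),
+// round, and branch to the width-specific idct_dc_w*_neon add loop below.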
+.macro idct_dc w, h, shift
+ cbnz w3, 1f
+ movz w16, #2896*8, lsl #16
+ ld1r {v16.4s}, [x2]
+ dup v0.2s, w16
+ sqrdmulh v20.4s, v16.4s, v0.s[0]
+ str wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v20.4s, v20.4s, v0.s[0]
+.endif
+.if \shift > 0
+ sqrshrn v16.4h, v20.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+.else
+ sqxtn v16.4h, v20.4s
+ sqxtn2 v16.8h, v20.4s
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[1]
+ srshr v16.8h, v16.8h, #4
+ mov w4, #\h
+ b idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[0], [x0], x1
+ subs w4, w4, #4
+ ld1 {v1.d}[1], [x0], x1
+ usqadd v0.8h, v16.8h
+ sub x0, x0, x1, lsl #2
+ usqadd v1.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w8_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h}, [x0], x1
+ subs w4, w4, #4
+ ld1 {v1.8h}, [x0], x1
+ usqadd v0.8h, v16.8h
+ ld1 {v2.8h}, [x0], x1
+ usqadd v1.8h, v16.8h
+ ld1 {v3.8h}, [x0], x1
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ sub x0, x0, x1, lsl #2
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.8h}, [x0], x1
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w16_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h}, [x0], x1
+ subs w4, w4, #2
+ ld1 {v2.8h, v3.8h}, [x0], x1
+ usqadd v0.8h, v16.8h
+ usqadd v1.8h, v16.8h
+ sub x0, x0, x1, lsl #1
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w32_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ subs w4, w4, #1
+ usqadd v0.8h, v16.8h
+ usqadd v1.8h, v16.8h
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w64_neon
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x1, x1, #64
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ subs w4, w4, #1
+ usqadd v0.8h, v16.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
+ usqadd v1.8h, v16.8h
+ sub x0, x0, #64
+ usqadd v2.8h, v16.8h
+ usqadd v3.8h, v16.8h
+ usqadd v4.8h, v16.8h
+ usqadd v5.8h, v16.8h
+ usqadd v6.8h, v16.8h
+ usqadd v7.8h, v16.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ smin v6.8h, v6.8h, v31.8h
+ smin v7.8h, v7.8h, v31.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
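+// 4-point inverse Walsh-Hadamard transform (the lossless wht_wht path),
+// expressed as add/sub lifting steps around a single shared >>1.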
+.macro iwht4
+ add v16.4s, v16.4s, v17.4s
+ sub v21.4s, v18.4s, v19.4s
+ sub v20.4s, v16.4s, v21.4s
+ sshr v20.4s, v20.4s, #1
+ sub v18.4s, v20.4s, v17.4s
+ sub v17.4s, v20.4s, v19.4s
+ add v19.4s, v21.4s, v18.4s
+ sub v16.4s, v16.4s, v17.4s
+.endm
+
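+// 4-point inverse DCT butterfly on 32 bit lanes; v0 holds the idct_coeffs
+// values 2896 (roughly 4096/sqrt(2)) and the 1567/3784 sin/cos pair, with
+// each product rounded down by 12 bits.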
+.macro idct_4 r0, r1, r2, r3
+ mul_mla v6, \r1, \r3, v0.s[3], v0.s[2]
+ mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
+ mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
+ mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
+ srshr v6.4s, v6.4s, #12
+ srshr v2.4s, v2.4s, #12
+ srshr v7.4s, v4.4s, #12
+ srshr v3.4s, v3.4s, #12
+ sqadd \r0\().4s, v2.4s, v6.4s
+ sqsub \r3\().4s, v2.4s, v6.4s
+ sqadd \r1\().4s, v3.4s, v7.4s
+ sqsub \r2\().4s, v3.4s, v7.4s
+.endm
+
+function inv_dct_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16]
+ idct_4 v16, v17, v18, v19
+ ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs
+ ld1 {v0.4s}, [x16]
+
+ sub v3.4s, v16.4s, v18.4s
+ mul v4.4s, v16.4s, v0.s[0]
+ mla v4.4s, v18.4s, v0.s[1]
+ mla v4.4s, v19.4s, v0.s[2]
+ mul v7.4s, v17.4s, v0.s[3]
+ add v3.4s, v3.4s, v19.4s
+ mul v5.4s, v16.4s, v0.s[2]
+ mls v5.4s, v18.4s, v0.s[0]
+ mls v5.4s, v19.4s, v0.s[1]
+
+ add \o3\().4s, v4.4s, v5.4s
+ mul \o2\().4s, v3.4s, v0.s[3]
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ srshr \o0\().4s, \o0\().4s, #12
+ srshr \o2\().4s, \o2\().4s, #12
+ srshr \o1\().4s, \o1\().4s, #12
+ srshr \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_4x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_4x4 v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x4_neon
+ AARCH64_VALID_CALL_TARGET
+ movz w16, #(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+ sqrdmulh v4.4s, v16.4s, v0.s[0]
+ sqrdmulh v5.4s, v17.4s, v0.s[0]
+ sqrdmulh v6.4s, v18.4s, v0.s[0]
+ sqrdmulh v7.4s, v19.4s, v0.s[0]
+ sqadd v16.4s, v16.4s, v4.4s
+ sqadd v17.4s, v17.4s, v5.4s
+ sqadd v18.4s, v18.4s, v6.4s
+ sqadd v19.4s, v19.4s, v7.4s
+ ret
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+ mov x15, x30
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v30.4s, v31.4s}, [x2], #32
+
+ sshr v16.4s, v16.4s, #2
+ sshr v17.4s, v17.4s, #2
+ sshr v18.4s, v18.4s, #2
+ sshr v19.4s, v19.4s, #2
+
+ iwht4
+
+ st1 {v30.4s, v31.4s}, [x2], #32
+ transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4
+
+ ld1 {v0.d}[0], [x0], x1
+ sqxtn v16.4h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqxtn2 v16.8h, v17.4s
+ ld1 {v1.d}[0], [x0], x1
+ sqxtn v18.4h, v18.4s
+ ld1 {v1.d}[1], [x0], x1
+ sqxtn2 v18.8h, v19.4s
+
+ b L(itx_4x4_end)
+endfunc
+
+function inv_txfm_add_4x4_neon
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v30.4s, v31.4s}, [x2], #32
+
+ blr x4
+
+ st1 {v30.4s, v31.4s}, [x2], #32
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5
+
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.d}[0], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end):
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff
+ sub x0, x0, x1, lsl #2
+ usqadd v0.8h, v16.8h
+ usqadd v1.8h, v18.8h
+ smin v0.8h, v0.8h, v31.8h
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+
+ ret x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f
+ movz w16, #2896*8, lsl #16
+ ld1r {v16.4s}, [x2]
+ dup v4.2s, w16
+ str wzr, [x2]
+ sqrdmulh v16.4s, v16.4s, v4.s[0]
+ ld1 {v0.d}[0], [x0], x1
+ sqxtn v20.4h, v16.4s
+ sqxtn2 v20.8h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqrdmulh v20.8h, v20.8h, v4.h[1]
+ ld1 {v1.d}[0], [x0], x1
+ srshr v16.8h, v20.8h, #4
+ ld1 {v1.d}[1], [x0], x1
+ srshr v18.8h, v20.8h, #4
+ movi v30.8h, #0
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4s_x4_neon
+ movrel x5, X(inv_\txfm2\()_4h_x4_neon)
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4 \r0, \r2, \r4, \r6
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
+ mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
+ mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
+ srshr \r1\().4s, v2.4s, #12 // t4a
+ srshr \r7\().4s, v3.4s, #12 // t7a
+ srshr \r3\().4s, v6.4s, #12 // t5a
+ srshr \r5\().4s, v7.4s, #12 // t6a
+
+ sqadd v2.4s, \r1\().4s, \r3\().4s // t4
+ sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
+ sqadd v3.4s, \r7\().4s, \r5\().4s // t7
+ sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
+
+.irp r, v2, \r1, v3, \r3
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+ mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
+ srshr v7.4s, v7.4s, #12 // t5
+ srshr v6.4s, v6.4s, #12 // t6
+
+ sqsub \r7\().4s, \r0\().4s, v3.4s // out7
+ sqadd \r0\().4s, \r0\().4s, v3.4s // out0
+ sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+ sqsub v6.4s, \r2\().4s, v6.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v7.4s // out5
+ sqadd \r3\().4s, \r6\().4s, v2.4s // out3
+ sqsub \r4\().4s, \r6\().4s, v2.4s // out4
+ mov \r6\().16b, v6.16b // out6
+.endm
+
+function inv_dct_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+ movrel x16, iadst8_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v23, v16, v0.s[0], v0.s[1]
+ mul_mls v4, v23, v16, v0.s[1], v0.s[0]
+ mul_mla v6, v21, v18, v0.s[2], v0.s[3]
+ srshr v16.4s, v2.4s, #12 // t0a
+ srshr v23.4s, v4.4s, #12 // t1a
+ mul_mls v2, v21, v18, v0.s[3], v0.s[2]
+ mul_mla v4, v19, v20, v1.s[0], v1.s[1]
+ srshr v18.4s, v6.4s, #12 // t2a
+ srshr v21.4s, v2.4s, #12 // t3a
+ mul_mls v6, v19, v20, v1.s[1], v1.s[0]
+ mul_mla v2, v17, v22, v1.s[2], v1.s[3]
+ srshr v20.4s, v4.4s, #12 // t4a
+ srshr v19.4s, v6.4s, #12 // t5a
+ mul_mls v4, v17, v22, v1.s[3], v1.s[2]
+ srshr v22.4s, v2.4s, #12 // t6a
+ srshr v17.4s, v4.4s, #12 // t7a
+
+ ld1 {v0.4s}, [x16]
+
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
+ sqadd v2.4s, v16.4s, v20.4s // t0
+ sqsub v3.4s, v16.4s, v20.4s // t4
+ mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+ sqadd v4.4s, v23.4s, v19.4s // t1
+ sqsub v5.4s, v23.4s, v19.4s // t5
+ sqadd v6.4s, v18.4s, v22.4s // t2
+ sqsub v7.4s, v18.4s, v22.4s // t6
+ sqadd v18.4s, v21.4s, v17.4s // t3
+ sqsub v19.4s, v21.4s, v17.4s // t7
+
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smax_4s \r, \r, v20
+.endr
+
+ mul_mla v16, v3, v5, v0.s[3], v0.s[2]
+ mul_mls v20, v3, v5, v0.s[2], v0.s[3]
+ mul_mls v22, v19, v7, v0.s[3], v0.s[2]
+
+ srshr v3.4s, v16.4s, #12 // t4a
+ srshr v5.4s, v20.4s, #12 // t5a
+
+ mul_mla v16, v19, v7, v0.s[2], v0.s[3]
+
+ srshr v7.4s, v22.4s, #12 // t6a
+ srshr v19.4s, v16.4s, #12 // t7a
+
+ sqadd \o0\().4s, v2.4s, v6.4s // out0
+ sqsub v2.4s, v2.4s, v6.4s // t2
+ sqadd \o7\().4s, v4.4s, v18.4s // out7
+ sqsub v4.4s, v4.4s, v18.4s // t3
+
+ mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqadd \o1\().4s, v3.4s, v7.4s // out1
+ sqsub v3.4s, v3.4s, v7.4s // t6
+ sqadd \o6\().4s, v5.4s, v19.4s // out6
+ sqsub v5.4s, v5.4s, v19.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v4, v3, v5
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+ smax_4s \r, \r, v18
+.endr
+
+ sqneg \o7\().4s, \o7\().4s // out7
+ sqneg \o1\().4s, \o1\().4s // out1
+
+ mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+ mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+ mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+ srshr v2.4s, v18.4s, #12 // out3
+ mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+ srshr v3.4s, v20.4s, #12 // out5
+ srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+ srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19)
+
+ sqneg \o3\().4s, v2.4s // out3
+ sqneg \o5\().4s, v3.4s // out5
+.endm
+
+function inv_adst_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+function inv_flipadst_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x8_neon
+ AARCH64_VALID_CALL_TARGET
+ sqshl v16.4s, v16.4s, #1
+ sqshl v17.4s, v17.4s, #1
+ sqshl v18.4s, v18.4s, #1
+ sqshl v19.4s, v19.4s, #1
+ sqshl v20.4s, v20.4s, #1
+ sqshl v21.4s, v21.4s, #1
+ sqshl v22.4s, v22.4s, #1
+ sqshl v23.4s, v23.4s, #1
+ ret
+endfunc
+
+function inv_txfm_add_8x8_neon
+ movi v31.4s, #0
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23
+
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ mov v23.16b, v27.16b
+
+ blr x5
+
+ load_add_store_8x8 x0, x7
+ ret x15
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ movrel x5, X(inv_\txfm2\()_8h_x8_neon)
+ mov w13, #\eob_half
+ adr x4, inv_\txfm1\()_4s_x8_neon
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+function inv_txfm_add_8x4_neon
+ movi v28.4s, #0
+ movi v29.4s, #0
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x4
+
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ sqxtn v20.4h, v20.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn v22.4h, v22.4s
+ sqxtn v23.4h, v23.4s
+
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5
+
+ load_add_store_8x4 x0, x7
+ ret x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+ movz w16, #2896*8, lsl #16
+ movi v31.4s, #0
+ dup v30.2s, w16
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v20.4h, v16.4s
+ sqxtn v21.4h, v17.4s
+ sqxtn v22.4h, v18.4s
+ sqxtn v23.4h, v19.4s
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+
+1:
+.irp i, v20, v21, v22, v23
+ movi \i\().4h, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x8 x0, x7
+ ret x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+ mov w13, #\eob_half
+.endif
+ movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
+function inv_dct_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+
+ // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #32
+
+ mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
+ mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
+ srshr v17.4s, v2.4s, #12 // t8a
+ srshr v31.4s, v3.4s, #12 // t15a
+ mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
+ mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ srshr v23.4s, v6.4s, #12 // t9a
+ srshr v25.4s, v2.4s, #12 // t14a
+ mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
+ mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
+ srshr v21.4s, v3.4s, #12 // t10a
+ srshr v27.4s, v6.4s, #12 // t13a
+ mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ srshr v19.4s, v2.4s, #12 // t11a
+ srshr v29.4s, v3.4s, #12 // t12a
+
+ ld1 {v0.4s}, [x16]
+
+ sqsub v2.4s, v17.4s, v23.4s // t9
+ sqadd v17.4s, v17.4s, v23.4s // t8
+ sqsub v3.4s, v31.4s, v25.4s // t14
+ sqadd v31.4s, v31.4s, v25.4s // t15
+ sqsub v23.4s, v19.4s, v21.4s // t10
+ sqadd v19.4s, v19.4s, v21.4s // t11
+ sqadd v25.4s, v29.4s, v27.4s // t12
+ sqsub v29.4s, v29.4s, v27.4s // t13
+
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
+ mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
+ srshr v21.4s, v7.4s, #12 // t9a
+ srshr v27.4s, v6.4s, #12 // t14a
+
+ mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
+ srshr v29.4s, v7.4s, #12 // t13a
+ neg v6.4s, v6.4s
+ srshr v23.4s, v6.4s, #12 // t10a
+
+ sqsub v2.4s, v17.4s, v19.4s // t11a
+ sqadd v17.4s, v17.4s, v19.4s // t8a
+ sqsub v3.4s, v31.4s, v25.4s // t12a
+ sqadd v31.4s, v31.4s, v25.4s // t15a
+ sqadd v19.4s, v21.4s, v23.4s // t9
+ sqsub v21.4s, v21.4s, v23.4s // t10
+ sqsub v25.4s, v27.4s, v29.4s // t13
+ sqadd v27.4s, v27.4s, v29.4s // t14
+
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
+ mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+ srshr v7.4s, v7.4s, #12 // t11
+ srshr v6.4s, v6.4s, #12 // t12
+ mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v2.4s, v2.4s, #12 // t10a
+ srshr v3.4s, v3.4s, #12 // t13a
+
+ sqadd v1.4s, v16.4s, v31.4s // out0
+ sqsub v31.4s, v16.4s, v31.4s // out15
+ mov v16.16b, v1.16b
+ sqadd v23.4s, v30.4s, v17.4s // out7
+ sqsub v1.4s, v30.4s, v17.4s // out8
+ sqadd v17.4s, v18.4s, v27.4s // out1
+ sqsub v30.4s, v18.4s, v27.4s // out14
+ sqadd v18.4s, v20.4s, v3.4s // out2
+ sqsub v29.4s, v20.4s, v3.4s // out13
+ sqadd v3.4s, v28.4s, v19.4s // out6
+ sqsub v25.4s, v28.4s, v19.4s // out9
+ sqadd v19.4s, v22.4s, v6.4s // out3
+ sqsub v28.4s, v22.4s, v6.4s // out12
+ sqadd v20.4s, v24.4s, v7.4s // out4
+ sqsub v27.4s, v24.4s, v7.4s // out11
+ sqadd v21.4s, v26.4s, v2.4s // out5
+ sqsub v26.4s, v26.4s, v2.4s // out10
+ mov v24.16b, v1.16b
+ mov v22.16b, v3.16b
+
+ ret
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel x16, iadst16_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0
+ mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2
+ srshr v16.4s, v2.4s, #12 // t0
+ srshr v31.4s, v4.4s, #12 // t1
+ mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3
+ mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4
+ srshr v18.4s, v6.4s, #12 // t2
+ srshr v29.4s, v2.4s, #12 // t3
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5
+ mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6
+ srshr v20.4s, v4.4s, #12 // t4
+ srshr v27.4s, v6.4s, #12 // t5
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7
+ ld1 {v0.4s, v1.4s}, [x16]
+ movrel x16, idct_coeffs
+ mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8
+ srshr v22.4s, v2.4s, #12 // t6
+ srshr v25.4s, v4.4s, #12 // t7
+ mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9
+ mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10
+ srshr v23.4s, v6.4s, #12 // t8
+ srshr v24.4s, v2.4s, #12 // t9
+ mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11
+ mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12
+ srshr v21.4s, v4.4s, #12 // t10
+ srshr v26.4s, v6.4s, #12 // t11
+ mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13
+ mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14
+ srshr v19.4s, v2.4s, #12 // t12
+ srshr v28.4s, v4.4s, #12 // t13
+ mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15
+ srshr v17.4s, v6.4s, #12 // t14
+ srshr v30.4s, v2.4s, #12 // t15
+
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqsub v2.4s, v16.4s, v23.4s // t8a
+ sqadd v16.4s, v16.4s, v23.4s // t0a
+ sqsub v3.4s, v31.4s, v24.4s // t9a
+ sqadd v31.4s, v31.4s, v24.4s // t1a
+ sqadd v23.4s, v18.4s, v21.4s // t2a
+ sqsub v18.4s, v18.4s, v21.4s // t10a
+ sqadd v24.4s, v29.4s, v26.4s // t3a
+ sqsub v29.4s, v29.4s, v26.4s // t11a
+ sqadd v21.4s, v20.4s, v19.4s // t4a
+ sqsub v20.4s, v20.4s, v19.4s // t12a
+ sqadd v26.4s, v27.4s, v28.4s // t5a
+ sqsub v27.4s, v27.4s, v28.4s // t13a
+ sqadd v19.4s, v22.4s, v17.4s // t6a
+ sqsub v22.4s, v22.4s, v17.4s // t14a
+ sqadd v28.4s, v25.4s, v30.4s // t7a
+ sqsub v25.4s, v25.4s, v30.4s // t15a
+
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smax_4s \r, \r, v7
+.endr
+
+ mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
+ mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
+ mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
+ srshr v17.4s, v4.4s, #12 // t8
+ srshr v30.4s, v6.4s, #12 // t9
+ mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12
+ srshr v18.4s, v2.4s, #12 // t10
+ srshr v29.4s, v4.4s, #12 // t11
+ mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14
+ srshr v27.4s, v6.4s, #12 // t12
+ srshr v20.4s, v2.4s, #12 // t13
+ mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15
+ srshr v25.4s, v4.4s, #12 // t14
+ srshr v22.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t4
+ sqadd v16.4s, v16.4s, v21.4s // t0
+ sqsub v3.4s, v31.4s, v26.4s // t5
+ sqadd v31.4s, v31.4s, v26.4s // t1
+ sqadd v21.4s, v23.4s, v19.4s // t2
+ sqsub v23.4s, v23.4s, v19.4s // t6
+ sqadd v26.4s, v24.4s, v28.4s // t3
+ sqsub v24.4s, v24.4s, v28.4s // t7
+ sqadd v19.4s, v17.4s, v27.4s // t8a
+ sqsub v17.4s, v17.4s, v27.4s // t12a
+ sqadd v28.4s, v30.4s, v20.4s // t9a
+ sqsub v30.4s, v30.4s, v20.4s // t13a
+ sqadd v27.4s, v18.4s, v25.4s // t10a
+ sqsub v18.4s, v18.4s, v25.4s // t14a
+ sqadd v20.4s, v29.4s, v22.4s // t11a
+ sqsub v29.4s, v29.4s, v22.4s // t15a
+
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smax_4s \r, \r, v7
+.endr
+
+ mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
+ mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
+ mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
+ srshr v22.4s, v4.4s, #12 // t4a
+ srshr v25.4s, v6.4s, #12 // t5a
+ mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a
+ mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12
+ srshr v24.4s, v2.4s, #12 // t6a
+ srshr v23.4s, v4.4s, #12 // t7a
+ mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13
+ mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14
+ srshr v17.4s, v6.4s, #12 // t12
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15
+ srshr v29.4s, v2.4s, #12 // t13
+ srshr v30.4s, v4.4s, #12 // t14
+ srshr v18.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t2a
+.ifc \o0, v16
+ sqadd \o0\().4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+.else
+ sqadd v4.4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+ mov \o0\().16b, v4.16b
+.endif
+
+ sqsub v3.4s, v29.4s, v18.4s // t15a
+ sqadd \o13\().4s, v29.4s, v18.4s // out13
+ sqadd \o2\().4s, v17.4s, v30.4s // out2
+ sqsub v26.4s, v17.4s, v30.4s // t14a
+
+ sqadd \o1\().4s, v19.4s, v27.4s // out1
+ sqsub v27.4s, v19.4s, v27.4s // t10
+ sqadd \o14\().4s, v28.4s, v20.4s // out14
+ sqsub v20.4s, v28.4s, v20.4s // t11
+
+ sqadd \o3\().4s, v22.4s, v24.4s // out3
+ sqsub v22.4s, v22.4s, v24.4s // t6
+ sqadd \o12\().4s, v25.4s, v23.4s // out12
+ sqsub v23.4s, v25.4s, v23.4s // t7
+
+        // The output registers are not clipped here, as they will be
+        // downshifted and narrowed afterwards anyway; only the temporaries
+        // that still feed the final multiplies below need clipping.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smax_4s \r, \r, v7
+.endr
+
+ sqneg \o15\().4s, \o15\().4s // out15
+ sqneg \o13\().4s, \o13\().4s // out13
+ sqneg \o1\().4s, \o1\().4s // out1
+ sqneg \o3\().4s, \o3\().4s // out3
+
+ mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+ mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+ mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+ srshr v24.4s, v24.4s, #12 // out8
+ srshr v4.4s, v4.4s, #12 // out7
+ srshr v5.4s, v6.4s, #12 // out5
+ mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+ mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+ srshr v26.4s, v6.4s, #12 // out10
+
+ mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+ mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+ mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+ srshr \o4\().4s, v2.4s, #12 // out4
+ srshr v6.4s, v6.4s, #12 // out11
+ srshr v7.4s, v21.4s, #12 // out9
+ srshr \o6\().4s, v22.4s, #12 // out6
+
+.ifc \o8, v23
+ mov \o8\().16b, v24.16b
+ mov \o10\().16b, v26.16b
+.endif
+
+ sqneg \o7\().4s, v4.4s // out7
+ sqneg \o5\().4s, v5.4s // out5
+ sqneg \o11\().4s, v6.4s // out11
+ sqneg \o9\().4s, v7.4s // out9
+.endm
+
+function inv_adst_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ ret
+endfunc
+
+function inv_flipadst_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
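+// 16-point identity transform: scales each coefficient by 2*5793/4096
+// (~2*sqrt(2)), computed as 2*x + x*2*(5793-4096)/4096 so that the sqrdmulh
+// constant stays in range.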
+function inv_identity_4s_x16_neon
+ AARCH64_VALID_CALL_TARGET
+ movz w16, #2*(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.4s, v\i\().4s, v0.s[0]
+ sqadd v\i\().4s, v\i\().4s, v\i\().4s
+ sqadd v\i\().4s, v\i\().4s, v2.4s
+.endr
+ ret
+endfunc
+
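+// Same 16-point identity scaling as above; the _shift1 variant folds a 1-bit
+// rounding downshift into the calculation: (2*x + x*c)/2 = x + ((x*c) >> 1).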
+.macro identity_4x16_shift1 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ srshr v3.4s, v3.4s, #1
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
+.macro identity_4x16 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
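+// First-pass (horizontal) helper for 16-point transforms: loads a 16x4 slice
+// of coefficients from x7 (clearing the buffer behind it), optionally scales
+// by 2896/4096, runs the 1-D transform pointed to by x4, then rounds by
+// \shift, narrows to 16 bit, transposes and stores the slice to x6.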
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ blr x4
+ sqrshrn v16.4h, v16.4s, #\shift
+ sqrshrn v17.4h, v17.4s, #\shift
+ sqrshrn v18.4h, v18.4s, #\shift
+ sqrshrn v19.4h, v19.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+ sqrshrn2 v17.8h, v21.4s, #\shift
+ sqrshrn2 v18.8h, v22.4s, #\shift
+ sqrshrn2 v19.8h, v23.4s, #\shift
+ sqrshrn v20.4h, v24.4s, #\shift
+ sqrshrn v21.4h, v25.4s, #\shift
+ sqrshrn v22.4h, v26.4s, #\shift
+ sqrshrn v23.4h, v27.4s, #\shift
+ sqrshrn2 v20.8h, v28.4s, #\shift
+ sqrshrn2 v21.8h, v29.4s, #\shift
+ sqrshrn2 v22.8h, v30.4s, #\shift
+ sqrshrn2 v23.8h, v31.4s, #\shift
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
+
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ ret x14
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
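+// Second-pass (vertical) helper: loads 16 rows of eight 16-bit coefficients
+// from x7 with stride x8, runs the 1-D transform pointed to by x5 and
+// accumulates the result into the destination pixels.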
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ ret x14
+endfunc
+
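+// 16x16 driver: the first pass works on 4-row slices of the input; once the
+// eob (w3) drops below the next threshold read from x13, the rest of the
+// intermediate buffer is simply zero-filled. The second pass then handles the
+// block as two 8-pixel-wide column groups.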
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+ ldrh w12, [x13], #2
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*16*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
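+// eob thresholds: each entry is compared against the block's eob to decide
+// whether the next slice of input can still contain nonzero coefficients.
+// eob_16x16_identity is used when exactly one of the two 1-D transforms is
+// an identity transform.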
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ adr x4, inv_\txfm1\()_4s_x16_neon
+ movrel x5, X(inv_\txfm2\()_8h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_16x16
+.else
+ movrel x13, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_16x16_identity
+.else
+ movrel x13, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+function inv_txfm_add_16x4_neon
+ mov x15, x30
+ movi v4.4s, #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], #16
+.endr
+
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ sqrshrn v16.4h, v24.4s, #1
+ sqrshrn v17.4h, v25.4s, #1
+ sqrshrn v18.4h, v26.4s, #1
+ sqrshrn v19.4h, v27.4s, #1
+ sqrshrn2 v16.8h, v28.4s, #1
+ sqrshrn2 v17.8h, v29.4s, #1
+ sqrshrn2 v18.8h, v30.4s, #1
+ sqrshrn2 v19.8h, v31.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ add x6, x0, #16
+ load_add_store_8x4 x6, x7
+
+ ret x15
+endfunc
+
+function inv_txfm_add_4x16_neon
+ ldrh w12, [x13, #4]
+ mov x15, x30
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2]
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0]
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v20.4h, v16.4s, #1
+ sqrshrn v21.4h, v17.4s, #1
+ sqrshrn v22.4h, v18.4s, #1
+ sqrshrn v23.4h, v19.4s, #1
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ movi \i, #0
+.endr
+2:
+
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v2.4s}, [x2], x11
+.endr
+ blr x4
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ ret x15
+endfunc
+
+const eob_4x16
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+ .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_4x16
+.else
+ movrel x13, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_4x16_identity2
+.else
+ movrel x13, eob_4x16
+.endif
+.endif
+.else
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
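+// The 16x8 and 8x16 cases keep part of the first-pass results in v8-v15
+// (v8-v11 for 8x16) across the second transform call, so the callee-saved
+// low halves of those registers are spilled to the stack first.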
+function inv_txfm_add_16x8_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ sqrshrn v12.4h, v24.4s, #1
+ sqrshrn v13.4h, v25.4s, #1
+ sqrshrn v14.4h, v26.4s, #1
+ sqrshrn v15.4h, v27.4s, #1
+ sqrshrn2 v12.8h, v28.4s, #1
+ sqrshrn2 v13.8h, v29.4s, #1
+ sqrshrn2 v14.8h, v30.4s, #1
+ sqrshrn2 v15.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+ transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5
+
+ b 2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+ movi \i, #0
+.endr
+2:
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ movi v4.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ sqrshrn v8.4h, v24.4s, #1
+ sqrshrn v9.4h, v25.4s, #1
+ sqrshrn v10.4h, v26.4s, #1
+ sqrshrn v11.4h, v27.4s, #1
+ sqrshrn2 v8.8h, v28.4s, #1
+ sqrshrn2 v9.8h, v29.4s, #1
+ sqrshrn2 v10.8h, v30.4s, #1
+ sqrshrn2 v11.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+ mov v16.16b, v8.16b
+ mov v17.16b, v9.16b
+ mov v18.16b, v10.16b
+ mov v19.16b, v11.16b
+ mov v20.16b, v12.16b
+ mov v21.16b, v13.16b
+ mov v22.16b, v14.16b
+ mov v23.16b, v15.16b
+
+ blr x5
+
+ add x0, x0, #16
+ load_add_store_8x8 x0, x7
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x15
+endfunc
+
+function inv_txfm_add_8x16_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
+ ldrh w12, [x13, #4]
+
+ mov x11, #64
+
+ cmp w3, w12
+ ldrh w12, [x13, #2]
+ b.lt 1f
+
+ add x6, x2, #48
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ sqrshrn2 v28.8h, v20.4s, #1
+ sqrshrn2 v29.8h, v21.4s, #1
+ sqrshrn2 v30.8h, v22.4s, #1
+ sqrshrn2 v31.8h, v23.4s, #1
+ transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0]
+ b.lt 1f
+
+ add x6, x2, #32
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ add x6, x2, #16
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+ movi \i, #0
+.endr
+
+2:
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x20
+
+ ret x15
+endfunc
+
+const eob_8x16
+ .short 10, 43, 75, 128
+endconst
+
+const eob_8x16_identity1
+ .short 4, 64, 96, 128
+endconst
+
+const eob_8x16_identity2
+ .short 4, 8, 12, 128
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_8x16
+.else
+ movrel x13, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_8x16_identity2
+.else
+ movrel x13, eob_8x16
+.endif
+.endif
+.if \h == 8
+ ldrh w13, [x13]
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
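+// Odd half of the 32-point inverse DCT: takes the 16 odd-indexed input
+// coefficients in v16-v31 and produces the t16-t31 terms, which the caller
+// combines with the even half by add/sub.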
+function inv_dct32_odd_4s_x16_neon
+ movrel x16, idct_coeffs, 4*16
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a
+ mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a
+ mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a
+ srshr v16.4s, v2.4s, #12 // t16a
+ srshr v31.4s, v4.4s, #12 // t31a
+ mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a
+ mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a
+ srshr v24.4s, v6.4s, #12 // t17a
+ srshr v23.4s, v2.4s, #12 // t30a
+ mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a
+ mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a
+ srshr v20.4s, v4.4s, #12 // t18a
+ srshr v27.4s, v6.4s, #12 // t29a
+ mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #4*24
+ mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a
+ srshr v28.4s, v2.4s, #12 // t19a
+ srshr v19.4s, v4.4s, #12 // t28a
+ mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a
+ mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a
+ srshr v18.4s, v6.4s, #12 // t20a
+ srshr v29.4s, v2.4s, #12 // t27a
+ mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a
+ mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a
+ srshr v26.4s, v4.4s, #12 // t21a
+ srshr v21.4s, v6.4s, #12 // t26a
+ mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a
+ mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a
+ srshr v22.4s, v2.4s, #12 // t22a
+ srshr v25.4s, v4.4s, #12 // t25a
+ mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a
+ srshr v30.4s, v6.4s, #12 // t23a
+ srshr v17.4s, v2.4s, #12 // t24a
+
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+ sqsub v2.4s, v16.4s, v24.4s // t17
+ sqadd v16.4s, v16.4s, v24.4s // t16
+ sqsub v3.4s, v31.4s, v23.4s // t30
+ sqadd v31.4s, v31.4s, v23.4s // t31
+ sqsub v24.4s, v28.4s, v20.4s // t18
+ sqadd v28.4s, v28.4s, v20.4s // t19
+ sqadd v23.4s, v18.4s, v26.4s // t20
+ sqsub v18.4s, v18.4s, v26.4s // t21
+ sqsub v20.4s, v30.4s, v22.4s // t22
+ sqadd v30.4s, v30.4s, v22.4s // t23
+ sqadd v26.4s, v17.4s, v25.4s // t24
+ sqsub v17.4s, v17.4s, v25.4s // t25
+ sqsub v22.4s, v29.4s, v21.4s // t26
+ sqadd v29.4s, v29.4s, v21.4s // t27
+ sqadd v25.4s, v19.4s, v27.4s // t28
+ sqsub v19.4s, v19.4s, v27.4s // t29
+
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
+ mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
+ mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
+ srshr v21.4s, v7.4s, #12 // t17a
+ srshr v27.4s, v6.4s, #12 // t30a
+ neg v2.4s, v2.4s // -> t18a
+ mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
+ srshr v19.4s, v2.4s, #12 // t18a
+ srshr v24.4s, v7.4s, #12 // t29a
+ mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
+ mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ srshr v22.4s, v6.4s, #12 // t21a
+ srshr v18.4s, v2.4s, #12 // t26a
+ neg v7.4s, v7.4s // -> t22a
+ mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
+ srshr v17.4s, v7.4s, #12 // t22a
+ srshr v20.4s, v6.4s, #12 // t25a
+
+ sqsub v2.4s, v27.4s, v24.4s // t29
+ sqadd v27.4s, v27.4s, v24.4s // t30
+ sqsub v3.4s, v21.4s, v19.4s // t18
+ sqadd v21.4s, v21.4s, v19.4s // t17
+ sqsub v24.4s, v16.4s, v28.4s // t19a
+ sqadd v16.4s, v16.4s, v28.4s // t16a
+ sqsub v19.4s, v30.4s, v23.4s // t20a
+ sqadd v30.4s, v30.4s, v23.4s // t23a
+ sqsub v28.4s, v17.4s, v22.4s // t21
+ sqadd v17.4s, v17.4s, v22.4s // t22
+ sqadd v23.4s, v26.4s, v29.4s // t24a
+ sqsub v26.4s, v26.4s, v29.4s // t27a
+ sqadd v22.4s, v20.4s, v18.4s // t25
+ sqsub v20.4s, v20.4s, v18.4s // t26
+ sqsub v29.4s, v31.4s, v25.4s // t28a
+ sqadd v31.4s, v31.4s, v25.4s // t31a
+
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
+ mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
+ mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
+ srshr v18.4s, v7.4s, #12 // t18a
+ srshr v25.4s, v6.4s, #12 // t29a
+ mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
+ srshr v29.4s, v2.4s, #12 // t19
+ srshr v24.4s, v7.4s, #12 // t28
+ neg v6.4s, v6.4s // -> t20
+ mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
+ mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ srshr v26.4s, v6.4s, #12 // t20
+ srshr v19.4s, v2.4s, #12 // t27
+ neg v7.4s, v7.4s // -> t21a
+ mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
+ srshr v20.4s, v7.4s, #12 // t21a
+ srshr v28.4s, v6.4s, #12 // t26a
+
+ sqsub v2.4s, v16.4s, v30.4s // t23
+ sqadd v16.4s, v16.4s, v30.4s // t16 = out16
+ sqsub v3.4s, v31.4s, v23.4s // t24
+ sqadd v31.4s, v31.4s, v23.4s // t31 = out31
+ sqsub v23.4s, v21.4s, v17.4s // t22a
+ sqadd v17.4s, v21.4s, v17.4s // t17a = out17
+ sqadd v30.4s, v27.4s, v22.4s // t30a = out30
+ sqsub v21.4s, v27.4s, v22.4s // t25a
+ sqsub v27.4s, v18.4s, v20.4s // t21
+ sqadd v18.4s, v18.4s, v20.4s // t18 = out18
+ sqadd v7.4s, v29.4s, v26.4s // t19a = out19
+ sqsub v26.4s, v29.4s, v26.4s // t20a
+ sqadd v29.4s, v25.4s, v28.4s // t29 = out29
+ sqsub v25.4s, v25.4s, v28.4s // t26
+ sqadd v28.4s, v24.4s, v19.4s // t28a = out28
+ sqsub v24.4s, v24.4s, v19.4s // t27a
+ mov v19.16b, v7.16b // out19
+
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
+ mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
+ srshr v20.4s, v7.4s, #12 // t20
+ srshr v22.4s, v6.4s, #12 // t27
+
+ mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
+ mov v27.16b, v22.16b // t27
+ srshr v26.4s, v7.4s, #12 // t26a
+
+ mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+ mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
+ srshr v21.4s, v6.4s, #12 // t21a
+ srshr v22.4s, v24.4s, #12 // t22
+ srshr v25.4s, v7.4s, #12 // t25
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
+ srshr v23.4s, v7.4s, #12 // t23a
+ srshr v24.4s, v6.4s, #12 // t24a
+
+ ret
+endfunc
+
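+// Horizontal 32-point DCT pass: the even half goes through
+// inv_dct_4s_x16_neon and is parked in the output buffer, the odd half
+// through inv_dct32_odd_4s_x16_neon; store2 then forms the 32 outputs as
+// even +/- odd, narrowing and reversing the mirrored half on the way out.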
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+ lsl x8, x8, #1
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
+ transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5
+
+.macro store1 r0, r1, r2, r3
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ st1 {\r2}, [x6], #16
+ st1 {\r3}, [x6], #16
+.endm
+ store1 v16.4s, v20.4s, v24.4s, v28.4s
+ store1 v17.4s, v21.4s, v25.4s, v29.4s
+ store1 v18.4s, v22.4s, v26.4s, v30.4s
+ store1 v19.4s, v23.4s, v27.4s, v31.4s
+.purgem store1
+ sub x6, x6, #64*4
+
+ movi v7.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
+ scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_4s_x16_neon
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+ transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5
+ transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5
+.macro store2 r0, r1, r2, r3, shift
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
+ sqsub v4.4s, v0.4s, \r0
+ sqadd v0.4s, v0.4s, \r0
+ sqsub v5.4s, v1.4s, \r1
+ sqadd v1.4s, v1.4s, \r1
+ sqsub v6.4s, v2.4s, \r2
+ sqadd v2.4s, v2.4s, \r2
+ sqsub v7.4s, v3.4s, \r3
+ sqadd v3.4s, v3.4s, \r3
+ sqrshrn v0.4h, v0.4s, #\shift
+ sqrshrn2 v0.8h, v1.4s, #\shift
+ sqrshrn v1.4h, v2.4s, #\shift
+ sqrshrn2 v1.8h, v3.4s, #\shift
+ sqrshrn v2.4h, v7.4s, #\shift
+ sqrshrn2 v2.8h, v6.4s, #\shift
+ sqrshrn v3.4h, v5.4s, #\shift
+ sqrshrn2 v3.8h, v4.4s, #\shift
+ st1 {v0.8h, v1.8h}, [x6], #32
+ rev64 v2.8h, v2.8h
+ rev64 v3.8h, v3.8h
+ st1 {v2.8h, v3.8h}, [x6], #32
+.endm
+
+ store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift
+ store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift
+ store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift
+ store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift
+.purgem store2
+ ret x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
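+// Vertical 32-point DCT pass: runs the 8h 16-point even and odd halves on the
+// intermediate buffer, then the combine macro adds/subtracts the two halves,
+// rounds by 4 bits and accumulates into the destination, clamping against the
+// pixel maximum kept in v1.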
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl X(inv_dct_8h_x16_neon)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1
+ bl X(inv_dct32_odd_8h_x16_neon)
+
+ neg x9, x8
+ mov x10, x6
+ mvni v1.8h, #0xfc, lsl #8 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8h}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8h}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8h}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ usqadd v2.8h, v5.8h
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ ld1 {v5.8h}, [x7], \stride
+ usqadd v3.8h, v6.8h
+ smin v2.8h, v2.8h, v1.8h
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8h}, [x6], x1
+ ld1 {v2.8h}, [x10], x1
+ usqadd v4.8h, v7.8h
+ smin v3.8h, v3.8h, v1.8h
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8h}, [x6], x1
+ usqadd v2.8h, v5.8h
+ smin v4.8h, v4.8h, v1.8h
+ st1 {v4.8h}, [x6], x1
+ smin v2.8h, v2.8h, v1.8h
+ st1 {v2.8h}, [x6], x1
+.endm
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ sub x7, x7, x8
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ ret x14
+endfunc
+
+const eob_32x32
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+ .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+ .short 10, 43, 75, 107, 139, 171, 203, 256
+endconst
+
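+// identity/identity 32x32: no two-pass buffer is needed; each 8x8 tile of
+// coefficients is narrowed, transposed and added to the destination directly,
+// stepping through the tiles under control of the eob_32x32 thresholds.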
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+ movi v0.8h, #0
+ movi v1.8h, #0
+ movrel x13, eob_32x32, 2
+
+ mov x8, #4*32
+1:
+ mov w9, #0
+ movrel x12, eob_32x32, 2
+2:
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2
+ ldrh w11, [x12], #4
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+
+.macro shift_16_regs op, shift
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ \op \i, \i, #\shift
+.endr
+.endm
+
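+// 16x32 and 32x16 identity/identity: the input is pre-scaled by 2896/4096,
+// then the 16-point identity scaling is applied (with the extra 1-bit
+// downshift folded in for 16x32, or after an explicit sqshl by 1 for 32x16)
+// before the usual narrow/transpose/add-to-destination step.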
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movz w16, #2896*8, lsl #16
+ movz w17, #2*(5793-4096)*8, lsl #16
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movrel x13, eob_16x32\hshort, 2
+
+ mov x8, #4*\h
+1:
+ mov w9, #0
+ movrel x12, eob_16x32\wshort, 2
+2:
+ add w9, w9, #8
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ dup v2.2s, w16
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ mov v2.s[1], w17
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+ // 16x32
+ identity_4x16_shift1 v2.s[1]
+.else
+ // 32x16
+ shift_16_regs sqshl, 1
+ identity_4x16 v2.s[1]
+.endif
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #4
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #16
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movi v0.4s, #0
+ movi v1.4s, #0
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ movrel x13, eob_8x32, 2
+
+ mov w8, #4*\h
+1:
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ ldrh w12, [x13], #4
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+
+.if \w == 8
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn2 v16.8h, v17.4s, #1
+ sqrshrn v17.4h, v18.4s, #1
+ sqrshrn2 v17.8h, v19.4s, #1
+ sqrshrn v18.4h, v20.4s, #1
+ sqrshrn2 v18.8h, v21.4s, #1
+ sqrshrn v19.4h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ sqrshrn v20.4h, v24.4s, #1
+ sqrshrn2 v20.8h, v25.4s, #1
+ sqrshrn v21.4h, v26.4s, #1
+ sqrshrn2 v21.8h, v27.4s, #1
+ sqrshrn v22.4h, v28.4s, #1
+ sqrshrn2 v22.8h, v29.4s, #1
+ sqrshrn v23.4h, v30.4s, #1
+ sqrshrn2 v23.8h, v31.4s, #1
+.else
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+
+ cmp w3, w12
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f
+.if \w == 8
+ sub x2, x2, x8, lsl #3
+ add x2, x2, #4*8
+.else
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #2048
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+ adr x4, inv_dct_4s_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*16*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #4*32
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+
+ movrel x13, eob_16x32
+ movrel x5, X(inv_dct_8h_x16_neon)
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #4*16
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ movrel x13, eob_8x32
+
+ movi v28.4s, #0
+ mov x8, #4*32
+ mov w9, #32
+ mov x6, sp
+ mov x7, x2
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().4s}, [x7]
+ st1 {v28.4s}, [x7], x8
+.endr
+ ldrh w12, [x13], #2
+ sub w9, w9, #4
+ sub x7, x7, x8, lsl #3
+ add x7, x7, #4*4
+
+ bl inv_dct_4s_x8_neon
+
+ sqrshrn v16.4h, v16.4s, #2
+ sqrshrn v17.4h, v17.4s, #2
+ sqrshrn v18.4h, v18.4s, #2
+ sqrshrn v19.4h, v19.4s, #2
+ sqrshrn2 v16.8h, v20.4s, #2
+ sqrshrn2 v17.8h, v21.4s, #2
+ sqrshrn2 v18.8h, v22.4s, #2
+ sqrshrn2 v19.8h, v23.4s, #2
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ cmp w3, w12
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+
+ b.ge 1b
+ cbz w9, 3f
+
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+2:
+ subs w9, w9, #4
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+ b.gt 2b
+
+3:
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+.irp i, 0, 4
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ cmp w3, #10
+ b.lt 1f
+.endif
+ mov x8, #8*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+ mov x8, #2*32
+ mov w9, #0
+1:
+ add x6, x0, x9, lsl #1
+ add x7, sp, x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ cmp w9, #32
+
+ load_add_store_8x8 x6, x7
+
+ b.lt 1b
+
+ add sp, sp, #512
+ ret x15
+endfunc
+
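+// The 64-point DCT is built in two steps: step1 runs the first butterfly
+// stages on one group of four odd inputs at a time and writes eight t-values
+// to the work buffer; step2 then combines the stored values, working from
+// both ends of each group towards the middle.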
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ ld1 {v0.4s, v1.4s}, [x17], #32
+
+ sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a
+ sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a
+ sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a
+ sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a
+ sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a
+ sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a
+ sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a
+ sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a
+
+ ld1 {v0.4s}, [x17], #16
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t33
+ sqsub v26.4s, v19.4s, v18.4s // t34
+ sqadd v27.4s, v19.4s, v18.4s // t35
+ sqadd v28.4s, v20.4s, v21.4s // t60
+ sqsub v29.4s, v20.4s, v21.4s // t61
+ sqsub v30.4s, v23.4s, v22.4s // t62
+ sqadd v31.4s, v23.4s, v22.4s // t63
+
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
+ mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ neg v2.4s, v2.4s // t34a
+ mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
+ srshr v26.4s, v2.4s, #12 // t34a
+ mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
+ srshr v29.4s, v7.4s, #12 // t61a
+ srshr v25.4s, v6.4s, #12 // t33a
+ srshr v30.4s, v2.4s, #12 // t62a
+
+ sqadd v16.4s, v24.4s, v27.4s // t32a
+ sqsub v19.4s, v24.4s, v27.4s // t35a
+ sqadd v17.4s, v25.4s, v26.4s // t33
+ sqsub v18.4s, v25.4s, v26.4s // t34
+ sqsub v20.4s, v31.4s, v28.4s // t60a
+ sqadd v23.4s, v31.4s, v28.4s // t63a
+ sqsub v21.4s, v30.4s, v29.4s // t61
+ sqadd v22.4s, v30.4s, v29.4s // t62
+
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
+ mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
+ srshr v21.4s, v2.4s, #12 // t61a
+ srshr v18.4s, v7.4s, #12 // t34a
+ mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
+ srshr v20.4s, v6.4s, #12 // t60
+ srshr v19.4s, v2.4s, #12 // t35
+
+ st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
+ st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
+
+ ret
+endfunc
+
+function inv_dct64_step2_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #4*4*0] // t32a
+ ldr q17, [x9, #4*4*8] // t39a
+ ldr q18, [x9, #4*4*0] // t63a
+ ldr q19, [x6, #4*4*8] // t56a
+ ldr q20, [x6, #4*4*16] // t40a
+ ldr q21, [x9, #4*4*24] // t47a
+ ldr q22, [x9, #4*4*16] // t55a
+ ldr q23, [x6, #4*4*24] // t48a
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t39
+ sqadd v26.4s, v18.4s, v19.4s // t63
+ sqsub v27.4s, v18.4s, v19.4s // t56
+ sqsub v28.4s, v21.4s, v20.4s // t40
+ sqadd v29.4s, v21.4s, v20.4s // t47
+ sqadd v30.4s, v23.4s, v22.4s // t48
+ sqsub v31.4s, v23.4s, v22.4s // t55
+
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
+ mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
+ srshr v25.4s, v2.4s, #12 // t56a
+ srshr v27.4s, v7.4s, #12 // t39a
+ neg v6.4s, v6.4s // t40a
+ mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
+ srshr v31.4s, v6.4s, #12 // t40a
+ srshr v28.4s, v2.4s, #12 // t55a
+
+ sqadd v16.4s, v24.4s, v29.4s // t32a
+ sqsub v19.4s, v24.4s, v29.4s // t47a
+ sqadd v17.4s, v27.4s, v31.4s // t39
+ sqsub v18.4s, v27.4s, v31.4s // t40
+ sqsub v20.4s, v26.4s, v30.4s // t48a
+ sqadd v23.4s, v26.4s, v30.4s // t63a
+ sqsub v21.4s, v25.4s, v28.4s // t55
+ sqadd v22.4s, v25.4s, v28.4s // t56
+
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
+ mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
+ srshr v18.4s, v2.4s, #12 // t40a
+ srshr v21.4s, v7.4s, #12 // t55a
+ mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
+ srshr v19.4s, v6.4s, #12 // t47
+ srshr v20.4s, v2.4s, #12 // t48
+
+ str q16, [x6, #4*4*0] // t32a
+ str q17, [x9, #4*4*0] // t39
+ str q18, [x6, #4*4*8] // t40a
+ str q19, [x9, #4*4*8] // t47
+ str q20, [x6, #4*4*16] // t48
+ str q21, [x9, #4*4*16] // t55a
+ str q22, [x6, #4*4*24] // t56
+ str q23, [x9, #4*4*24] // t63a
+
+ add x6, x6, #4*4
+ sub x9, x9, #4*4
+ cmp x6, x9
+ b.lt 1b
+ ret
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+.if \clear
+ ld1 {\i}, [\src]
+ st1 {\zero}, [\src], \strd
+.else
+ ld1 {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ st1 {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ movi \i, #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+.macro movz16dup_if reg, gpr, val, cond
+.if \cond
+ movz \gpr, \val, lsl #16
+ dup \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
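+// First-pass 64-point DCT (only the _clear and _clear_scale variants are
+// instantiated below). Only 32 of the 64 inputs can be nonzero, so the even
+// half reuses the 32-point machinery (16-point even + 32-point odd DCT) while
+// the 16 remaining odd inputs go through inv_dct64_step1/step2 in groups of
+// four; the _clear variants zero the coefficient buffer as they read it.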
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4s_x64_neon
+ mov x14, x30
+ mov x6, sp
+ lsl x8, x8, #2
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ add x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
+ store16 x6
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.8h, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ lsr x8, x8, #1
+ sub x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_4s_x16_neon
+
+ add x10, x6, #16*15
+ sub x6, x6, #16*16
+
+ mov x9, #-16
+
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
+.macro store_addsub r0, r1, r2, r3
+ ld1 {v2.4s}, [x6], #16
+ ld1 {v3.4s}, [x6], #16
+ sqadd v6.4s, v2.4s, \r0
+ sqsub \r0, v2.4s, \r0
+ ld1 {v4.4s}, [x6], #16
+ sqadd v7.4s, v3.4s, \r1
+ sqsub \r1, v3.4s, \r1
+ smin v6.4s, v6.4s, v1.4s
+ smin \r0, \r0, v1.4s
+ ld1 {v5.4s}, [x6], #16
+ sqadd v2.4s, v4.4s, \r2
+ sub x6, x6, #16*4
+ smax v6.4s, v6.4s, v0.4s
+ smax \r0, \r0, v0.4s
+ sqsub \r2, v4.4s, \r2
+ smin v7.4s, v7.4s, v1.4s
+ smin \r1, \r1, v1.4s
+ st1 {v6.4s}, [x6], #16
+ st1 {\r0}, [x10], x9
+ smin v2.4s, v2.4s, v1.4s
+ smin \r2, \r2, v1.4s
+ smax v7.4s, v7.4s, v0.4s
+ smax \r1, \r1, v0.4s
+ sqadd v3.4s, v5.4s, \r3
+ sqsub \r3, v5.4s, \r3
+ smax v2.4s, v2.4s, v0.4s
+ smax \r2, \r2, v0.4s
+ smin v3.4s, v3.4s, v1.4s
+ smin \r3, \r3, v1.4s
+ st1 {v7.4s}, [x6], #16
+ st1 {\r1}, [x10], x9
+ smax v3.4s, v3.4s, v0.4s
+ smax \r3, \r3, v0.4s
+ st1 {v2.4s}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.4s}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.4s, v30.4s, v29.4s, v28.4s
+ store_addsub v27.4s, v26.4s, v25.4s, v24.4s
+ store_addsub v23.4s, v22.4s, v21.4s, v20.4s
+ store_addsub v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+ add x6, x6, #4*4*16
+
+ movrel x17, idct64_coeffs
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.4s}, [x7] // in1 (offset 0)
+ ld1 {v17.4s}, [x9] // in31 (offset 15)
+ ld1 {v18.4s}, [x10] // in17 (offset 8)
+ ld1 {v19.4s}, [x11] // in15 (offset 7)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.4s}, [x10] // in7 (offset 3)
+ ld1 {v17.4s}, [x11] // in25 (offset 12)
+ ld1 {v18.4s}, [x9] // in23 (offset 11)
+ ld1 {v19.4s}, [x7] // in9 (offset 4)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ sub x6, x6, #4*4*32
+ add x9, x6, #4*4*7
+
+ bl inv_dct64_step2_neon
+
+ ret x14
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+
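+// Horizontal 64-point pass: reads back the 64 values produced on the stack
+// (first half forwards, second half backwards), forms out +/- mirrored
+// counterpart, applies the rounding shift passed in w12 and stores the
+// narrowed rows, with the mirrored half reversed.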
+function inv_txfm_horz_dct_64x4_neon
+ mov x14, x30
+
+ mov x7, sp
+ add x8, sp, #4*4*(64 - 4)
+ add x9, x6, #2*56
+ mov x10, #2*64
+ mov x11, #-4*4*4
+
+ dup v7.4s, w12
+1:
+ ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+ ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+ ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+
+.macro store_addsub src0, src1, src2, src3
+ sqsub v1.4s, \src0, \src1
+ sqadd v0.4s, \src0, \src1
+ sqsub v3.4s, \src2, \src3
+ srshl v1.4s, v1.4s, v7.4s
+ sqadd v2.4s, \src2, \src3
+ srshl v3.4s, v3.4s, v7.4s
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v2.4s, v2.4s, v7.4s
+ sqxtn v3.4h, v3.4s
+ sqxtn2 v3.8h, v1.4s
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v2.4s
+ rev64 v3.8h, v3.8h
+ st1 {v0.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.4s, v31.4s, v20.4s, v27.4s
+ store_addsub v17.4s, v30.4s, v21.4s, v26.4s
+ store_addsub v18.4s, v29.4s, v22.4s, v25.4s
+ store_addsub v19.4s, v28.4s, v23.4s, v24.4s
+.purgem store_addsub
+ sub x6, x6, x10, lsl #2
+ sub x9, x9, x10, lsl #2
+ add x6, x6, #16
+ sub x9, x9, #16
+
+ cmp x7, x8
+ b.lt 1b
+ ret x14
+endfunc
+
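+// Vertical 64-point pass: the caller has already run the 8h 64-point DCT into
+// the stack buffer; this adds those results to the destination, writing rows
+// from the top and the bottom of the block at the same time.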
+function inv_txfm_add_vert_dct_8x64_neon
+ mov x14, x30
+ lsl x8, x8, #1
+
+ mov x7, sp
+ add x8, sp, #2*8*(64 - 4)
+ add x9, x6, x1, lsl #6
+ sub x9, x9, x1
+ neg x10, x1
+ mov x11, #-2*8*4
+
+1:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3
+ ld1 {v0.8h}, [x6], x1
+ ld1 {v1.8h}, [x9], x10
+ sqadd v4.8h, \src0, \src1
+ ld1 {v2.8h}, [x6]
+ sqsub \src0, \src0, \src1
+ ld1 {v3.8h}, [x9]
+ sqadd v5.8h, \src2, \src3
+ sqsub \src2, \src2, \src3
+ sub x6, x6, x1
+ sub x9, x9, x10
+ srshr v4.8h, v4.8h, #4
+ srshr v5.8h, v5.8h, #4
+ srshr \src0, \src0, #4
+ usqadd v0.8h, v4.8h
+ srshr \src2, \src2, #4
+ usqadd v1.8h, \src0
+ usqadd v2.8h, v5.8h
+ smin v0.8h, v0.8h, v7.8h
+ usqadd v3.8h, \src2
+ smin v1.8h, v1.8h, v7.8h
+ st1 {v0.8h}, [x6], x1
+ smin v2.8h, v2.8h, v7.8h
+ st1 {v1.8h}, [x9], x10
+ smin v3.8h, v3.8h, v7.8h
+ st1 {v2.8h}, [x6], x1
+ st1 {v3.8h}, [x9], x10
+.endm
+ add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
+ add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
+ add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
+ add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
+.purgem add_dest_addsub
+ cmp x7, x8
+ b.lt 1b
+
+ ret x14
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+ idct_dc 64, 64, 2
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*4*4
+ add x5, sp, #64*4*4
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+ idct_dc 64, 32, 1
+
+ mov x15, x30
+
+ sub_sp 64*32*2+64*4*4
+ add x5, sp, #64*4*4
+
+ movrel x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*64*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ mov x12, #-1 // shift
+ bl inv_txfm_dct_clear_scale_4s_x64_neon
+ add x6, x5, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i*2)
+ add x7, x5, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, x5, #64*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+ idct_dc 32, 64, 1
+
+ mov x15, x30
+
+ sub_sp 32*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*32*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+ ldrh w12, [x13], #2
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8, 16, 24
+ add x7, x5, #(\i*2)
+ mov x8, #32*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #32*32*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+ idct_dc 64, 16, 2
+
+ mov x15, x30
+
+ sub_sp 64*16*2+64*4*4
+ add x4, sp, #64*4*4
+
+ movrel x13, eob_16x32
+
+.irp i, 0, 4, 8, 12
+ add x6, x4, #(\i*64*2)
+.if \i > 0
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ mov x12, #-2 // shift
+ bl inv_txfm_dct_clear_4s_x64_neon
+ add x6, x4, #(\i*64*2)
+ bl inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+ ldrh w12, [x13], #2
+.endif
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #2
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ movrel x5, X(inv_dct_8h_x16_neon)
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+ add x6, x0, #(\i*2)
+ add x7, x4, #(\i*2)
+ mov x8, #64*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, x4, #64*16*2
+ ret x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+ idct_dc 16, 64, 2
+
+ mov x15, x30
+
+ sub_sp 16*32*2+64*8*2
+ add x5, sp, #64*8*2
+
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+
+ adr x4, inv_dct_4s_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, x5, #(\i*16*2)
+.if \i > 0
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+.irp i, 0, 8
+ add x7, x5, #(\i*2)
+ mov x8, #16*2
+ bl X(inv_txfm_dct_8h_x64_neon)
+ add x6, x0, #(\i*2)
+ bl inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+ add sp, x5, #16*32*2
+ ret x15
+endfunc
diff --git a/third_party/dav1d/src/arm/64/loopfilter.S b/third_party/dav1d/src/arm/64/loopfilter.S
new file mode 100644
index 0000000000..63d5de10ad
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/loopfilter.S
@@ -0,0 +1,1129 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
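+//
+// Callers branch on this value via the lpf_16_wd* macros further down:
+// x14 == 0 falls through to the full store, bit 6 and bit 4 (tested with
+// tbnz) jump to the shorter 6-/4-pixel epilogues, and bit 0 returns
+// without storing anything.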
+.macro loop_filter wd
+function lpf_16_wd\wd\()_neon
+ uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
+ uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0)
+ uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0)
+ uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1)
+.if \wd >= 6
+ uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
+ uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
+ uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ umax v4.16b, v4.16b, v5.16b
+.endif
+ uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2
+.if \wd >= 8
+ umax v6.16b, v6.16b, v7.16b
+.endif
+ ushr v3.16b, v3.16b, #1
+.if \wd >= 8
+ umax v4.16b, v4.16b, v6.16b
+.endif
+.if \wd >= 6
+ and v4.16b, v4.16b, v14.16b
+.endif
+ umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0))
+ uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ umax v4.16b, v0.16b, v4.16b
+ cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+ cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ and v1.16b, v1.16b, v2.16b // fm
+ and v1.16b, v1.16b, v13.16b // fm && wd >= 4
+.if \wd >= 6
+ and v14.16b, v14.16b, v1.16b // fm && wd > 4
+.endif
+.if \wd >= 16
+ and v15.16b, v15.16b, v1.16b // fm && wd == 16
+.endif
+
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+ adds x16, x16, x17
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
+.if \wd >= 6
+ movi v10.16b, #1
+ uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
+ uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0)
+ uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0)
+ uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0)
+.if \wd >= 8
+ uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0)
+ uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0)
+.endif
+ umax v2.16b, v2.16b, v3.16b
+ umax v4.16b, v4.16b, v5.16b
+.if \wd >= 8
+ umax v6.16b, v6.16b, v7.16b
+.endif
+ umax v2.16b, v2.16b, v4.16b
+.if \wd >= 8
+ umax v2.16b, v2.16b, v6.16b
+.endif
+
+.if \wd == 16
+ uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0)
+ uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0)
+ uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0)
+.endif
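+ // flat8in is true where all of the differences gathered above are <= 1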
+ cmhs v2.16b, v10.16b, v2.16b // flat8in
+.if \wd == 16
+ uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0)
+ uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0)
+ uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0)
+.endif
+ and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
+ bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ umax v3.16b, v3.16b, v4.16b
+ umax v5.16b, v5.16b, v6.16b
+.endif
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+.if \wd == 16
+ umax v7.16b, v7.16b, v8.16b
+ umax v3.16b, v3.16b, v5.16b
+ umax v3.16b, v3.16b, v7.16b
+ cmhs v3.16b, v10.16b, v3.16b // flat8out
+.endif
+ adds x16, x16, x17
+.if \wd == 16
+ and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
+ and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+ bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ b.eq 1f // skip wd == 4 case
+.endif
+ movi v3.16b, #128
+ eor v2.16b, v22.16b, v3.16b // p1 - 128
+ eor v3.16b, v25.16b, v3.16b // q1 - 128
+ cmhi v0.16b, v0.16b, v12.16b // hev
+ sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1)
+ and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
+ bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
+ usubl v2.8h, v24.8b, v23.8b
+ movi v5.8h, #3
+ usubl2 v3.8h, v24.16b, v23.16b
+ mul v2.8h, v2.8h, v5.8h
+ mul v3.8h, v3.8h, v5.8h
+ movi v6.16b, #4
+ saddw v2.8h, v2.8h, v4.8b
+ saddw2 v3.8h, v3.8h, v4.16b
+ movi v7.16b, #3
+ sqxtn v2.8b, v2.8h // f
+ sqxtn2 v2.16b, v3.8h
+ sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127)
+ sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127)
+ sshr v4.16b, v4.16b, #3 // f1
+ sshr v5.16b, v5.16b, #3 // f2
+ mov v2.16b, v23.16b // p0
+ mov v3.16b, v24.16b // q0
+ neg v6.16b, v4.16b // -f1
+ srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
+ // p0 + f2, q0 - f1
+ usqadd v2.16b, v5.16b // out p0
+ usqadd v3.16b, v6.16b // out q0
+ neg v6.16b, v4.16b // -((f1 + 1) >> 1)
+ bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
+ bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4)
+ mov v2.16b, v22.16b // p1
+ mov v3.16b, v25.16b // q1
+ // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
+ usqadd v2.16b, v4.16b // out p1
+ usqadd v3.16b, v6.16b // out q1
+ bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
+ bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 2f // skip if there's no flat8in
+
+ uaddl v0.8h, v21.8b, v21.8b // p2 * 2
+ uaddl2 v1.8h, v21.16b, v21.16b
+ uaddl v2.8h, v21.8b, v22.8b // p2 + p1
+ uaddl2 v3.8h, v21.16b, v22.16b
+ uaddl v4.8h, v22.8b, v23.8b // p1 + p0
+ uaddl2 v5.8h, v22.16b, v23.16b
+ uaddl v6.8h, v23.8b, v24.8b // p0 + q0
+ uaddl2 v7.8h, v23.16b, v24.16b
+ add v8.8h, v0.8h, v2.8h
+ add v9.8h, v1.8h, v3.8h
+ add v10.8h, v4.8h, v6.8h
+ add v11.8h, v5.8h, v7.8h
+ uaddl v12.8h, v24.8b, v25.8b // q0 + q1
+ uaddl2 v13.8h, v24.16b, v25.16b
+ add v8.8h, v8.8h, v10.8h
+ add v9.8h, v9.8h, v11.8h
+ sub v12.8h, v12.8h, v0.8h
+ sub v13.8h, v13.8h, v1.8h
+ uaddl v10.8h, v25.8b, v26.8b // q1 + q2
+ uaddl2 v11.8h, v25.16b, v26.16b
+ rshrn v0.8b, v8.8h, #3 // out p1
+ rshrn2 v0.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v12.8h
+ add v9.8h, v9.8h, v13.8h
+ sub v10.8h, v10.8h, v2.8h
+ sub v11.8h, v11.8h, v3.8h
+ uaddl v12.8h, v26.8b, v26.8b // q2 + q2
+ uaddl2 v13.8h, v26.16b, v26.16b
+ rshrn v1.8b, v8.8h, #3 // out p0
+ rshrn2 v1.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v10.8h
+ add v9.8h, v9.8h, v11.8h
+ sub v12.8h, v12.8h, v4.8h
+ sub v13.8h, v13.8h, v5.8h
+ rshrn v2.8b, v8.8h, #3 // out q0
+ rshrn2 v2.16b, v9.8h, #3
+
+ bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
+ add v8.8h, v8.8h, v12.8h
+ add v9.8h, v9.8h, v13.8h
+ bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
+ rshrn v3.8b, v8.8h, #3 // out q1
+ rshrn2 v3.16b, v9.8h, #3
+ bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
+ bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+.if \wd == 8
+ b.eq 8f // skip if there's no flat8in
+.else
+ b.eq 2f // skip if there's no flat8in
+.endif
+
+ uaddl v0.8h, v20.8b, v21.8b // p3 + p2
+ uaddl2 v1.8h, v20.16b, v21.16b
+ uaddl v2.8h, v22.8b, v25.8b // p1 + q1
+ uaddl2 v3.8h, v22.16b, v25.16b
+ uaddl v4.8h, v20.8b, v22.8b // p3 + p1
+ uaddl2 v5.8h, v20.16b, v22.16b
+ uaddl v6.8h, v23.8b, v26.8b // p0 + q2
+ uaddl2 v7.8h, v23.16b, v26.16b
+ add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
+ add v9.8h, v1.8h, v1.8h
+ uaddw v8.8h, v8.8h, v23.8b // + p0
+ uaddw2 v9.8h, v9.8h, v23.16b
+ uaddw v8.8h, v8.8h, v24.8b // + q0
+ uaddw2 v9.8h, v9.8h, v24.16b
+ add v8.8h, v8.8h, v4.8h
+ add v9.8h, v9.8h, v5.8h // + p3 + p1
+ sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
+ sub v3.8h, v3.8h, v1.8h
+ sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
+ sub v7.8h, v7.8h, v5.8h
+ rshrn v10.8b, v8.8h, #3 // out p2
+ rshrn2 v10.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h, v9.8h, v3.8h
+ uaddl v0.8h, v20.8b, v23.8b // p3 + p0
+ uaddl2 v1.8h, v20.16b, v23.16b
+ uaddl v2.8h, v24.8b, v27.8b // q0 + q3
+ uaddl2 v3.8h, v24.16b, v27.16b
+ rshrn v11.8b, v8.8h, #3 // out p1
+ rshrn2 v11.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v6.8h
+ add v9.8h, v9.8h, v7.8h
+ sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
+ sub v3.8h, v3.8h, v1.8h
+ uaddl v4.8h, v21.8b, v24.8b // p2 + q0
+ uaddl2 v5.8h, v21.16b, v24.16b
+ uaddl v6.8h, v25.8b, v27.8b // q1 + q3
+ uaddl2 v7.8h, v25.16b, v27.16b
+ rshrn v12.8b, v8.8h, #3 // out p0
+ rshrn2 v12.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h, v9.8h, v3.8h
+ sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
+ sub v7.8h, v7.8h, v5.8h
+ uaddl v0.8h, v22.8b, v25.8b // p1 + q1
+ uaddl2 v1.8h, v22.16b, v25.16b
+ uaddl v2.8h, v26.8b, v27.8b // q2 + q3
+ uaddl2 v3.8h, v26.16b, v27.16b
+ rshrn v13.8b, v8.8h, #3 // out q0
+ rshrn2 v13.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v6.8h
+ add v9.8h, v9.8h, v7.8h
+ sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
+ sub v3.8h, v3.8h, v1.8h
+ rshrn v0.8b, v8.8h, #3 // out q1
+ rshrn2 v0.16b, v9.8h, #3
+
+ add v8.8h, v8.8h, v2.8h
+ add v9.8h , v9.8h, v3.8h
+
+ bit v21.16b, v10.16b, v14.16b
+ bit v22.16b, v11.16b, v14.16b
+ bit v23.16b, v12.16b, v14.16b
+ rshrn v1.8b, v8.8h, #3 // out q2
+ rshrn2 v1.16b, v9.8h, #3
+ bit v24.16b, v13.16b, v14.16b
+ bit v25.16b, v0.16b, v14.16b
+ bit v26.16b, v1.16b, v14.16b
+.endif
+2:
+.if \wd == 16
+ mov x16, v15.d[0]
+ mov x17, v15.d[1]
+ adds x16, x16, x17
+ b.ne 1f // check if flat8out is needed
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ uaddl v2.8h, v17.8b, v17.8b // p6 + p6
+ uaddl2 v3.8h, v17.16b, v17.16b
+ uaddl v4.8h, v17.8b, v18.8b // p6 + p5
+ uaddl2 v5.8h, v17.16b, v18.16b
+ uaddl v6.8h, v17.8b, v19.8b // p6 + p4
+ uaddl2 v7.8h, v17.16b, v19.16b
+ uaddl v8.8h, v17.8b, v20.8b // p6 + p3
+ uaddl2 v9.8h, v17.16b, v20.16b
+ add v12.8h, v2.8h, v4.8h
+ add v13.8h, v3.8h, v5.8h
+ add v10.8h, v6.8h, v8.8h
+ add v11.8h, v7.8h, v9.8h
+ uaddl v6.8h, v17.8b, v21.8b // p6 + p2
+ uaddl2 v7.8h, v17.16b, v21.16b
+ add v12.8h, v12.8h, v10.8h
+ add v13.8h, v13.8h, v11.8h
+ uaddl v8.8h, v17.8b, v22.8b // p6 + p1
+ uaddl2 v9.8h, v17.16b, v22.16b
+ uaddl v10.8h, v18.8b, v23.8b // p5 + p0
+ uaddl2 v11.8h, v18.16b, v23.16b
+ add v6.8h, v6.8h, v8.8h
+ add v7.8h, v7.8h, v9.8h
+ uaddl v8.8h, v19.8b, v24.8b // p4 + q0
+ uaddl2 v9.8h, v19.16b, v24.16b
+ add v12.8h, v12.8h, v6.8h
+ add v13.8h, v13.8h, v7.8h
+ add v10.8h, v10.8h, v8.8h
+ add v11.8h, v11.8h, v9.8h
+ uaddl v6.8h, v20.8b, v25.8b // p3 + q1
+ uaddl2 v7.8h, v20.16b, v25.16b
+ add v12.8h, v12.8h, v10.8h
+ add v13.8h, v13.8h, v11.8h
+ sub v6.8h, v6.8h, v2.8h
+ sub v7.8h, v7.8h, v3.8h
+ uaddl v2.8h, v21.8b, v26.8b // p2 + q2
+ uaddl2 v3.8h, v21.16b, v26.16b
+ rshrn v0.8b, v12.8h, #4 // out p5
+ rshrn2 v0.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
+ add v13.8h, v13.8h, v7.8h
+ sub v2.8h, v2.8h, v4.8h
+ sub v3.8h, v3.8h, v5.8h
+ uaddl v4.8h, v22.8b, v27.8b // p1 + q3
+ uaddl2 v5.8h, v22.16b, v27.16b
+ uaddl v6.8h, v17.8b, v19.8b // p6 + p4
+ uaddl2 v7.8h, v17.16b, v19.16b
+ rshrn v1.8b, v12.8h, #4 // out p4
+ rshrn2 v1.16b, v13.8h, #4
+ add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
+ add v13.8h, v13.8h, v3.8h
+ sub v4.8h, v4.8h, v6.8h
+ sub v5.8h, v5.8h, v7.8h
+ uaddl v6.8h, v23.8b, v28.8b // p0 + q4
+ uaddl2 v7.8h, v23.16b, v28.16b
+ uaddl v8.8h, v17.8b, v20.8b // p6 + p3
+ uaddl2 v9.8h, v17.16b, v20.16b
+ rshrn v2.8b, v12.8h, #4 // out p3
+ rshrn2 v2.16b, v13.8h, #4
+ add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
+ add v13.8h, v13.8h, v5.8h
+ sub v6.8h, v6.8h, v8.8h
+ sub v7.8h, v7.8h, v9.8h
+ uaddl v8.8h, v24.8b, v29.8b // q0 + q5
+ uaddl2 v9.8h, v24.16b, v29.16b
+ uaddl v4.8h, v17.8b, v21.8b // p6 + p2
+ uaddl2 v5.8h, v17.16b, v21.16b
+ rshrn v3.8b, v12.8h, #4 // out p2
+ rshrn2 v3.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
+ add v13.8h, v13.8h, v7.8h
+ sub v8.8h, v8.8h, v4.8h
+ sub v9.8h, v9.8h, v5.8h
+ uaddl v6.8h, v25.8b, v30.8b // q1 + q6
+ uaddl2 v7.8h, v25.16b, v30.16b
+ uaddl v10.8h, v17.8b, v22.8b // p6 + p1
+ uaddl2 v11.8h, v17.16b, v22.16b
+ rshrn v4.8b, v12.8h, #4 // out p1
+ rshrn2 v4.16b, v13.8h, #4
+ add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
+ add v13.8h, v13.8h, v9.8h
+ sub v6.8h, v6.8h, v10.8h
+ sub v7.8h, v7.8h, v11.8h
+ uaddl v8.8h, v26.8b, v30.8b // q2 + q6
+ uaddl2 v9.8h, v26.16b, v30.16b
+ bif v0.16b, v18.16b, v15.16b // out p5
+ uaddl v10.8h, v18.8b, v23.8b // p5 + p0
+ uaddl2 v11.8h, v18.16b, v23.16b
+ rshrn v5.8b, v12.8h, #4 // out p0
+ rshrn2 v5.16b, v13.8h, #4
+ add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
+ add v13.8h, v13.8h, v7.8h
+ sub v8.8h, v8.8h, v10.8h
+ sub v9.8h, v9.8h, v11.8h
+ uaddl v10.8h, v27.8b, v30.8b // q3 + q6
+ uaddl2 v11.8h, v27.16b, v30.16b
+ bif v1.16b, v19.16b, v15.16b // out p4
+ uaddl v18.8h, v19.8b, v24.8b // p4 + q0
+ uaddl2 v19.8h, v19.16b, v24.16b
+ rshrn v6.8b, v12.8h, #4 // out q0
+ rshrn2 v6.16b, v13.8h, #4
+ add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
+ add v13.8h, v13.8h, v9.8h
+ sub v10.8h, v10.8h, v18.8h
+ sub v11.8h, v11.8h, v19.8h
+ uaddl v8.8h, v28.8b, v30.8b // q4 + q6
+ uaddl2 v9.8h, v28.16b, v30.16b
+ bif v2.16b, v20.16b, v15.16b // out p3
+ uaddl v18.8h, v20.8b, v25.8b // p3 + q1
+ uaddl2 v19.8h, v20.16b, v25.16b
+ rshrn v7.8b, v12.8h, #4 // out q1
+ rshrn2 v7.16b, v13.8h, #4
+ add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
+ add v13.8h, v13.8h, v11.8h
+ sub v18.8h, v8.8h, v18.8h
+ sub v19.8h, v9.8h, v19.8h
+ uaddl v10.8h, v29.8b, v30.8b // q5 + q6
+ uaddl2 v11.8h, v29.16b, v30.16b
+ bif v3.16b, v21.16b, v15.16b // out p2
+ uaddl v20.8h, v21.8b, v26.8b // p2 + q2
+ uaddl2 v21.8h, v21.16b, v26.16b
+ rshrn v8.8b, v12.8h, #4 // out q2
+ rshrn2 v8.16b, v13.8h, #4
+ add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
+ add v13.8h, v13.8h, v19.8h
+ sub v10.8h, v10.8h, v20.8h
+ sub v11.8h, v11.8h, v21.8h
+ uaddl v18.8h, v30.8b, v30.8b // q6 + q6
+ uaddl2 v19.8h, v30.16b, v30.16b
+ bif v4.16b, v22.16b, v15.16b // out p1
+ uaddl v20.8h, v22.8b, v27.8b // p1 + q3
+ uaddl2 v21.8h, v22.16b, v27.16b
+ rshrn v9.8b, v12.8h, #4 // out q3
+ rshrn2 v9.16b, v13.8h, #4
+ add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
+ add v13.8h, v13.8h, v11.8h
+ sub v18.8h, v18.8h, v20.8h
+ sub v19.8h, v19.8h, v21.8h
+ bif v5.16b, v23.16b, v15.16b // out p0
+ rshrn v10.8b, v12.8h, #4 // out q4
+ rshrn2 v10.16b, v13.8h, #4
+ add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
+ add v13.8h, v13.8h, v19.8h
+ rshrn v11.8b, v12.8h, #4 // out q5
+ rshrn2 v11.16b, v13.8h, #4
+ bif v6.16b, v24.16b, v15.16b // out q0
+ bif v7.16b, v25.16b, v15.16b // out q1
+ bif v8.16b, v26.16b, v15.16b // out q2
+ bif v9.16b, v27.16b, v15.16b // out q3
+ bif v10.16b, v28.16b, v15.16b // out q4
+ bif v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+ mov x14, #0
+ ret
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ mov x14, #(1 << 6)
+ ret
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ mov x14, #(1 << 4)
+ ret
+.endif
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_16_wd16
+ bl lpf_16_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_16_wd8
+ bl lpf_16_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_16_wd6
+ bl lpf_16_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
+
+.macro lpf_16_wd4
+ bl lpf_16_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
+
+function lpf_v_4_16_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+
+ lpf_16_wd4
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_4_16_neon
+ mov x15, x30
+ sub x16, x0, #2
+ add x0, x16, x1, lsl #3
+ ld1 {v22.s}[0], [x16], x1
+ ld1 {v22.s}[2], [x0], x1
+ ld1 {v23.s}[0], [x16], x1
+ ld1 {v23.s}[2], [x0], x1
+ ld1 {v24.s}[0], [x16], x1
+ ld1 {v24.s}[2], [x0], x1
+ ld1 {v25.s}[0], [x16], x1
+ ld1 {v25.s}[2], [x0], x1
+ ld1 {v22.s}[1], [x16], x1
+ ld1 {v22.s}[3], [x0], x1
+ ld1 {v23.s}[1], [x16], x1
+ ld1 {v23.s}[3], [x0], x1
+ ld1 {v24.s}[1], [x16], x1
+ ld1 {v24.s}[3], [x0], x1
+ ld1 {v25.s}[1], [x16], x1
+ ld1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_16_wd4
+
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+function lpf_v_6_16_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+
+ lpf_16_wd6
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_6_16_neon
+ mov x15, x30
+ sub x16, x0, #4
+ add x0, x16, x1, lsl #3
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.d}[0], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.d}[0], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_16_wd6
+
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+function lpf_v_8_16_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #2
+ ld1 {v20.16b}, [x16], x1 // p3
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v27.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+
+ lpf_16_wd8
+
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ st1 {v21.16b}, [x16], x1 // p2
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v25.16b}, [x0], x1 // q1
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_8_16_neon
+ mov x15, x30
+ sub x16, x0, #4
+ add x0, x16, x1, lsl #3
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v21.d}[1], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v26.d}[0], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v27.d}[0], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_16_wd8
+
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #4
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v20.d}[0], [x16], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.d}[0], [x16], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.d}[0], [x16], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.d}[0], [x16], x1
+ st1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+8:
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+function lpf_v_16_16_neon
+ mov x15, x30
+
+ sub x16, x0, x1, lsl #3
+ add x16, x16, x1
+ ld1 {v17.16b}, [x16], x1 // p6
+ ld1 {v24.16b}, [x0], x1 // q0
+ ld1 {v18.16b}, [x16], x1 // p5
+ ld1 {v25.16b}, [x0], x1 // q1
+ ld1 {v19.16b}, [x16], x1 // p4
+ ld1 {v26.16b}, [x0], x1 // q2
+ ld1 {v20.16b}, [x16], x1 // p3
+ ld1 {v27.16b}, [x0], x1 // q3
+ ld1 {v21.16b}, [x16], x1 // p2
+ ld1 {v28.16b}, [x0], x1 // q4
+ ld1 {v22.16b}, [x16], x1 // p1
+ ld1 {v29.16b}, [x0], x1 // q5
+ ld1 {v23.16b}, [x16], x1 // p0
+ ld1 {v30.16b}, [x0], x1 // q6
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1
+
+ lpf_16_wd16
+
+ sub x16, x0, x1, lsl #2
+ sub x16, x16, x1, lsl #1
+ st1 {v0.16b}, [x16], x1 // p5
+ st1 {v6.16b}, [x0], x1 // q0
+ st1 {v1.16b}, [x16], x1 // p4
+ st1 {v7.16b}, [x0], x1 // q1
+ st1 {v2.16b}, [x16], x1 // p3
+ st1 {v8.16b}, [x0], x1 // q2
+ st1 {v3.16b}, [x16], x1 // p2
+ st1 {v9.16b}, [x0], x1 // q3
+ st1 {v4.16b}, [x16], x1 // p1
+ st1 {v10.16b}, [x0], x1 // q4
+ st1 {v5.16b}, [x16], x1 // p0
+ st1 {v11.16b}, [x0], x1 // q5
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1
+ ret x15
+7:
+ sub x16, x0, x1
+ sub x16, x16, x1, lsl #1
+ st1 {v21.16b}, [x16], x1 // p2
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v25.16b}, [x0], x1 // q1
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v26.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.16b}, [x16], x1 // p1
+ st1 {v24.16b}, [x0], x1 // q0
+ st1 {v23.16b}, [x16], x1 // p0
+ st1 {v25.16b}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_16_16_neon
+ mov x15, x30
+ sub x16, x0, #8
+ ld1 {v16.d}[0], [x16], x1
+ ld1 {v24.d}[0], [x0], x1
+ ld1 {v17.d}[0], [x16], x1
+ ld1 {v25.d}[0], [x0], x1
+ ld1 {v18.d}[0], [x16], x1
+ ld1 {v26.d}[0], [x0], x1
+ ld1 {v19.d}[0], [x16], x1
+ ld1 {v27.d}[0], [x0], x1
+ ld1 {v20.d}[0], [x16], x1
+ ld1 {v28.d}[0], [x0], x1
+ ld1 {v21.d}[0], [x16], x1
+ ld1 {v29.d}[0], [x0], x1
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v30.d}[0], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v31.d}[0], [x0], x1
+ ld1 {v16.d}[1], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v17.d}[1], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ ld1 {v18.d}[1], [x16], x1
+ ld1 {v26.d}[1], [x0], x1
+ ld1 {v19.d}[1], [x16], x1
+ ld1 {v27.d}[1], [x0], x1
+ ld1 {v20.d}[1], [x16], x1
+ ld1 {v28.d}[1], [x0], x1
+ ld1 {v21.d}[1], [x16], x1
+ ld1 {v29.d}[1], [x0], x1
+ ld1 {v22.d}[1], [x16], x1
+ ld1 {v30.d}[1], [x0], x1
+ ld1 {v23.d}[1], [x16], x1
+ ld1 {v31.d}[1], [x0], x1
+
+ transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ lpf_16_wd16
+
+ sub x0, x0, x1, lsl #4
+ sub x16, x0, #8
+
+ transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
+ transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
+
+ st1 {v16.d}[0], [x16], x1
+ st1 {v6.d}[0], [x0], x1
+ st1 {v17.d}[0], [x16], x1
+ st1 {v7.d}[0], [x0], x1
+ st1 {v0.d}[0], [x16], x1
+ st1 {v8.d}[0], [x0], x1
+ st1 {v1.d}[0], [x16], x1
+ st1 {v9.d}[0], [x0], x1
+ st1 {v2.d}[0], [x16], x1
+ st1 {v10.d}[0], [x0], x1
+ st1 {v3.d}[0], [x16], x1
+ st1 {v11.d}[0], [x0], x1
+ st1 {v4.d}[0], [x16], x1
+ st1 {v30.d}[0], [x0], x1
+ st1 {v5.d}[0], [x16], x1
+ st1 {v31.d}[0], [x0], x1
+ st1 {v16.d}[1], [x16], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v17.d}[1], [x16], x1
+ st1 {v7.d}[1], [x0], x1
+ st1 {v0.d}[1], [x16], x1
+ st1 {v8.d}[1], [x0], x1
+ st1 {v1.d}[1], [x16], x1
+ st1 {v9.d}[1], [x0], x1
+ st1 {v2.d}[1], [x16], x1
+ st1 {v10.d}[1], [x0], x1
+ st1 {v3.d}[1], [x16], x1
+ st1 {v11.d}[1], [x0], x1
+ st1 {v4.d}[1], [x16], x1
+ st1 {v30.d}[1], [x0], x1
+ st1 {v5.d}[1], [x16], x1
+ st1 {v31.d}[1], [x0], x1
+ ret x15
+
+7:
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #4
+ transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v20.d}[0], [x16], x1
+ st1 {v20.d}[1], [x0], x1
+ st1 {v21.d}[0], [x16], x1
+ st1 {v21.d}[1], [x0], x1
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ st1 {v26.d}[0], [x16], x1
+ st1 {v26.d}[1], [x0], x1
+ st1 {v27.d}[0], [x16], x1
+ st1 {v27.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+8:
+ sub x16, x0, x1, lsl #4
+ sub x16, x16, #2
+ transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #3
+
+ st1 {v22.s}[0], [x16], x1
+ st1 {v22.s}[2], [x0], x1
+ st1 {v23.s}[0], [x16], x1
+ st1 {v23.s}[2], [x0], x1
+ st1 {v24.s}[0], [x16], x1
+ st1 {v24.s}[2], [x0], x1
+ st1 {v25.s}[0], [x16], x1
+ st1 {v25.s}[2], [x0], x1
+ st1 {v22.s}[1], [x16], x1
+ st1 {v22.s}[3], [x0], x1
+ st1 {v23.s}[1], [x16], x1
+ st1 {v23.s}[3], [x0], x1
+ st1 {v24.s}[1], [x16], x1
+ st1 {v24.s}[3], [x0], x1
+ st1 {v25.s}[1], [x16], x1
+ st1 {v25.s}[3], [x0], x1
+ add x0, x0, #2
+ ret x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w)
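+//
+// Rough outline of one pass of the loop below (an illustrative sketch, not
+// the dav1d C reference): each pass covers four 4-pixel units, one per bit
+// of the low nibble of the vmask words, with the per-unit conditions kept
+// as vector masks.
+//   for each 4-pixel unit with (vmask[0] & bit):
+//       L = l[0][0] ? l[0][0] : l[offset][0]  // offset: row above (v) / left (h)
+//       if (!L) continue
+//       H = L >> 4
+//       I = imax(imin(L >> sharp[0], sharp[1]), 1)
+//       E = 2 * (L + 2) + I
+//       filter width 16/8/4 (y) or 6/4 (uv), picked from vmask[2]/vmask[1]
+//   vmask[0..2] >>= 4; dst advances by 16 pixels (dir v) or 16 rows (dir h)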
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
+ mov x11, x30
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ ldp w6, w7, [x2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr w2, [x2, #8] // vmask[2]
+.endif
+ add x5, x5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr w7, w7, w2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub x4, x3, x4, lsl #2
+.else
+ sub x3, x3, #4
+ lsl x4, x4, #2
+.endif
+ orr w6, w6, w7 // vmask[0] |= vmask[1]
+
+1:
+ tst w6, #0x0f
+.ifc \dir, v
+ ld1 {v0.16b}, [x4], #16
+ ld1 {v1.16b}, [x3], #16
+.else
+ ld2 {v0.s,v1.s}[0], [x3], x4
+ ld2 {v0.s,v1.s}[1], [x3], x4
+ ld2 {v0.s,v1.s}[2], [x3], x4
+ ld2 {v0.s,v1.s}[3], [x3], x4
+.endif
+ b.eq 7f // if (!(vm & bits)) continue;
+
+ ld1r {v5.16b}, [x5] // sharp[0]
+ add x5, x5, #8
+ movi v2.4s, #0xff
+ dup v13.4s, w6 // vmask[0]
+
+ and v0.16b, v0.16b, v2.16b // Keep only lowest byte in each 32 bit word
+ and v1.16b, v1.16b, v2.16b
+ cmtst v3.16b, v1.16b, v2.16b // Check for nonzero values in l[0][0]
+ movi v4.16b, #1
+ ld1r {v6.16b}, [x5] // sharp[1]
+ sub x5, x5, #8
+ bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.4s, v1.4s, v2.4s // L != 0
+ mul v1.4s, v1.4s, v4.4s // L, replicated to all 4 bytes of each 32 bit lane
+.ifc \type, y
+ dup v15.4s, w2 // vmask[2]
+.endif
+ dup v14.4s, w7 // vmask[1]
+ mov x16, v2.d[0]
+ mov x17, v2.d[1]
+ adds x16, x16, x17
+ b.eq 7f // if (!L) continue;
+ neg v5.16b, v5.16b // -sharp[0]
+ movrel x16, word_1248
+ ushr v12.16b, v1.16b, #4 // H
+ ld1 {v16.4s}, [x16]
+ sshl v3.16b, v1.16b, v5.16b // L >> sharp[0]
+.ifc \type, y
+ cmtst v15.4s, v15.4s, v16.4s // if (vmask[2] & bits)
+.endif
+ movi v7.16b, #2
+ umin v3.16b, v3.16b, v6.16b // imin(L >> sharp[0], sharp[1])
+ add v0.16b, v1.16b, v7.16b // L + 2
+ umax v11.16b, v3.16b, v4.16b // imax(imin(), 1) = limit = I
+ add v0.16b, v0.16b, v0.16b // 2*(L + 2)
+ cmtst v14.4s, v14.4s, v16.4s // if (vmask[1] & bits)
+ add v10.16b, v0.16b, v11.16b // 2*(L + 2) + limit = E
+ cmtst v13.4s, v13.4s, v16.4s // if (vmask[0] & bits)
+ and v13.16b, v13.16b, v2.16b // vmask[0] &= L != 0
+
+.ifc \type, y
+ tst w2, #0x0f
+ b.eq 2f
+ // wd16
+ bl lpf_\dir\()_16_16_neon
+ b 8f
+2:
+.endif
+ tst w7, #0x0f
+ b.eq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_16_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_16_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_16_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment x0.
+ // If the whole function is skipped, increment it here instead.
+ add x0, x0, x1, lsl #4
+.else
+7:
+.endif
+8:
+ lsr w6, w6, #4 // vmask[0] >>= 4
+ lsr w7, w7, #4 // vmask[1] >>= 4
+.ifc \type, y
+ lsr w2, w2, #4 // vmask[2] >>= 4
+.endif
+.ifc \dir, v
+ add x0, x0, #16
+.else
+ // For dir h, x0 is returned incremented
+.endif
+ cbnz w6, 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x11
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
+const word_1248
+ .word 1, 2, 4, 8
+endconst
diff --git a/third_party/dav1d/src/arm/64/loopfilter16.S b/third_party/dav1d/src/arm/64/loopfilter16.S
new file mode 100644
index 0000000000..d181a3e623
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/loopfilter16.S
@@ -0,0 +1,925 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
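+//
+// Callers dispatch on this value exactly as in loopfilter.S, via the
+// lpf_8_wd* macros below.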
+.macro loop_filter wd
+function lpf_8_wd\wd\()_neon
+ uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0)
+ uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1)
+.if \wd >= 6
+ uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1)
+ uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1)
+.endif
+.if \wd >= 8
+ uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2)
+ uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2)
+.endif
+.if \wd >= 6
+ umax v4.8h, v4.8h, v5.8h
+.endif
+ uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2
+.if \wd >= 8
+ umax v6.8h, v6.8h, v7.8h
+.endif
+ ushr v3.8h, v3.8h, #1
+.if \wd >= 8
+ umax v4.8h, v4.8h, v6.8h
+.endif
+.if \wd >= 6
+ and v4.16b, v4.16b, v14.16b
+.endif
+ umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
+ uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+ umax v4.8h, v0.8h, v4.8h
+ cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
+.else
+ cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+ cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+ and v1.16b, v1.16b, v2.16b // fm
+ and v1.16b, v1.16b, v13.16b // fm && wd >= 4
+.if \wd >= 6
+ and v14.16b, v14.16b, v1.16b // fm && wd > 4
+.endif
+.if \wd >= 16
+ and v15.16b, v15.16b, v1.16b // fm && wd == 16
+.endif
+
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+ adds x16, x16, x17
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
+.if \wd >= 6
+ movi v10.8h, #1
+ uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
+ uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0)
+ dup v9.8h, w9 // bitdepth_min_8
+.if \wd >= 8
+ uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
+ uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0)
+.endif
+ umax v2.8h, v2.8h, v3.8h
+ umax v4.8h, v4.8h, v5.8h
+.if \wd >= 8
+ umax v6.8h, v6.8h, v7.8h
+.endif
+ umax v2.8h, v2.8h, v4.8h
+ ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8
+.if \wd >= 8
+ umax v2.8h, v2.8h, v6.8h
+.endif
+
+.if \wd == 16
+ uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0)
+ uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0)
+ uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0)
+.endif
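+ // flat8in is true where all of the differences gathered above are <= F = 1 << bitdepth_min_8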
+ cmhs v2.8h, v10.8h, v2.8h // flat8in
+.if \wd == 16
+ uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0)
+ uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0)
+ uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0)
+.endif
+ and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
+ bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
+.if \wd == 16
+ umax v3.8h, v3.8h, v4.8h
+ umax v5.8h, v5.8h, v6.8h
+.endif
+ mov x16, v1.d[0]
+ mov x17, v1.d[1]
+.if \wd == 16
+ umax v7.8h, v7.8h, v8.8h
+ umax v3.8h, v3.8h, v5.8h
+ umax v3.8h, v3.8h, v7.8h
+ cmhs v3.8h, v10.8h, v3.8h // flat8out
+.endif
+ adds x16, x16, x17
+.if \wd == 16
+ and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
+ and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
+ bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
+.endif
+ b.eq 1f // skip wd == 4 case
+.endif
+
+ dup v3.8h, w8 // bitdepth_max
+ sub v2.8h, v22.8h, v25.8h // p1 - q1
+ ushr v3.8h, v3.8h, #1 // (128 << bitdepth_min_8) - 1
+ cmhi v0.8h, v0.8h, v12.8h // hev
+ not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8)
+ smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1)
+ smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1)
+ and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
+ sub v2.8h, v24.8h, v23.8h
+ movi v5.8h, #3
+ bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
+ mul v2.8h, v2.8h, v5.8h
+ movi v6.8h, #4
+ add v2.8h, v2.8h, v4.8h
+ smin v2.8h, v2.8h, v3.8h // f = iclip_diff()
+ smax v2.8h, v2.8h, v9.8h // f = iclip_diff()
+ sqadd v4.8h, v6.8h, v2.8h // f + 4
+ sqadd v5.8h, v5.8h, v2.8h // f + 3
+ smin v4.8h, v4.8h, v3.8h // imin(f + 4, (128 << bitdepth_min_8) - 1)
+ smin v5.8h, v5.8h, v3.8h // imin(f + 3, (128 << bitdepth_min_8) - 1)
+ sshr v4.8h, v4.8h, #3 // f1
+ sshr v5.8h, v5.8h, #3 // f2
+ movi v9.8h, #0
+ dup v3.8h, w8 // bitdepth_max
+ sqadd v2.8h, v23.8h, v5.8h // p0 + f2
+ sqsub v6.8h, v24.8h, v4.8h // q0 - f1
+ srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1
+ smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel()
+ smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel()
+ smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel()
+ smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel()
+ bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
+ bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
+ sqadd v2.8h, v22.8h, v4.8h // p1 + ((f1 + 1) >> 1)
+ sqsub v6.8h, v25.8h, v4.8h // q1 - ((f1 + 1) >> 1)
+ smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel()
+ smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel()
+ smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel()
+ smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel()
+ bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
+ bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 2f // skip if there's no flat8in
+
+ add v0.8h, v21.8h, v21.8h // p2 * 2
+ add v2.8h, v21.8h, v22.8h // p2 + p1
+ add v4.8h, v22.8h, v23.8h // p1 + p0
+ add v6.8h, v23.8h, v24.8h // p0 + q0
+ add v8.8h, v0.8h, v2.8h
+ add v10.8h, v4.8h, v6.8h
+ add v12.8h, v24.8h, v25.8h // q0 + q1
+ add v8.8h, v8.8h, v10.8h
+ sub v12.8h, v12.8h, v0.8h
+ add v10.8h, v25.8h, v26.8h // q1 + q2
+ urshr v0.8h, v8.8h, #3 // out p1
+
+ add v8.8h, v8.8h, v12.8h
+ sub v10.8h, v10.8h, v2.8h
+ add v12.8h, v26.8h, v26.8h // q2 + q2
+ urshr v1.8h, v8.8h, #3 // out p0
+
+ add v8.8h, v8.8h, v10.8h
+ sub v12.8h, v12.8h, v4.8h
+ urshr v2.8h, v8.8h, #3 // out q0
+
+ bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
+ add v8.8h, v8.8h, v12.8h
+ bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
+ urshr v3.8h, v8.8h, #3 // out q1
+ bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
+ bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
+.elseif \wd >= 8
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+.if \wd == 8
+ b.eq 8f // skip if there's no flat8in
+.else
+ b.eq 2f // skip if there's no flat8in
+.endif
+
+ add v0.8h, v20.8h, v21.8h // p3 + p2
+ add v2.8h, v22.8h, v25.8h // p1 + q1
+ add v4.8h, v20.8h, v22.8h // p3 + p1
+ add v6.8h, v23.8h, v26.8h // p0 + q2
+ add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
+ add v9.8h, v23.8h, v24.8h // p0 + q0
+ add v8.8h, v8.8h, v4.8h // + p3 + p1
+ sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
+ add v8.8h, v8.8h, v9.8h // + p0 + q0
+ sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
+ urshr v10.8h, v8.8h, #3 // out p2
+
+ add v8.8h, v8.8h, v2.8h
+ add v0.8h, v20.8h, v23.8h // p3 + p0
+ add v2.8h, v24.8h, v27.8h // q0 + q3
+ urshr v11.8h, v8.8h, #3 // out p1
+
+ add v8.8h, v8.8h, v6.8h
+ sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
+ add v4.8h, v21.8h, v24.8h // p2 + q0
+ add v6.8h, v25.8h, v27.8h // q1 + q3
+ urshr v12.8h, v8.8h, #3 // out p0
+
+ add v8.8h, v8.8h, v2.8h
+ sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
+ add v0.8h, v22.8h, v25.8h // p1 + q1
+ add v2.8h, v26.8h, v27.8h // q2 + q3
+ urshr v13.8h, v8.8h, #3 // out q0
+
+ add v8.8h, v8.8h, v6.8h
+ sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
+ urshr v0.8h, v8.8h, #3 // out q1
+
+ add v8.8h, v8.8h, v2.8h
+
+ bit v21.16b, v10.16b, v14.16b
+ bit v22.16b, v11.16b, v14.16b
+ bit v23.16b, v12.16b, v14.16b
+ urshr v1.8h, v8.8h, #3 // out q2
+ bit v24.16b, v13.16b, v14.16b
+ bit v25.16b, v0.16b, v14.16b
+ bit v26.16b, v1.16b, v14.16b
+.endif
+2:
+.if \wd == 16
+ mov x16, v15.d[0]
+ mov x17, v15.d[1]
+ adds x16, x16, x17
+ b.ne 1f // check if flat8out is needed
+ mov x16, v14.d[0]
+ mov x17, v14.d[1]
+ adds x16, x16, x17
+ b.eq 8f // if there was no flat8in, just write the inner 4 pixels
+ b 7f // if flat8in was used, write the inner 6 pixels
+1:
+
+ add v2.8h, v17.8h, v17.8h // p6 + p6
+ add v4.8h, v17.8h, v18.8h // p6 + p5
+ add v6.8h, v17.8h, v19.8h // p6 + p4
+ add v8.8h, v17.8h, v20.8h // p6 + p3
+ add v12.8h, v2.8h, v4.8h
+ add v10.8h, v6.8h, v8.8h
+ add v6.8h, v17.8h, v21.8h // p6 + p2
+ add v12.8h, v12.8h, v10.8h
+ add v8.8h, v17.8h, v22.8h // p6 + p1
+ add v10.8h, v18.8h, v23.8h // p5 + p0
+ add v6.8h, v6.8h, v8.8h
+ add v8.8h, v19.8h, v24.8h // p4 + q0
+ add v12.8h, v12.8h, v6.8h
+ add v10.8h, v10.8h, v8.8h
+ add v6.8h, v20.8h, v25.8h // p3 + q1
+ add v12.8h, v12.8h, v10.8h
+ sub v6.8h, v6.8h, v2.8h
+ add v2.8h, v21.8h, v26.8h // p2 + q2
+ urshr v0.8h, v12.8h, #4 // out p5
+ add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
+ sub v2.8h, v2.8h, v4.8h
+ add v4.8h, v22.8h, v27.8h // p1 + q3
+ add v6.8h, v17.8h, v19.8h // p6 + p4
+ urshr v1.8h, v12.8h, #4 // out p4
+ add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
+ sub v4.8h, v4.8h, v6.8h
+ add v6.8h, v23.8h, v28.8h // p0 + q4
+ add v8.8h, v17.8h, v20.8h // p6 + p3
+ urshr v2.8h, v12.8h, #4 // out p3
+ add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
+ sub v6.8h, v6.8h, v8.8h
+ add v8.8h, v24.8h, v29.8h // q0 + q5
+ add v4.8h, v17.8h, v21.8h // p6 + p2
+ urshr v3.8h, v12.8h, #4 // out p2
+ add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
+ sub v8.8h, v8.8h, v4.8h
+ add v6.8h, v25.8h, v30.8h // q1 + q6
+ add v10.8h, v17.8h, v22.8h // p6 + p1
+ urshr v4.8h, v12.8h, #4 // out p1
+ add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
+ sub v6.8h, v6.8h, v10.8h
+ add v8.8h, v26.8h, v30.8h // q2 + q6
+ bif v0.16b, v18.16b, v15.16b // out p5
+ add v10.8h, v18.8h, v23.8h // p5 + p0
+ urshr v5.8h, v12.8h, #4 // out p0
+ add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
+ sub v8.8h, v8.8h, v10.8h
+ add v10.8h, v27.8h, v30.8h // q3 + q6
+ bif v1.16b, v19.16b, v15.16b // out p4
+ add v18.8h, v19.8h, v24.8h // p4 + q0
+ urshr v6.8h, v12.8h, #4 // out q0
+ add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
+ sub v10.8h, v10.8h, v18.8h
+ add v8.8h, v28.8h, v30.8h // q4 + q6
+ bif v2.16b, v20.16b, v15.16b // out p3
+ add v18.8h, v20.8h, v25.8h // p3 + q1
+ urshr v7.8h, v12.8h, #4 // out q1
+ add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
+ sub v18.8h, v8.8h, v18.8h
+ add v10.8h, v29.8h, v30.8h // q5 + q6
+ bif v3.16b, v21.16b, v15.16b // out p2
+ add v20.8h, v21.8h, v26.8h // p2 + q2
+ urshr v8.8h, v12.8h, #4 // out q2
+ add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
+ sub v10.8h, v10.8h, v20.8h
+ add v18.8h, v30.8h, v30.8h // q6 + q6
+ bif v4.16b, v22.16b, v15.16b // out p1
+ add v20.8h, v22.8h, v27.8h // p1 + q3
+ urshr v9.8h, v12.8h, #4 // out q3
+ add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
+ sub v18.8h, v18.8h, v20.8h
+ bif v5.16b, v23.16b, v15.16b // out p0
+ urshr v10.8h, v12.8h, #4 // out q4
+ add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
+ urshr v11.8h, v12.8h, #4 // out q5
+ bif v6.16b, v24.16b, v15.16b // out q0
+ bif v7.16b, v25.16b, v15.16b // out q1
+ bif v8.16b, v26.16b, v15.16b // out q2
+ bif v9.16b, v27.16b, v15.16b // out q3
+ bif v10.16b, v28.16b, v15.16b // out q4
+ bif v11.16b, v29.16b, v15.16b // out q5
+.endif
+
+ mov x14, #0
+ ret
+.if \wd == 16
+7:
+ // Return to a shorter epilogue, writing only the inner 6 pixels
+ mov x14, #(1 << 6)
+ ret
+.endif
+.if \wd >= 8
+8:
+ // Return to a shorter epilogue, writing only the inner 4 pixels
+ mov x14, #(1 << 4)
+ ret
+.endif
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
+.macro lpf_8_wd16
+ bl lpf_8_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_8_wd8
+ bl lpf_8_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
+.endm
+
+.macro lpf_8_wd6
+ bl lpf_8_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
+
+.macro lpf_8_wd4
+ bl lpf_8_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
+.endm
+
+function lpf_v_4_8_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+
+ lpf_8_wd4
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_4_8_neon
+ mov x15, x30
+ sub x16, x0, #4
+ add x0, x16, x1, lsl #2
+ ld1 {v22.d}[0], [x16], x1
+ ld1 {v22.d}[1], [x0], x1
+ ld1 {v23.d}[0], [x16], x1
+ ld1 {v23.d}[1], [x0], x1
+ ld1 {v24.d}[0], [x16], x1
+ ld1 {v24.d}[1], [x0], x1
+ ld1 {v25.d}[0], [x16], x1
+ ld1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd4
+
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+function lpf_v_6_8_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+
+ lpf_8_wd6
+
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_6_8_neon
+ mov x15, x30
+ sub x16, x0, #8
+ add x0, x16, x1, lsl #2
+ ld1 {v20.8h}, [x16], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd6
+
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+function lpf_v_8_8_neon
+ mov x15, x30
+ sub x16, x0, x1, lsl #2
+ ld1 {v20.8h}, [x16], x1 // p3
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v27.8h}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+
+ lpf_8_wd8
+
+ sub x16, x0, x1, lsl #1
+ sub x16, x16, x1
+ st1 {v21.8h}, [x16], x1 // p2
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v25.8h}, [x0], x1 // q1
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_8_8_neon
+ mov x15, x30
+ sub x16, x0, #8
+ add x0, x16, x1, lsl #2
+ ld1 {v20.8h}, [x16], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ lpf_8_wd8
+
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #8
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v20.8h}, [x16], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x16], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x16], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x16], x1
+ st1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+ ret x15
+8:
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+function lpf_v_16_8_neon
+ mov x15, x30
+
+ sub x16, x0, x1, lsl #3
+ add x16, x16, x1
+ ld1 {v17.8h}, [x16], x1 // p6
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v18.8h}, [x16], x1 // p5
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v19.8h}, [x16], x1 // p4
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v20.8h}, [x16], x1 // p3
+ ld1 {v27.8h}, [x0], x1 // q3
+ ld1 {v21.8h}, [x16], x1 // p2
+ ld1 {v28.8h}, [x0], x1 // q4
+ ld1 {v22.8h}, [x16], x1 // p1
+ ld1 {v29.8h}, [x0], x1 // q5
+ ld1 {v23.8h}, [x16], x1 // p0
+ ld1 {v30.8h}, [x0], x1 // q6
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1
+
+ lpf_8_wd16
+
+ sub x16, x0, x1, lsl #2
+ sub x16, x16, x1, lsl #1
+ st1 {v0.8h}, [x16], x1 // p5
+ st1 {v6.8h}, [x0], x1 // q0
+ st1 {v1.8h}, [x16], x1 // p4
+ st1 {v7.8h}, [x0], x1 // q1
+ st1 {v2.8h}, [x16], x1 // p3
+ st1 {v8.8h}, [x0], x1 // q2
+ st1 {v3.8h}, [x16], x1 // p2
+ st1 {v9.8h}, [x0], x1 // q3
+ st1 {v4.8h}, [x16], x1 // p1
+ st1 {v10.8h}, [x0], x1 // q4
+ st1 {v5.8h}, [x16], x1 // p0
+ st1 {v11.8h}, [x0], x1 // q5
+ sub x0, x0, x1, lsl #2
+ sub x0, x0, x1, lsl #1
+ ret x15
+7:
+ sub x16, x0, x1
+ sub x16, x16, x1, lsl #1
+ st1 {v21.8h}, [x16], x1 // p2
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v25.8h}, [x0], x1 // q1
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v26.8h}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ ret x15
+
+8:
+ sub x16, x0, x1, lsl #1
+ st1 {v22.8h}, [x16], x1 // p1
+ st1 {v24.8h}, [x0], x1 // q0
+ st1 {v23.8h}, [x16], x1 // p0
+ st1 {v25.8h}, [x0], x1 // q1
+ sub x0, x0, x1, lsl #1
+ ret x15
+endfunc
+
+function lpf_h_16_8_neon
+ mov x15, x30
+ sub x16, x0, #16
+ ld1 {v16.8h}, [x16], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v17.8h}, [x16], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v18.8h}, [x16], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v19.8h}, [x16], x1
+ ld1 {v27.8h}, [x0], x1
+ ld1 {v20.8h}, [x16], x1
+ ld1 {v28.8h}, [x0], x1
+ ld1 {v21.8h}, [x16], x1
+ ld1 {v29.8h}, [x0], x1
+ ld1 {v22.8h}, [x16], x1
+ ld1 {v30.8h}, [x0], x1
+ ld1 {v23.8h}, [x16], x1
+ ld1 {v31.8h}, [x0], x1
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ lpf_8_wd16
+
+ sub x0, x0, x1, lsl #3
+ sub x16, x0, #16
+
+ transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
+ transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
+
+ st1 {v16.8h}, [x16], x1
+ st1 {v6.8h}, [x0], x1
+ st1 {v17.8h}, [x16], x1
+ st1 {v7.8h}, [x0], x1
+ st1 {v0.8h}, [x16], x1
+ st1 {v8.8h}, [x0], x1
+ st1 {v1.8h}, [x16], x1
+ st1 {v9.8h}, [x0], x1
+ st1 {v2.8h}, [x16], x1
+ st1 {v10.8h}, [x0], x1
+ st1 {v3.8h}, [x16], x1
+ st1 {v11.8h}, [x0], x1
+ st1 {v4.8h}, [x16], x1
+ st1 {v30.8h}, [x0], x1
+ st1 {v5.8h}, [x16], x1
+ st1 {v31.8h}, [x0], x1
+ ret x15
+
+7:
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #8
+ transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v20.8h}, [x16], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x16], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x16], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x16], x1
+ st1 {v27.8h}, [x0], x1
+ add x0, x0, #8
+ ret x15
+8:
+ sub x16, x0, x1, lsl #3
+ sub x16, x16, #4
+ transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29
+ add x0, x16, x1, lsl #2
+
+ st1 {v22.d}[0], [x16], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x16], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x16], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x16], x1
+ st1 {v25.d}[1], [x0], x1
+ add x0, x0, #4
+ ret x15
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint32_t *const vmask,
+// const uint8_t (*l)[4], ptrdiff_t b4_stride,
+// const Av1FilterLUT *lut, const int w,
+// const int bitdepth_max)
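+//
+// Same overall structure as the 8 bpc version in loopfilter.S, except that
+// each pass covers two 4-pixel units (8 pixels of 16-bit data, so the vmask
+// words are consumed two bits at a time) and the E, I and H thresholds are
+// scaled by 1 << bitdepth_min_8 before filtering.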
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+ mov x11, x30
+ mov w8, w7 // bitdepth_max
+ clz w9, w8
+ mov w10, #24
+ sub w9, w10, w9 // bitdepth_min_8
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ ldp w6, w7, [x2] // vmask[0], vmask[1]
+.ifc \type, y
+ ldr w2, [x2, #8] // vmask[2]
+.endif
+ add x5, x5, #128 // Move to sharp part of lut
+.ifc \type, y
+ orr w7, w7, w2 // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+ sub x4, x3, x4, lsl #2
+.else
+ sub x3, x3, #4
+ lsl x4, x4, #2
+.endif
+ orr w6, w6, w7 // vmask[0] |= vmask[1]
+
+1:
+ tst w6, #0x03
+.ifc \dir, v
+ ld1 {v0.8b}, [x4], #8
+ ld1 {v1.8b}, [x3], #8
+.else
+ ld2 {v0.s,v1.s}[0], [x3], x4
+ ld2 {v0.s,v1.s}[1], [x3], x4
+.endif
+ b.eq 7f // if (!(vm & bits)) continue;
+
+ ld1r {v5.8b}, [x5] // sharp[0]
+ add x5, x5, #8
+ movi v2.2s, #0xff
+ dup v13.2s, w6 // vmask[0]
+ dup v31.8h, w9 // bitdepth_min_8
+
+ and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word
+ and v1.8b, v1.8b, v2.8b
+ cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0]
+ movi v4.8b, #1
+ ld1r {v6.8b}, [x5] // sharp[1]
+ sub x5, x5, #8
+ bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0]
+ cmtst v2.2s, v1.2s, v2.2s // L != 0
+ mul v1.2s, v1.2s, v4.2s // L, replicated to all 4 bytes of each 32 bit lane
+.ifc \type, y
+ dup v15.2s, w2 // vmask[2]
+.endif
+ dup v14.2s, w7 // vmask[1]
+ mov x16, v2.d[0]
+ cmp x16, #0
+ b.eq 7f // if (!L) continue;
+ neg v5.8b, v5.8b // -sharp[0]
+ movrel x16, word_12
+ ushr v12.8b, v1.8b, #4 // H
+ ld1 {v16.2s}, [x16]
+ sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
+.ifc \type, y
+ cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits)
+.endif
+ movi v7.8b, #2
+ umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1])
+ add v0.8b, v1.8b, v7.8b // L + 2
+ umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I
+ add v0.8b, v0.8b, v0.8b // 2*(L + 2)
+ cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits)
+ uxtl v12.8h, v12.8b
+ add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E
+ cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits)
+ uxtl v11.8h, v11.8b
+ uxtl v10.8h, v10.8b
+ and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0
+ sxtl v14.8h, v14.8b
+ sxtl v13.8h, v13.8b
+.ifc \type, y
+ sxtl v15.8h, v15.8b
+.endif
+ ushl v12.8h, v12.8h, v31.8h
+ ushl v11.8h, v11.8h, v31.8h
+ ushl v10.8h, v10.8h, v31.8h
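+ // v10/v11/v12 now hold the E, I and H thresholds, shifted left by
+ // bitdepth_min_8 to match the 10/12-bit pixel range.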
+
+.ifc \type, y
+ tst w2, #0x03
+ b.eq 2f
+ // wd16
+ bl lpf_\dir\()_16_8_neon
+ b 8f
+2:
+.endif
+ tst w7, #0x03
+ b.eq 3f
+.ifc \type, y
+ // wd8
+ bl lpf_\dir\()_8_8_neon
+.else
+ // wd6
+ bl lpf_\dir\()_6_8_neon
+.endif
+ b 8f
+3:
+ // wd4
+ bl lpf_\dir\()_4_8_neon
+.ifc \dir, h
+ b 8f
+7:
+ // For dir h, the functions above increment x0.
+ // If the whole function is skipped, increment it here instead.
+ add x0, x0, x1, lsl #3
+.else
+7:
+.endif
+8:
+ lsr w6, w6, #2 // vmask[0] >>= 2
+ lsr w7, w7, #2 // vmask[1] >>= 2
+.ifc \type, y
+ lsr w2, w2, #2 // vmask[2] >>= 2
+.endif
+.ifc \dir, v
+ add x0, x0, #16
+.else
+ // For dir h, x0 is returned incremented
+.endif
+ cbnz w6, 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ ret x11
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
+
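+// word_12 holds the pair {1, 2}: the two vmask bits tested per iteration
+// above (presumably one per 4-pixel unit) by the cmtst instructions.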
+const word_12
+ .word 1, 2
+endconst
diff --git a/third_party/dav1d/src/arm/64/looprestoration.S b/third_party/dav1d/src/arm/64/looprestoration.S
new file mode 100644
index 0000000000..a598b72b03
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -0,0 +1,1336 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
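+//
+// Rough structure, as read from the code below: the filter runs as two
+// separable passes. wiener_filter7_h filters one row horizontally (with the
+// 3-pixel left/right edge handling) and stores 16-bit intermediates into one
+// of six temporary rows on the stack (384*2 bytes each, t1..t6).
+// wiener_filter7_v combines six such rows vertically into one output row,
+// and wiener_filter7_hv does both in a single pass for the steady state,
+// rotating the t-row pointers down by one row per call.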
+function wiener_filter7_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17
+ movi v31.8h, #8, lsl #8
+
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_7)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_8bpc_neon
+ bl wiener_filter7_hv_8bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_8bpc_neon
+
+ mov sp, x29
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+L(v3_7):
+ bl wiener_filter7_v_8bpc_neon
+L(v2_7):
+ bl wiener_filter7_v_8bpc_neon
+ b L(v1_7)
+endfunc
+
+
+function wiener_filter7_h_8bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
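+ // The left[] array supplies 4 pixels per row; only its last 3 are used
+ // here, so the ext below keeps bytes 13-15 of v2 (the word loaded into
+ // v2.s[3]) in front of the 16 src bytes.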
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
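+ // (Worked example: for w == 5, pixels 0..w+2 == 0..7 are valid, so all of
+ // v2 is kept; the mask loaded from right_ext_mask - 6 - 2*5 selects the
+ // padding value for every lane of v3 and v4, and that value is read from
+ // x3[5 - 22], i.e. element 7, the last valid pixel.)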
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ shl v22.8h, v18.8h, #7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter7_v_8bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, afterwards.
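+ // (One call thus rotates the t6..t1 row pointers down by one row: the
+ // oldest row t6 is dropped, and both t2 and t1 end up pointing at the
+ // previous t1.)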
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32]
+ stp x0, x4, [sp, #48]
+1:
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ add v24.8h, v24.8h, v20.8h
+ ld1 {v26.8h, v27.8h}, [x14], #32
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v28.8h, v26.8h, v18.8h
+ ld1 {v22.8h, v23.8h}, [x12], #32
+
+ add v16.8h, v26.8h, v16.8h
+ add v25.8h, v25.8h, v21.8h
+
+ smull v2.4s, v22.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v28.4h, v1.h[5]
+ smlal v2.4s, v16.4h, v1.h[6]
+ add v29.8h, v27.8h, v19.8h
+ smull2 v3.4s, v22.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v28.8h, v1.h[5]
+ smlal2 v3.4s, v16.8h, v1.h[6]
+ add v17.8h, v27.8h, v17.8h
+ smull v4.4s, v23.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v29.4h, v1.h[5]
+ smlal v4.4s, v17.4h, v1.h[6]
+ smull2 v5.4s, v23.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v29.8h, v1.h[5]
+ smlal2 v5.4s, v17.8h, v1.h[6]
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1
+ ret
+endfunc
+
+function wiener_filter7_hv_8bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48]
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 3x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ ext v3.16b, v2.16b, v3.16b, #13
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ shl v22.8h, v18.8h, #7
+ mul v6.8h, v18.8h, v0.h[3]
+ mla v6.8h, v19.8h, v0.h[4]
+ mla v6.8h, v20.8h, v0.h[5]
+ mla v6.8h, v21.8h, v0.h[6]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ shl v23.8h, v18.8h, #7
+ mul v7.8h, v18.8h, v0.h[3]
+ mla v7.8h, v19.8h, v0.h[4]
+ mla v7.8h, v20.8h, v0.h[5]
+ mla v7.8h, v21.8h, v0.h[6]
+
+ ld1 {v20.8h, v21.8h}, [x11], #32
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v26.8h, v27.8h}, [x13], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v28.8h, v29.8h}, [x14], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ add v26.8h, v20.8h, v26.8h
+
+ ld1 {v24.8h, v25.8h}, [x12], #32
+ add v28.8h, v18.8h, v28.8h
+
+ add v16.8h, v16.8h, v6.8h
+ add v27.8h, v21.8h, v27.8h
+
+ smull v18.4s, v24.4h, v1.h[3]
+ smlal v18.4s, v26.4h, v1.h[4]
+ smlal v18.4s, v28.4h, v1.h[5]
+ smlal v18.4s, v16.4h, v1.h[6]
+ add v29.8h, v19.8h, v29.8h
+ smull2 v19.4s, v24.8h, v1.h[3]
+ smlal2 v19.4s, v26.8h, v1.h[4]
+ smlal2 v19.4s, v28.8h, v1.h[5]
+ smlal2 v19.4s, v16.8h, v1.h[6]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v25.4h, v1.h[3]
+ smlal v20.4s, v27.4h, v1.h[4]
+ smlal v20.4s, v29.4h, v1.h[5]
+ smlal v20.4s, v17.4h, v1.h[6]
+ smull2 v21.4s, v25.8h, v1.h[3]
+ smlal2 v21.4s, v27.8h, v1.h[4]
+ smlal2 v21.4s, v29.8h, v1.h[5]
+ smlal2 v21.4s, v17.8h, v1.h[6]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16
+
+ st1 {v18.16b}, [x0], #16
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges);
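+//
+// Same two-pass structure as wiener_filter7 above, but with a 5-tap kernel:
+// only four temporary rows are needed (sub_sp 384*2*4), and the main loop
+// sets t0 = t4 so the hv pass overwrites the oldest row in place.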
+function wiener_filter5_8bpc_neon, export=1
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ mov w17, #(1 << 14) - (1 << 2)
+ dup v30.8h, w17
+ movi v31.8h, #8, lsl #8
+
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_8bpc_neon
+ bl wiener_filter5_hv_8bpc_neon
+L(end_5):
+
+ mov sp, x29
+ ldp x29, x30, [sp], #16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_8bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ bl wiener_filter5_v_8bpc_neon
+ add x0, x0, x1
+ mov x11, x12
+ mov x12, x13
+ mov x13, x14
+L(v1_5):
+ bl wiener_filter5_v_8bpc_neon
+ b L(end_5)
+endfunc
+
+
+function wiener_filter5_h_8bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ shl v22.8h, v17.8h, #7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter5_v_8bpc_neon
+ stp x11, x12, [sp, #-48]!
+ stp x13, x14, [sp, #16]
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ ld1 {v16.8h, v17.8h}, [x11], #32
+
+ add v24.8h, v22.8h, v18.8h
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v16.8h, v22.8h, v16.8h
+ add v25.8h, v23.8h, v19.8h
+
+ smull v2.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v24.4h, v1.h[4]
+ smlal v2.4s, v16.4h, v1.h[5]
+ add v17.8h, v23.8h, v17.8h
+ smull2 v3.4s, v20.8h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[4]
+ smlal2 v3.4s, v16.8h, v1.h[5]
+ smull v4.4s, v21.4h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[4]
+ smlal v4.4s, v17.4h, v1.h[5]
+ smull2 v5.4s, v21.8h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[4]
+ smlal2 v5.4s, v17.8h, v1.h[5]
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v3.4h, v4.4s, #11
+ sqrshrun2 v3.8h, v5.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun2 v2.16b, v3.8h
+ subs w4, w4, #16
+ st1 {v2.16b}, [x0], #16
+ b.gt 1b
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
+function wiener_filter5_hv_8bpc_neon
+ // Backing up/restoring registers shifted, so that x11 gets the value
+ // of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x12, x0, [sp, #32]
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ ld1 {v3.16b}, [x3], #16
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v3.16b}, [x3], #16
+ ld1 {v2.s}[3], [x2], #4
+ // Move x3 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+ b 2f
+1:
+ ld1 {v3.16b}, [x3], #16
+ // !LR_HAVE_LEFT, fill v2 with the leftmost byte
+ // and shift v3 to have 2x the first byte at the front.
+ dup v2.16b, v3.b[0]
+ // Move x3 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ ext v3.16b, v2.16b, v3.16b, #14
+
+2:
+ ld1 {v4.8b}, [x3], #8
+ uxtl v2.8h, v3.8b
+ uxtl2 v3.8h, v3.16b
+ uxtl v4.8h, v4.8b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr b28, [x3, w17, sxtw]
+ sub x6, x6, w4, uxtw #1
+ dup v28.8h, v28.h[0]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
+
+ bit v2.16b, v28.16b, v25.16b
+ bit v3.16b, v28.16b, v26.16b
+ bit v4.16b, v28.16b, v27.16b
+
+4: // Loop horizontally
+
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ shl v22.8h, v17.8h, #7
+ mul v6.8h, v17.8h, v0.h[3]
+ mla v6.8h, v18.8h, v0.h[4]
+ mla v6.8h, v19.8h, v0.h[5]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ shl v23.8h, v17.8h, #7
+ mul v7.8h, v17.8h, v0.h[3]
+ mla v7.8h, v18.8h, v0.h[4]
+ mla v7.8h, v19.8h, v0.h[5]
+
+ ld1 {v18.8h, v19.8h}, [x12], #32
+
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ ld1 {v24.8h, v25.8h}, [x14], #32
+ sqadd v6.8h, v6.8h, v22.8h
+ sqadd v7.8h, v7.8h, v23.8h
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ sshr v6.8h, v6.8h, #3
+ sshr v7.8h, v7.8h, #3
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ add v6.8h, v6.8h, v31.8h
+ add v7.8h, v7.8h, v31.8h
+
+ add v24.8h, v24.8h, v18.8h
+ add v16.8h, v16.8h, v6.8h
+
+ smull v18.4s, v20.4h, v1.h[3]
+ smlal v18.4s, v24.4h, v1.h[4]
+ smlal v18.4s, v16.4h, v1.h[5]
+ add v25.8h, v25.8h, v19.8h
+ smull2 v19.4s, v20.8h, v1.h[3]
+ smlal2 v19.4s, v24.8h, v1.h[4]
+ smlal2 v19.4s, v16.8h, v1.h[5]
+ add v17.8h, v17.8h, v7.8h
+ smull v20.4s, v21.4h, v1.h[3]
+ smlal v20.4s, v25.4h, v1.h[4]
+ smlal v20.4s, v17.4h, v1.h[5]
+ smull2 v21.4s, v21.8h, v1.h[3]
+ smlal2 v21.4s, v25.8h, v1.h[4]
+ smlal2 v21.4s, v17.8h, v1.h[5]
+ sqrshrun v18.4h, v18.4s, #11
+ sqrshrun2 v18.8h, v19.4s, #11
+ sqrshrun v19.4h, v20.4s, #11
+ sqrshrun2 v19.8h, v21.4s, #11
+ st1 {v6.8h, v7.8h}, [x15], #32
+ sqxtun v18.8b, v18.8h
+ sqxtun2 v18.16b, v19.8h
+ subs w4, w4, #16
+
+ st1 {v18.16b}, [x0], #16
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ ld1 {v4.16b}, [x3], #16
+ tst w7, #2 // LR_HAVE_RIGHT
+ uxtl v3.8h, v4.8b
+ uxtl2 v4.8h, v4.16b
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
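+//
+// Computes horizontal 3-pixel box sums of the source pixels (written to sum)
+// and of their squares (written to sumsq), handling two input rows per loop
+// iteration and writing the results out at SUM_STRIDE spacing.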
+function sgr_box3_h_8bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add w13, w5, #7
+ bic w13, w13, #7
+ sub x9, x9, w13, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Subtract the number of pixels read from the input from the stride
+ add w13, w13, #8
+ sub x4, x4, w13, uxtw
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ sub x12, x12, #2
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #2
+
+
+1: // Loop vertically
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v4.16b}, [x12], #16
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v1.s}[3], [x2], #4
+ // Move x3/x12 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ sub x12, x12, #2
+ ld1 {v5.s}[3], [x2], #4
+ ext v0.16b, v1.16b, v0.16b, #14
+ ext v4.16b, v5.16b, v4.16b, #14
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 2x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ dup v5.16b, v4.b[0]
+ // Move x3/x12 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ sub x12, x12, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+ ext v4.16b, v5.16b, v4.16b, #14
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 2 + 1)
+ ldr b30, [x3, w13, sxtw]
+ ldr b31, [x12, w13, sxtw]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.16b, v30.b[0]
+ dup v31.16b, v31.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w5, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in v0/4.b[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w5, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+ bit v4.16b, v31.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v18.16b, v4.16b, v4.16b, #1
+ ext v19.16b, v4.16b, v4.16b, #2
+ uaddl v3.8h, v0.8b, v16.8b
+ uaddw v3.8h, v3.8h, v17.8b
+ uaddl v7.8h, v4.8b, v18.8b
+ uaddw v7.8h, v7.8h, v19.8b
+
+ ext v20.16b, v1.16b, v2.16b, #2
+ ext v21.16b, v1.16b, v2.16b, #4
+ ext v22.16b, v5.16b, v6.16b, #2
+ ext v23.16b, v5.16b, v6.16b, #4
+
+ uaddl v26.4s, v1.4h, v20.4h
+ uaddl2 v27.4s, v1.8h, v20.8h
+ uaddw v26.4s, v26.4s, v21.4h
+ uaddw2 v27.4s, v27.4s, v21.8h
+
+ uaddl v28.4s, v5.4h, v22.4h
+ uaddl2 v29.4s, v5.8h, v22.8h
+ uaddw v28.4s, v28.4s, v23.4h
+ uaddw2 v29.4s, v29.4s, v23.8h
+
+ subs w5, w5, #8
+
+ st1 {v3.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+ st1 {v28.4s,v29.4s}, [x10], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ ld1 {v7.8b}, [x12], #8
+ mov v1.16b, v2.16b
+ mov v5.16b, v6.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v7.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+ umull v6.8h, v7.8b, v7.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
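+//
+// Like sgr_box3_h above, but computing 5-pixel horizontal box sums; the two
+// extra taps come from the additional #3/#4 byte extracts below.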
+function sgr_box5_h_8bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add w13, w5, #7
+ bic w13, w13, #7
+ sub x9, x9, w13, uxtw #1
+ add w13, w13, #8
+ sub x4, x4, w13, uxtw
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ sub x12, x12, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #3
+
+1: // Loop vertically
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v4.16b}, [x12], #16
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v1.s}[3], [x2], #4
+ // Move x3/x12 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ sub x12, x12, #3
+ ld1 {v5.s}[3], [x2], #4
+ ext v0.16b, v1.16b, v0.16b, #13
+ ext v4.16b, v5.16b, v4.16b, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ dup v5.16b, v4.b[0]
+ // Move x3/x12 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ sub x12, x12, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ ext v4.16b, v5.16b, v4.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 3 + 1)
+ ldr b30, [x3, w13, sxtw]
+ ldr b31, [x12, w13, sxtw]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.16b, v30.b[0]
+ dup v31.16b, v31.b[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w5, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -1
+ sub x13, x13, w5, uxtw
+ ld1 {v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v29.16b
+ bit v4.16b, v31.16b, v29.16b
+
+ // Update the precalculated squares
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+4: // Loop horizontally
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v18.16b, v0.16b, v0.16b, #3
+ ext v19.16b, v0.16b, v0.16b, #4
+ ext v20.16b, v4.16b, v4.16b, #1
+ ext v21.16b, v4.16b, v4.16b, #2
+ ext v22.16b, v4.16b, v4.16b, #3
+ ext v23.16b, v4.16b, v4.16b, #4
+ uaddl v3.8h, v0.8b, v16.8b
+ uaddl v24.8h, v17.8b, v18.8b
+ uaddl v7.8h, v4.8b, v20.8b
+ uaddw v3.8h, v3.8h, v19.8b
+ uaddl v25.8h, v21.8b, v22.8b
+ uaddw v7.8h, v7.8h, v23.8b
+ add v3.8h, v3.8h, v24.8h
+ add v7.8h, v7.8h, v25.8h
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v18.16b, v1.16b, v2.16b, #6
+ ext v19.16b, v1.16b, v2.16b, #8
+ ext v20.16b, v5.16b, v6.16b, #2
+ ext v21.16b, v5.16b, v6.16b, #4
+ ext v22.16b, v5.16b, v6.16b, #6
+ ext v23.16b, v5.16b, v6.16b, #8
+
+ uaddl v26.4s, v1.4h, v16.4h
+ uaddl2 v27.4s, v1.8h, v16.8h
+ uaddl v16.4s, v17.4h, v18.4h
+ uaddl2 v17.4s, v17.8h, v18.8h
+ uaddl v28.4s, v5.4h, v20.4h
+ uaddl2 v29.4s, v5.8h, v20.8h
+ uaddw v26.4s, v26.4s, v19.4h
+ uaddw2 v27.4s, v27.4s, v19.8h
+ uaddl v20.4s, v21.4h, v22.4h
+ uaddl2 v21.4s, v21.8h, v22.8h
+ uaddw v28.4s, v28.4s, v23.4h
+ uaddw2 v29.4s, v29.4s, v23.8h
+ add v26.4s, v26.4s, v16.4s
+ add v27.4s, v27.4s, v17.4s
+ add v28.4s, v28.4s, v20.4s
+ add v29.4s, v29.4s, v21.4s
+
+ subs w5, w5, #8
+
+ st1 {v3.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+ st1 {v28.4s,v29.4s}, [x10], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ ld1 {v7.8b}, [x12], #8
+ mov v1.16b, v2.16b
+ mov v5.16b, v6.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v7.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+ umull v6.8h, v7.8b, v7.8b
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+endfunc
+
+sgr_funcs 8
diff --git a/third_party/dav1d/src/arm/64/looprestoration16.S b/third_party/dav1d/src/arm/64/looprestoration16.S
new file mode 100644
index 0000000000..8954e604cf
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -0,0 +1,1419 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+const right_ext_mask_buf
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+right_ext_mask:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+endconst
+
+// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
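+//
+// Same two-pass layout as the 8 bpc wiener_filter7, but the shift amounts are
+// derived from clz(bitdepth_max): for 10/12-bit input this yields
+// round_bits_h = 3/5 and round_bits_v = 11/9. The horizontal intermediates
+// are clamped to 0x7fff and biased down by 8192 (1 << 13) before being
+// stored as 16-bit values.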
+function wiener_filter7_16bpc_neon, export=1
+ ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16]
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*6
+
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // x9 - t6
+ // x10 - t5
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_7)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ mov x13, x14 // t2
+ subs w5, w5, #1 // h--
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += stride
+
+L(main_7):
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+L(main_loop_7):
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_7)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v3_7)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter7_hv_16bpc_neon
+ bl wiener_filter7_hv_16bpc_neon
+L(v1_7):
+ bl wiener_filter7_v_16bpc_neon
+
+ mov sp, x29
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_7):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x9, x14 // t6
+ mov x10, x14 // t5
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v2_7)
+ add x3, x3, x1 // src += p_stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter7_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x3, x3, x1 // src += p_stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v3_7)
+ add x15, x15, #384*2*4 // t0 += 384*2*4
+ bl wiener_filter7_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_7)
+L(v3_7):
+ bl wiener_filter7_v_16bpc_neon
+L(v2_7):
+ bl wiener_filter7_v_16bpc_neon
+ b L(v1_7)
+endfunc
+
+
+function wiener_filter7_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v16.4s, v18.4h, v0.h[3]
+ smlal v16.4s, v19.4h, v0.h[2]
+ smlal v16.4s, v20.4h, v0.h[1]
+ smlal v16.4s, v21.4h, v0.h[0]
+ smull2 v17.4s, v18.8h, v0.h[3]
+ smlal2 v17.4s, v19.8h, v0.h[2]
+ smlal2 v17.4s, v20.8h, v0.h[1]
+ smlal2 v17.4s, v21.8h, v0.h[0]
+
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
+function wiener_filter7_v_16bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, afterwards.
+ stp x10, x11, [sp, #-64]!
+ stp x12, x13, [sp, #16]
+ stp x14, x14, [sp, #32]
+ stp x0, x4, [sp, #48]
+1:
+ ld1 {v16.8h, v17.8h}, [x9], #32
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ ld1 {v6.8h, v7.8h}, [x14], #32
+
+ smull v2.4s, v16.4h, v0.h[4]
+ smlal v2.4s, v18.4h, v0.h[5]
+ smlal v2.4s, v20.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[7]
+ smlal v2.4s, v24.4h, v0.h[6]
+ smlal v2.4s, v6.4h, v0.h[5]
+ smlal v2.4s, v6.4h, v0.h[4]
+ smull2 v3.4s, v16.8h, v0.h[4]
+ smlal2 v3.4s, v18.8h, v0.h[5]
+ smlal2 v3.4s, v20.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[7]
+ smlal2 v3.4s, v24.8h, v0.h[6]
+ smlal2 v3.4s, v6.8h, v0.h[5]
+ smlal2 v3.4s, v6.8h, v0.h[4]
+ smull v4.4s, v17.4h, v0.h[4]
+ smlal v4.4s, v19.4h, v0.h[5]
+ smlal v4.4s, v21.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[7]
+ smlal v4.4s, v25.4h, v0.h[6]
+ smlal v4.4s, v7.4h, v0.h[5]
+ smlal v4.4s, v7.4h, v0.h[4]
+ smull2 v5.4s, v17.8h, v0.h[4]
+ smlal2 v5.4s, v19.8h, v0.h[5]
+ smlal2 v5.4s, v21.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[7]
+ smlal2 v5.4s, v25.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ smlal2 v5.4s, v7.8h, v0.h[4]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b
+
+ ldp x0, x4, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #64
+
+ add x0, x0, x1
+ ret
+endfunc
+
+function wiener_filter7_hv_16bpc_neon
+ // Backing up/restoring registers shifted, so that x9 gets the value
+ // of x10, etc, and x15==x9, afterwards.
+ stp x10, x11, [sp, #-80]!
+ stp x12, x13, [sp, #16]
+ stp x14, x15, [sp, #32]
+ stp x10, x0, [sp, #48]
+ stp x3, x4, [sp, #64]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
+ // and shift v3 to have 3x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ ext v3.16b, v2.16b, v3.16b, #10
+ ext v2.16b, v4.16b, v2.16b, #10
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #19
+ b.ge 4f // If w >= 19, all used input pixels are valid
+
+ // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
+ sub w17, w4, #22
+ // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -6
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ ext v17.16b, v2.16b, v3.16b, #4
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v20.16b, v2.16b, v3.16b, #10
+ ext v21.16b, v2.16b, v3.16b, #12
+ ext v18.16b, v2.16b, v3.16b, #6
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v2.8h
+ smull v6.4s, v18.4h, v0.h[3]
+ smlal v6.4s, v19.4h, v0.h[2]
+ smlal v6.4s, v20.4h, v0.h[1]
+ smlal v6.4s, v21.4h, v0.h[0]
+ smull2 v7.4s, v18.8h, v0.h[3]
+ smlal2 v7.4s, v19.8h, v0.h[2]
+ smlal2 v7.4s, v20.8h, v0.h[1]
+ smlal2 v7.4s, v21.8h, v0.h[0]
+
+ ext v17.16b, v3.16b, v4.16b, #4
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v20.16b, v3.16b, v4.16b, #10
+ ext v21.16b, v3.16b, v4.16b, #12
+ ext v18.16b, v3.16b, v4.16b, #6
+
+ add v19.8h, v19.8h, v17.8h
+ add v20.8h, v20.8h, v16.8h
+ add v21.8h, v21.8h, v3.8h
+ smull v24.4s, v18.4h, v0.h[3]
+ smlal v24.4s, v19.4h, v0.h[2]
+ smlal v24.4s, v20.4h, v0.h[1]
+ smlal v24.4s, v21.4h, v0.h[0]
+ smull2 v25.4s, v18.8h, v0.h[3]
+ smlal2 v25.4s, v19.8h, v0.h[2]
+ smlal2 v25.4s, v20.8h, v0.h[1]
+ smlal2 v25.4s, v21.8h, v0.h[0]
+
+ ld1 {v16.8h, v17.8h}, [x9], #32
+
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x10], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x11], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x12], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ ld1 {v24.8h, v25.8h}, [x13], #32
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ ld1 {v8.8h, v9.8h}, [x14], #32
+
+ smull v1.4s, v16.4h, v0.h[4]
+ smlal v1.4s, v18.4h, v0.h[5]
+ smlal v1.4s, v20.4h, v0.h[6]
+ smlal v1.4s, v22.4h, v0.h[7]
+ smlal v1.4s, v24.4h, v0.h[6]
+ smlal v1.4s, v8.4h, v0.h[5]
+ smlal v1.4s, v6.4h, v0.h[4]
+ smull2 v5.4s, v16.8h, v0.h[4]
+ smlal2 v5.4s, v18.8h, v0.h[5]
+ smlal2 v5.4s, v20.8h, v0.h[6]
+ smlal2 v5.4s, v22.8h, v0.h[7]
+ smlal2 v5.4s, v24.8h, v0.h[6]
+ smlal2 v5.4s, v8.8h, v0.h[5]
+ smlal2 v5.4s, v6.8h, v0.h[4]
+ smull v26.4s, v17.4h, v0.h[4]
+ smlal v26.4s, v19.4h, v0.h[5]
+ smlal v26.4s, v21.4h, v0.h[6]
+ smlal v26.4s, v23.4h, v0.h[7]
+ smlal v26.4s, v25.4h, v0.h[6]
+ smlal v26.4s, v9.4h, v0.h[5]
+ smlal v26.4s, v7.4h, v0.h[4]
+ smull2 v16.4s, v17.8h, v0.h[4]
+ smlal2 v16.4s, v19.8h, v0.h[5]
+ smlal2 v16.4s, v21.8h, v0.h[6]
+ smlal2 v16.4s, v23.8h, v0.h[7]
+ smlal2 v16.4s, v25.8h, v0.h[6]
+ smlal2 v16.4s, v9.8h, v0.h[5]
+ smlal2 v16.4s, v7.8h, v0.h[4]
+ srshl v1.4s, v1.4s, v27.4s // -round_bits_v
+ srshl v5.4s, v5.4s, v27.4s
+ srshl v26.4s, v26.4s, v27.4s
+ srshl v16.4s, v16.4s, v27.4s
+ sqxtun v18.4h, v1.4s
+ sqxtun2 v18.8h, v5.4s
+ sqxtun v19.4h, v26.4s
+ sqxtun2 v19.8h, v16.4s
+ st1 {v6.8h, v7.8h}, [x15], #32
+ umin v18.8h, v18.8h, v28.8h // bitdepth_max
+ umin v19.8h, v19.8h, v28.8h
+ subs w4, w4, #16
+
+ st1 {v18.8h, v19.8h}, [x0], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #64]
+ ldp x15, x0, [sp, #48]
+ ldp x13, x14, [sp, #32]
+ ldp x11, x12, [sp, #16]
+ ldp x9, x10, [sp], #80
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
+// const pixel (*left)[4], const pixel *lpf,
+// const int w, int h,
+// const int16_t filter[2][8],
+// const enum LrEdgeFlags edges,
+// const int bitdepth_max);
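+//
+// 5-tap variant of the 16 bpc filter above: four temporary rows
+// (sub_sp 384*2*4), with t0 = t4 in the main loop so the hv pass reuses the
+// oldest row, mirroring the 8 bpc wiener_filter5.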
+function wiener_filter5_16bpc_neon, export=1
+ ldr w8, [sp]
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp, #-32]!
+ stp d8, d9, [sp, #16]
+ mov x29, sp
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
+ sub_sp 384*2*4
+
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
+ movi v30.4s, #1
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
+ neg w10, w10 // bitdepth + 6
+ neg w11, w11 // -round_bits_v
+ dup v2.4s, w10
+ dup v29.4s, w8 // -round_bits_h
+ dup v27.4s, w11 // -round_bits_v
+ movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
+ ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
+
+ zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
+
+ // x11 - t4
+ // x12 - t3
+ // x13 - t2
+ // x14 - t1
+ // x15 - t0
+ mov x14, sp // t1
+ b.eq L(no_top_5)
+
+ mov x16, x2 // backup left
+ mov x2, #0
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1 // lpf += stride
+ mov x11, x14 // t4
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
+ mov x12, x14 // t3
+ add x14, x14, #384*2 // t1 += 384*2
+ mov x2, x16 // left
+ mov x16, x3 // backup lpf
+ mov x3, x0 // lpf = p
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+
+L(main_5):
+ mov x15, x11 // t0 = t4
+L(main_loop_5):
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_loop_5)
+ tst w7, #8 // LR_HAVE_BOTTOM
+ b.eq L(v2_5)
+
+ mov x3, x16 // restore lpf
+ mov x2, #0 // left = NULL
+ bl wiener_filter5_hv_16bpc_neon
+ bl wiener_filter5_hv_16bpc_neon
+L(end_5):
+
+ mov sp, x29
+ ldp d8, d9, [sp, #16]
+ ldp x29, x30, [sp], #32
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+L(no_top_5):
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
+ mov x3, x0 // lpf = p
+
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ mov x11, x14 // t4
+ mov x12, x14 // t3
+ mov x13, x14 // t2
+ b.eq L(v1_5)
+ add x3, x3, x1 // src += stride
+ add x14, x14, #384*2 // t1 += 384*2
+ bl wiener_filter5_h_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x3, x3, x1 // src += stride
+ add x15, x14, #384*2 // t0 = t1 + 384*2
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.eq L(v2_5)
+ add x15, x15, #384*2*3 // t0 += 384*2*3
+ bl wiener_filter5_hv_16bpc_neon
+ subs w5, w5, #1 // h--
+ b.ne L(main_5)
+L(v2_5):
+ bl wiener_filter5_v_16bpc_neon
+ add x0, x0, x1
+ mov x11, x12
+ mov x12, x13
+ mov x13, x14
+L(v1_5):
+ bl wiener_filter5_v_16bpc_neon
+ b L(end_5)
+endfunc
+
+
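+// Horizontally filters one row with the symmetric 5-tap filter (coefficients
+// v0.h[1-3]) and stores it to [x14] as int16, clamped to [0, 0x7fff] and then
+// biased by -8192 so the intermediate fits comfortably in signed 16 bits.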
+function wiener_filter5_h_16bpc_neon
+ stp x3, x4, [sp, #-32]!
+ str x14, [sp, #16]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v16.4s, v17.4h, v0.h[3]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[1]
+ smull2 v17.4s, v17.8h, v0.h[3]
+ smlal2 v17.4s, v18.8h, v0.h[2]
+ smlal2 v17.4s, v19.8h, v0.h[1]
+
+ mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v16.4s, v16.4s, v30.4s
+ add v17.4s, v17.4s, v30.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v16.4s, v16.4s, v29.4s
+ srshl v17.4s, v17.4s, v29.4s
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v16.4s
+ sqxtun2 v7.8h, v17.4s
+ umin v6.8h, v6.8h, v24.8h
+ umin v7.8h, v7.8h, v24.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ subs w4, w4, #16
+
+ st1 {v6.8h, v7.8h}, [x14], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldr x14, [sp, #16]
+ ldp x3, x4, [sp], #32
+ ret
+endfunc
+
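+// Applies the symmetric 5-tap vertical filter (coefficients v0.h[5-7]) to the
+// intermediate rows t4-t1; the t1 row supplies both of the two bottom taps,
+// i.e. the missing t0 row is treated as a duplicate of t1, which is what the
+// bottom edge handling requires.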
+function wiener_filter5_v_16bpc_neon
+ stp x11, x12, [sp, #-48]!
+ stp x13, x14, [sp, #16]
+ stp x0, x4, [sp, #32]
+1:
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ ld1 {v22.8h, v23.8h}, [x14], #32
+
+ smull v2.4s, v16.4h, v0.h[5]
+ smlal v2.4s, v18.4h, v0.h[6]
+ smlal v2.4s, v20.4h, v0.h[7]
+ smlal v2.4s, v22.4h, v0.h[6]
+ smlal v2.4s, v22.4h, v0.h[5]
+ smull2 v3.4s, v16.8h, v0.h[5]
+ smlal2 v3.4s, v18.8h, v0.h[6]
+ smlal2 v3.4s, v20.8h, v0.h[7]
+ smlal2 v3.4s, v22.8h, v0.h[6]
+ smlal2 v3.4s, v22.8h, v0.h[5]
+ smull v4.4s, v17.4h, v0.h[5]
+ smlal v4.4s, v19.4h, v0.h[6]
+ smlal v4.4s, v21.4h, v0.h[7]
+ smlal v4.4s, v23.4h, v0.h[6]
+ smlal v4.4s, v23.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v23.8h, v0.h[5]
+ srshl v2.4s, v2.4s, v27.4s // -round_bits_v
+ srshl v3.4s, v3.4s, v27.4s
+ srshl v4.4s, v4.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v28.8h // bitdepth_max
+ umin v3.8h, v3.8h, v28.8h
+
+ subs w4, w4, #16
+ st1 {v2.8h, v3.8h}, [x0], #32
+ b.gt 1b
+
+ ldp x0, x4, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #48
+
+ ret
+endfunc
+
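+// Combined helper for the main loop: horizontally filters the next input row
+// into t0 (x15) while also producing one vertically filtered output row from
+// t4-t1 plus the freshly filtered data, then rotates the t4-t0 pointers by
+// one row through the shifted stp/ldp pairs below.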
+function wiener_filter5_hv_16bpc_neon
+ // Backing up/restoring registers shifted, so that x11 gets the value
+ // of x12, etc, and x15==x11, afterwards.
+ stp x12, x13, [sp, #-64]!
+ stp x14, x15, [sp, #16]
+ stp x11, x0, [sp, #32]
+ stp x3, x4, [sp, #48]
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 1f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ b 2f
+
+0:
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ ld1 {v4.d}[1], [x2], #8
+ // Move x3 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+ b 2f
+1:
+ ld1 {v2.8h, v3.8h}, [x3], #32
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v3 to have 2x the first pixel at the front.
+ dup v4.8h, v2.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ ext v3.16b, v2.16b, v3.16b, #12
+ ext v2.16b, v4.16b, v2.16b, #12
+
+2:
+ ld1 {v4.8h}, [x3], #16
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w4, #18
+ b.ge 4f // If w >= 18, all used input pixels are valid
+
+ // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
+ // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
+ sub w17, w4, #23
+ // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
+ // buffer pointer.
+ movrel x6, right_ext_mask, -4
+ ldr h26, [x3, w17, sxtw #1]
+ sub x6, x6, w4, uxtw #1
+ dup v26.8h, v26.h[0]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
+
+ bit v2.16b, v26.16b, v23.16b
+ bit v3.16b, v26.16b, v24.16b
+ bit v4.16b, v26.16b, v25.16b
+
+4: // Loop horizontally
+ ext v16.16b, v2.16b, v3.16b, #2
+ ext v18.16b, v2.16b, v3.16b, #6
+ ext v19.16b, v2.16b, v3.16b, #8
+ ext v17.16b, v2.16b, v3.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v2.8h
+ smull v6.4s, v17.4h, v0.h[3]
+ smlal v6.4s, v18.4h, v0.h[2]
+ smlal v6.4s, v19.4h, v0.h[1]
+ smull2 v7.4s, v17.8h, v0.h[3]
+ smlal2 v7.4s, v18.8h, v0.h[2]
+ smlal2 v7.4s, v19.8h, v0.h[1]
+
+ ext v16.16b, v3.16b, v4.16b, #2
+ ext v18.16b, v3.16b, v4.16b, #6
+ ext v19.16b, v3.16b, v4.16b, #8
+ ext v17.16b, v3.16b, v4.16b, #4
+ add v18.8h, v18.8h, v16.8h
+ add v19.8h, v19.8h, v3.8h
+ smull v24.4s, v17.4h, v0.h[3]
+ smlal v24.4s, v18.4h, v0.h[2]
+ smlal v24.4s, v19.4h, v0.h[1]
+ smull2 v25.4s, v17.8h, v0.h[3]
+ smlal2 v25.4s, v18.8h, v0.h[2]
+ smlal2 v25.4s, v19.8h, v0.h[1]
+
+ ld1 {v16.8h, v17.8h}, [x11], #32
+ mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ add v24.4s, v24.4s, v30.4s
+ add v25.4s, v25.4s, v30.4s
+ ld1 {v18.8h, v19.8h}, [x12], #32
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ srshl v24.4s, v24.4s, v29.4s
+ srshl v25.4s, v25.4s, v29.4s
+ ld1 {v20.8h, v21.8h}, [x13], #32
+ sqxtun v6.4h, v6.4s
+ sqxtun2 v6.8h, v7.4s
+ sqxtun v7.4h, v24.4s
+ sqxtun2 v7.8h, v25.4s
+ ld1 {v22.8h, v23.8h}, [x14], #32
+ umin v6.8h, v6.8h, v26.8h
+ umin v7.8h, v7.8h, v26.8h
+ sub v6.8h, v6.8h, v31.8h
+ sub v7.8h, v7.8h, v31.8h
+
+ smull v8.4s, v16.4h, v0.h[5]
+ smlal v8.4s, v18.4h, v0.h[6]
+ smlal v8.4s, v20.4h, v0.h[7]
+ smlal v8.4s, v22.4h, v0.h[6]
+ smlal v8.4s, v6.4h, v0.h[5]
+ smull2 v9.4s, v16.8h, v0.h[5]
+ smlal2 v9.4s, v18.8h, v0.h[6]
+ smlal2 v9.4s, v20.8h, v0.h[7]
+ smlal2 v9.4s, v22.8h, v0.h[6]
+ smlal2 v9.4s, v6.8h, v0.h[5]
+ smull v1.4s, v17.4h, v0.h[5]
+ smlal v1.4s, v19.4h, v0.h[6]
+ smlal v1.4s, v21.4h, v0.h[7]
+ smlal v1.4s, v23.4h, v0.h[6]
+ smlal v1.4s, v7.4h, v0.h[5]
+ smull2 v5.4s, v17.8h, v0.h[5]
+ smlal2 v5.4s, v19.8h, v0.h[6]
+ smlal2 v5.4s, v21.8h, v0.h[7]
+ smlal2 v5.4s, v23.8h, v0.h[6]
+ smlal2 v5.4s, v7.8h, v0.h[5]
+ srshl v8.4s, v8.4s, v27.4s // -round_bits_v
+ srshl v9.4s, v9.4s, v27.4s
+ srshl v1.4s, v1.4s, v27.4s
+ srshl v5.4s, v5.4s, v27.4s
+ sqxtun v8.4h, v8.4s
+ sqxtun2 v8.8h, v9.4s
+ sqxtun v9.4h, v1.4s
+ sqxtun2 v9.8h, v5.4s
+ st1 {v6.8h, v7.8h}, [x15], #32
+ umin v8.8h, v8.8h, v28.8h // bitdepth_max
+ umin v9.8h, v9.8h, v28.8h
+
+ subs w4, w4, #16
+
+ st1 {v8.8h, v9.8h}, [x0], #32
+
+ b.le 0f
+ mov v2.16b, v4.16b
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8h, v4.8h}, [x3], #32
+ b.ne 4b // If we don't need to pad, just keep filtering.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+0:
+ ldp x3, x4, [sp, #48]
+ ldp x15, x0, [sp, #32]
+ ldp x13, x14, [sp, #16]
+ ldp x11, x12, [sp], #64
+
+ add x3, x3, x1
+ add x0, x0, x1
+
+ ret
+endfunc
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
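+// For each position, writes the sum of three horizontally adjacent pixels to
+// *sum and the sum of their squares to *sumsq, processing two input rows per
+// iteration (the second row goes to the pointers one SUM_STRIDE further down).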
+function sgr_box3_h_16bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add w13, w5, #7
+ bic w13, w13, #7
+ sub x9, x9, w13, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Subtract the number of pixels read from the input from the stride
+ add w13, w13, #8
+ sub x4, x4, w13, uxtw #1
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #4
+ sub x12, x12, #4
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #4
+
+
+1: // Loop vertically
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x12], #32
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.d}[1], [x2], #8
+ // Move x3/x12 back to account for the last 2 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #4
+ sub x12, x12, #4
+ ld1 {v18.d}[1], [x2], #8
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ ext v17.16b, v16.16b, v17.16b, #12
+ ext v16.16b, v18.16b, v16.16b, #12
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 2x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ dup v18.8h, v16.h[0]
+ // Move x3 back to account for the last 2 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #4
+ sub x12, x12, #4
+ ext v1.16b, v0.16b, v1.16b, #12
+ ext v0.16b, v2.16b, v0.16b, #12
+ ext v17.16b, v16.16b, v17.16b, #12
+ ext v16.16b, v18.16b, v16.16b, #12
+
+2:
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 2 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ ldr h31, [x12, w13, sxtw #1]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8h, v30.h[0]
+ dup v31.8h, v31.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w5, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+
+ // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called
+ // again; it's not strictly needed in those cases (we pad enough here),
+ // but keeping the code as simple as possible.
+
+ // Insert padding in v0/1.h[w] onwards
+ movrel x13, right_ext_mask
+ sub x13, x13, w5, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+ bit v16.16b, v31.16b, v28.16b
+ bit v17.16b, v31.16b, v29.16b
+
+4: // Loop horizontally
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7.8h, v16.8h, v28.8h
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6.8h, v6.8h, v27.8h
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+ add v7.8h, v7.8h, v29.8h
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+
+ subs w5, w5, #8
+
+ st1 {v6.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+ st1 {v24.4s,v25.4s}, [x10], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ mov v16.16b, v17.16b
+ ld1 {v1.8h}, [x3], #16
+ ld1 {v17.8h}, [x12], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
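+// Same as sgr_box3_h above, but summing five horizontally adjacent pixels
+// (and their squares) per position, so the left edge setup shifts in three
+// padding pixels instead of two.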
+function sgr_box5_h_16bpc_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ add w13, w5, #7
+ bic w13, w13, #7
+ sub x9, x9, w13, uxtw #1
+ add w13, w13, #8
+ sub x4, x4, w13, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #6
+ sub x12, x12, #6
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #6
+
+1: // Loop vertically
+ ld1 {v0.8h, v1.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x12], #32
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v2.d}[1], [x2], #8
+ // Move x3/x12 back to account for the last 3 pixels we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #6
+ sub x12, x12, #6
+ ld1 {v18.d}[1], [x2], #8
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ ext v17.16b, v16.16b, v17.16b, #10
+ ext v16.16b, v18.16b, v16.16b, #10
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
+ // and shift v0/v1 to have 3x the first pixel at the front.
+ dup v2.8h, v0.h[0]
+ dup v18.8h, v16.h[0]
+ // Move x3 back to account for the last 3 pixels we loaded before,
+ // which we shifted out.
+ sub x3, x3, #6
+ sub x12, x12, #6
+ ext v1.16b, v0.16b, v1.16b, #10
+ ext v0.16b, v2.16b, v0.16b, #10
+ ext v17.16b, v16.16b, v17.16b, #10
+ ext v16.16b, v18.16b, v16.16b, #10
+
+2:
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that pixel to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 3 + 1)
+ ldr h30, [x3, w13, sxtw #1]
+ ldr h31, [x12, w13, sxtw #1]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8h, v30.h[0]
+ dup v31.8h, v31.h[0]
+3: // !LR_HAVE_RIGHT
+
+ // Check whether we need to pad the right edge
+ cmp w5, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+
+ // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10,
+ // this ends up called again; it's not strictly needed in those
+ // cases (we pad enough here), but keeping the code as simple as possible.
+
+ // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the
+ // buffer pointer.
+ movrel x13, right_ext_mask, -2
+ sub x13, x13, w5, uxtw #1
+ ld1 {v28.16b, v29.16b}, [x13]
+
+ bit v0.16b, v30.16b, v28.16b
+ bit v1.16b, v30.16b, v29.16b
+ bit v16.16b, v31.16b, v28.16b
+ bit v17.16b, v31.16b, v29.16b
+
+4: // Loop horizontally
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
+
+ add v6.8h, v0.8h, v26.8h
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7.8h, v16.8h, v28.8h
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6.8h, v6.8h, v27.8h
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+ add v7.8h, v7.8h, v29.8h
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+
+ ext v26.16b, v0.16b, v1.16b, #6
+ ext v28.16b, v16.16b, v17.16b, #6
+ ext v27.16b, v0.16b, v1.16b, #8
+ ext v29.16b, v16.16b, v17.16b, #8
+
+ add v6.8h, v6.8h, v26.8h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7.8h, v7.8h, v28.8h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6.8h, v6.8h, v27.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+ add v7.8h, v7.8h, v29.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+
+ subs w5, w5, #8
+
+ st1 {v6.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v22.4s,v23.4s}, [x0], #32
+ st1 {v24.4s,v25.4s}, [x10], #32
+
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ mov v0.16b, v1.16b
+ mov v16.16b, v17.16b
+ ld1 {v1.8h}, [x3], #16
+ ld1 {v17.8h}, [x12], #16
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+endfunc
+
+sgr_funcs 16
diff --git a/third_party/dav1d/src/arm/64/looprestoration_common.S b/third_party/dav1d/src/arm/64/looprestoration_common.S
new file mode 100644
index 0000000000..200eb63189
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration_common.S
@@ -0,0 +1,432 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
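+// Sums each group of three vertically adjacent rows of the sumsq/sum buffers
+// in place, working on 8-column strips; missing rows at the top/bottom edges
+// are handled by duplicating the nearest available row.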
+function sgr_box3_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #2 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride
+ mov x8, #(2*SUM_STRIDE) // sum stride
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE)
+ sub x6, x1, #(2*SUM_STRIDE)
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE)
+ add x6, x1, #(2*SUM_STRIDE)
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 1f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Sum all h+2 lines with the main loop
+ add w11, w11, #2
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v21 and v24-v26 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7
+ ld1 {v24.8h}, [x6], x8
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v18.4s, v19.4s}, [x5], x7
+ ld1 {v25.8h}, [x6], x8
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v25.16b, v24.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v26.16b, v24.16b
+
+3:
+ subs w3, w3, #1
+.macro add3
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v24.8h, v24.8h, v25.8h
+ add v16.4s, v16.4s, v20.4s
+ add v17.4s, v17.4s, v21.4s
+ add v24.8h, v24.8h, v26.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v24.8h}, [x1], x8
+.endm
+ add3
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v25.16b, v26.16b
+ b.le 4f
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b 3b
+
+4:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ add3
+
+5: // End of one vertical slice.
+ subs w2, w2, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0
+ msub x1, x8, x10, x1
+ add x0, x0, #32
+ add x1, x1, #16
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9
+ b 1b
+
+0:
+ ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
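+// Like sgr_box3_v above but summing five vertically adjacent rows, and only
+// producing every second output row, since the 5x5 box pass runs at half
+// vertical resolution.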
+function sgr_box5_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #8 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride
+ mov x8, #(2*SUM_STRIDE) // sum stride
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE)
+ sub x6, x1, #(2*SUM_STRIDE)
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE)
+ add x6, x1, #(2*SUM_STRIDE)
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 0f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Handle h+2 lines with the main loop
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub w3, w3, #1 // Handle h-1 lines with the main loop
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v25 and v26-v30 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v28.8h}, [x6], x8
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v28.16b, v26.16b
+ mov v22.16b, v16.16b
+ mov v23.16b, v17.16b
+ mov v29.16b, v26.16b
+
+3:
+ cbz w3, 4f
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+
+3:
+ // Start of vertical loop
+ subs w3, w3, #2
+.macro add5
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v26.8h, v26.8h, v27.8h
+ add v0.4s, v20.4s, v22.4s
+ add v1.4s, v21.4s, v23.4s
+ add v2.8h, v28.8h, v29.8h
+ add v16.4s, v16.4s, v24.4s
+ add v17.4s, v17.4s, v25.4s
+ add v26.8h, v26.8h, v30.8h
+ add v16.4s, v16.4s, v0.4s
+ add v17.4s, v17.4s, v1.4s
+ add v26.8h, v26.8h, v2.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v26.8h}, [x1], x8
+.endm
+ add5
+.macro shift2
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v26.16b, v28.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v27.16b, v29.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v28.16b, v30.16b
+.endm
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ b.le 5f
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ add5
+ b 6f
+
+5:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 6f
+ // !LR_HAVE_BOTTOM
+ cbnz w3, 5f
+ // The intended three edge rows left; output the one at h-2 and
+ // the past edge one at h.
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ // Pad the past-edge row from the last content row.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // w3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ mov v22.16b, v20.16b
+ mov v23.16b, v21.16b
+ mov v29.16b, v28.16b
+ mov v24.16b, v20.16b
+ mov v25.16b, v21.16b
+ mov v30.16b, v28.16b
+ add5
+ add x0, x0, x7
+ add x1, x1, x8
+ b 6f
+
+6: // End of one vertical slice.
+ subs w2, w2, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0
+ msub x1, x8, x10, x1
+ add x0, x0, #32
+ add x1, x1, #16
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9
+ b 1b
+
+0:
+ ret
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength,
+// const int bitdepth_max);
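+//
+// Shared tail below (sgr_calc_ab_neon): after scaling the sums back down to
+// 8-bit precision, computes p = max(a*n - b*b, 0) and
+// z = min((p * strength) >> 20, 255), looks up x = dav1d_sgr_x_by_x[z], then
+// stores x * b * one_by_x, rounded right by 12 bits, back over a and 256 - x
+// over b. sgr_calc_ab1 uses the 3x3 box (n = 9), sgr_calc_ab2 the 5x5 box
+// (n = 25) on every second row.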
+function sgr_calc_ab1_neon, export=1
+ clz w9, w5
+ add x3, x3, #2 // h += 2
+ movi v31.4s, #9 // n
+ mov x5, #455 // one_by_x (~(1 << 12) / 9)
+ mov x8, #SUM_STRIDE
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ clz w9, w5
+ add x3, x3, #3 // h += 3
+ asr x3, x3, #1 // h /= 2
+ movi v31.4s, #25 // n
+ mov x5, #164 // one_by_x (~(1 << 12) / 25)
+ mov x8, #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+ sub w9, w9, #24 // -bitdepth_min_8
+ movrel x12, X(sgr_x_by_x)
+ ld1 {v16.16b, v17.16b, v18.16b}, [x12]
+ dup v6.8h, w9 // -bitdepth_min_8
+ movi v19.16b, #5
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ movi v24.8b, #254 // idx of last 1
+ saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8
+ add x2, x2, #2 // w += 2
+ add x7, x2, #7
+ bic x7, x7, #7 // aligned w
+ sub x7, x8, x7 // increment between rows
+ movi v29.8h, #1, lsl #8
+ dup v28.4s, w4
+ dup v30.4s, w5 // one_by_x
+ sub x0, x0, #(4*(SUM_STRIDE))
+ sub x1, x1, #(2*(SUM_STRIDE))
+ mov x6, x2 // backup of w
+ sub v16.16b, v16.16b, v19.16b
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+1:
+ subs x2, x2, #8
+ ld1 {v0.4s, v1.4s}, [x0] // a
+ ld1 {v2.8h}, [x1] // b
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v1.4s, v1.4s, v7.4s
+ srshl v4.8h, v2.8h, v6.8h
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v4.4h, v4.4h // b * b
+ umull2 v4.4s, v4.8h, v4.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ uqshrn v0.4h, v0.4s, #16
+ uqshrn2 v0.8h, v1.4s, #16
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ add v25.8b, v25.8b, v26.8b
+ cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v4.8b
+ add v5.8b, v5.8b, v19.8b
+ add v25.8b, v25.8b, v27.8b
+ add v1.8b, v1.8b, v5.8b
+ add v1.8b, v1.8b, v25.8b
+ uxtl v1.8h, v1.8b // x
+
+ umull v3.4s, v1.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v2.8h, v29.8h, v1.8h // 256 - x
+
+ st1 {v3.4s, v4.4s}, [x0], #32
+ st1 {v2.8h}, [x1], #16
+ b.gt 1b
+
+ subs x3, x3, #1
+ b.le 0f
+ add x0, x0, x7, lsl #2
+ add x1, x1, x7, lsl #1
+ mov x2, x6
+ b 1b
+0:
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/looprestoration_tmpl.S b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
new file mode 100644
index 0000000000..7cdfd6f3f7
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/looprestoration_tmpl.S
@@ -0,0 +1,597 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
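+// Combines the 3x3-box a/b coefficients with the source: both the int16 and
+// the int32 neighbourhood are weighted 4 for the centre cross and 3 for the
+// diagonals (total weight 4*5 + 3*4 = 32), and each output is the weighted
+// 32-bit sum plus the weighted 16-bit sum times the source pixel, rounded
+// right by 9 bits into the int16 tmp buffer.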
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+ sub x7, x3, #(4*SUM_STRIDE)
+ add x8, x3, #(4*SUM_STRIDE)
+ sub x9, x4, #(2*SUM_STRIDE)
+ add x10, x4, #(2*SUM_STRIDE)
+ mov x11, #SUM_STRIDE
+ mov x12, #FILTER_OUT_STRIDE
+ add x13, x5, #7
+ bic x13, x13, #7 // Aligned width
+.if \bpc == 8
+ sub x2, x2, x13
+.else
+ sub x2, x2, x13, lsl #1
+.endif
+ sub x12, x12, x13
+ sub x11, x11, x13
+ sub x11, x11, #4 // We read 4 extra elements from a
+ sub x14, x11, #4 // We read 8 extra elements from b
+ mov x13, x5
+ movi v6.8h, #3
+ movi v7.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x9], #32
+ ld1 {v2.8h, v3.8h}, [x4], #32
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
+
+2:
+ subs x5, x5, #8
+ ext v25.16b, v0.16b, v1.16b, #2 // -stride
+ ext v26.16b, v2.16b, v3.16b, #2 // 0
+ ext v27.16b, v4.16b, v5.16b, #2 // +stride
+ ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v29.16b, v2.16b, v3.16b, #4 // +1
+ ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
+ add v2.8h, v2.8h, v25.8h // -1, -stride
+ add v26.8h, v26.8h, v27.8h // 0, +stride
+ add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
+ add v2.8h, v2.8h, v26.8h
+ add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
+ add v2.8h, v2.8h, v29.8h // +1
+ add v0.8h, v0.8h, v4.8h
+
+ ext v25.16b, v16.16b, v17.16b, #4 // -stride
+ ext v26.16b, v17.16b, v18.16b, #4
+ shl v2.8h, v2.8h, #2
+ ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v28.16b, v17.16b, v18.16b, #8
+ ext v29.16b, v19.16b, v20.16b, #4 // 0
+ ext v30.16b, v20.16b, v21.16b, #4
+ mla v2.8h, v0.8h, v6.8h // * 3 -> a
+ add v25.4s, v25.4s, v19.4s // -stride, -1
+ add v26.4s, v26.4s, v20.4s
+ add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v28.4s
+ ext v27.16b, v19.16b, v20.16b, #8 // +1
+ ext v28.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // -1+stride
+ add v17.4s, v17.4s, v23.4s
+ add v29.4s, v29.4s, v27.4s // 0, +1
+ add v30.4s, v30.4s, v28.4s
+ add v25.4s, v25.4s, v29.4s
+ add v26.4s, v26.4s, v30.4s
+ ext v27.16b, v22.16b, v23.16b, #4 // +stride
+ ext v28.16b, v23.16b, v24.16b, #4
+ ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
+ ext v30.16b, v23.16b, v24.16b, #8
+.if \bpc == 8
+ ld1 {v19.8b}, [x1], #8 // src
+.else
+ ld1 {v19.8h}, [x1], #16 // src
+.endif
+ add v25.4s, v25.4s, v27.4s // +stride
+ add v26.4s, v26.4s, v28.4s
+ add v16.4s, v16.4s, v29.4s // +1+stride
+ add v17.4s, v17.4s, v30.4s
+ shl v25.4s, v25.4s, #2
+ shl v26.4s, v26.4s, #2
+ mla v25.4s, v16.4s, v7.4s // * 3 -> b
+ mla v26.4s, v17.4s, v7.4s
+.if \bpc == 8
+ uxtl v19.8h, v19.8b // src
+.endif
+ mov v0.16b, v1.16b
+ umlal v25.4s, v2.4h, v19.4h // b + a * src
+ umlal2 v26.4s, v2.8h, v19.8h
+ mov v2.16b, v3.16b
+ rshrn v25.4h, v25.4s, #9
+ rshrn2 v25.8h, v26.4s, #9
+ mov v4.16b, v5.16b
+ st1 {v25.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ ld1 {v1.8h}, [x9], #16
+ ld1 {v3.8h}, [x4], #16
+ ld1 {v5.8h}, [x10], #16
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x3], #32
+ ld1 {v23.4s, v24.4s}, [x8], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x13
+ add x0, x0, x12, lsl #1
+ add x1, x1, x2
+ add x3, x3, x11, lsl #2
+ add x7, x7, x11, lsl #2
+ add x8, x8, x11, lsl #2
+ add x4, x4, x14, lsl #1
+ add x9, x9, x14, lsl #1
+ add x10, x10, x14, lsl #1
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
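+// 5x5-box variant: rows with a/b data available weight the four diagonal
+// neighbours by 5 and the two vertical neighbours by 6 (4*5 + 2*6 = 32,
+// shifted right by 9), while the in-between rows use a single a/b row with
+// weights 6 for the centre column and 5 for the columns to either side
+// (2*5 + 6 = 16, shifted right by 8).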
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+ add x7, x3, #(4*(SUM_STRIDE))
+ sub x3, x3, #(4*(SUM_STRIDE))
+ add x8, x4, #(2*(SUM_STRIDE))
+ sub x4, x4, #(2*(SUM_STRIDE))
+ mov x9, #(2*SUM_STRIDE)
+ mov x10, #FILTER_OUT_STRIDE
+ add x11, x5, #7
+ bic x11, x11, #7 // Aligned width
+.if \bpc == 8
+ sub x2, x2, x11
+.else
+ sub x2, x2, x11, lsl #1
+.endif
+ sub x10, x10, x11
+ sub x9, x9, x11
+ sub x9, x9, #4 // We read 4 extra elements from a
+ sub x12, x9, #4 // We read 8 extra elements from b
+ mov x11, x5
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ subs x5, x5, #8
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+.if \bpc == 8
+ ld1 {v31.8b}, [x1], #8
+.else
+ ld1 {v31.8h}, [x1], #16
+.endif
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s
+ add v17.4s, v17.4s, v20.4s
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+.endif
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9
+ rshrn2 v16.8h, v17.4s, #9
+ mov v2.16b, v3.16b
+ st1 {v16.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v3.8h}, [x8], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ add x3, x3, x9, lsl #2
+ add x7, x7, x9, lsl #2
+ add x4, x4, x12, lsl #1
+ add x8, x8, x12, lsl #1
+ mov x13, x3
+ mov x14, x4
+
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+
+4:
+ subs x5, x5, #8
+ ext v23.16b, v0.16b, v1.16b, #4 // +1
+ ext v22.16b, v0.16b, v1.16b, #2 // 0
+ add v0.8h, v0.8h, v23.8h // -1, +1
+
+ ext v24.16b, v16.16b, v17.16b, #4 // 0
+ ext v25.16b, v17.16b, v18.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1
+ ext v27.16b, v17.16b, v18.16b, #8
+ mul v2.8h, v22.8h, v6.8h // * 6
+ mla v2.8h, v0.8h, v4.8h // * 5 -> a
+.if \bpc == 8
+ ld1 {v31.8b}, [x1], #8
+.else
+ ld1 {v31.8h}, [x1], #16
+.endif
+ add v16.4s, v16.4s, v26.4s // -1, +1
+ add v17.4s, v17.4s, v27.4s
+.if \bpc == 8
+ uxtl v31.8h, v31.8b
+.endif
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v24.4s, v24.4s, v7.4s // * 6
+ mla v24.4s, v16.4s, v5.4s // * 5 -> b
+ mul v25.4s, v25.4s, v7.4s // * 6
+ mla v25.4s, v17.4s, v5.4s // * 5 -> b
+
+ umlal v24.4s, v2.4h, v31.4h // b + a * src
+ umlal2 v25.4s, v2.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v24.4h, v24.4s, #8
+ rshrn2 v24.8h, v25.4s, #8
+ mov v16.16b, v18.16b
+ st1 {v24.8h}, [x0], #16
+
+ b.le 5f
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ b 4b
+
+5:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ mov x3, x13 // Rewind x3/x4 to where they started
+ mov x4, x14
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt, const int bitdepth_max);
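+// Blends the filtered plane back into the source: with u = src << 4, each
+// output is ((u << 7) + wt * (t1 - u) + (1 << 10)) >> 11, clipped to
+// bitdepth_max in the 16 bpc case.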
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+.if \bpc == 16
+ ldr w8, [sp]
+.endif
+ dup v31.8h, w7
+ cmp x6, #2
+.if \bpc == 16
+ dup v30.8h, w8
+.endif
+ add x9, x0, x1
+ add x10, x2, x3
+ add x11, x4, #2*FILTER_OUT_STRIDE
+ mov x7, #(4*FILTER_OUT_STRIDE)
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x8, x5, #7
+ bic x8, x8, #7 // Aligned width
+.if \bpc == 8
+ sub x1, x1, x8
+ sub x3, x3, x8
+.else
+ sub x1, x1, x8, lsl #1
+ sub x3, x3, x8, lsl #1
+.endif
+ sub x7, x7, x8, lsl #1
+ mov x8, x5
+ b.lt 2f
+1:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v4.8b}, [x10], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+ ld1 {v4.8h}, [x10], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v5.8h}, [x11], #16
+ subs x5, x5, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v4.8h, v4.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+ shl v4.8h, v4.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v5.8h, v5.8h, v4.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ ushll v6.4s, v4.4h, #7 // u << 7
+ ushll2 v7.4s, v4.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ smlal v6.4s, v5.4h, v31.4h // v
+ smlal2 v7.4s, v5.8h, v31.8h // v
+.if \bpc == 8
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ rshrn v6.4h, v6.4s, #11
+ rshrn2 v6.8h, v7.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v2.8b}, [x0], #8
+ st1 {v6.8b}, [x9], #8
+.else
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ sqrshrun v6.4h, v6.4s, #11
+ sqrshrun2 v6.8h, v7.4s, #11
+ umin v2.8h, v2.8h, v30.8h
+ umin v6.8h, v6.8h, v30.8h
+ st1 {v2.8h}, [x0], #16
+ st1 {v6.8h}, [x9], #16
+.endif
+ b.gt 1b
+
+ sub x6, x6, #2
+ cmp x6, #1
+ b.lt 0f
+ mov x5, x8
+ add x0, x0, x1
+ add x9, x9, x1
+ add x2, x2, x3
+ add x10, x10, x3
+ add x4, x4, x7
+ add x11, x11, x7
+ b.eq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ subs x5, x5, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+.if \bpc == 8
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ sqxtun v2.8b, v2.8h
+ st1 {v2.8b}, [x0], #8
+.else
+ sqrshrun v2.4h, v2.4s, #11
+ sqrshrun2 v2.8h, v3.4s, #11
+ umin v2.8h, v2.8h, v30.8h
+ st1 {v2.8h}, [x0], #16
+.endif
+ b.gt 2b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2], const int bitdepth_max);
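+// Same as sgr_weighted1 but mixing two filtered planes:
+// ((u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u) + (1 << 10)) >> 11.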
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+.if \bpc == 8
+ ldr x8, [sp]
+.else
+ ldp x8, x9, [sp]
+.endif
+ cmp x7, #2
+ add x10, x0, x1
+ add x11, x2, x3
+ add x12, x4, #2*FILTER_OUT_STRIDE
+ add x13, x5, #2*FILTER_OUT_STRIDE
+ ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+.if \bpc == 16
+ dup v29.8h, w9
+.endif
+ mov x8, #4*FILTER_OUT_STRIDE
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x9, x6, #7
+ bic x9, x9, #7 // Aligned width
+.if \bpc == 8
+ sub x1, x1, x9
+ sub x3, x3, x9
+.else
+ sub x1, x1, x9, lsl #1
+ sub x3, x3, x9, lsl #1
+.endif
+ sub x8, x8, x9, lsl #1
+ mov x9, x6
+ b.lt 2f
+1:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v16.8b}, [x11], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+ ld1 {v16.8h}, [x11], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.8h}, [x12], #16
+ ld1 {v2.8h}, [x5], #16
+ ld1 {v18.8h}, [x13], #16
+ subs x6, x6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v16.8h, v16.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+ shl v16.8h, v16.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ sub v17.8h, v17.8h, v16.8h // t1 - u
+ sub v18.8h, v18.8h, v16.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ ushll v19.4s, v16.4h, #7 // u << 7
+ ushll2 v20.4s, v16.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ rshrn v19.4h, v19.4s, #11
+ rshrn2 v19.8h, v20.4s, #11
+ sqxtun v3.8b, v3.8h
+ sqxtun v19.8b, v19.8h
+ st1 {v3.8b}, [x0], #8
+ st1 {v19.8b}, [x10], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ sqrshrun v19.4h, v19.4s, #11
+ sqrshrun2 v19.8h, v20.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ umin v19.8h, v19.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+ st1 {v19.8h}, [x10], #16
+.endif
+ b.gt 1b
+
+ subs x7, x7, #2
+ cmp x7, #1
+ b.lt 0f
+ mov x6, x9
+ add x0, x0, x1
+ add x10, x10, x1
+ add x2, x2, x3
+ add x11, x11, x3
+ add x4, x4, x8
+ add x12, x12, x8
+ add x5, x5, x8
+ add x13, x13, x8
+ b.eq 2f
+ b 1b
+
+2:
+.if \bpc == 8
+ ld1 {v0.8b}, [x2], #8
+.else
+ ld1 {v0.8h}, [x2], #16
+.endif
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v2.8h}, [x5], #16
+ subs x6, x6, #8
+.if \bpc == 8
+ ushll v0.8h, v0.8b, #4 // u
+.else
+ shl v0.8h, v0.8h, #4 // u
+.endif
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+.if \bpc == 8
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ sqxtun v3.8b, v3.8h
+ st1 {v3.8b}, [x0], #8
+.else
+ sqrshrun v3.4h, v3.4s, #11
+ sqrshrun2 v3.8h, v4.4s, #11
+ umin v3.8h, v3.8h, v29.8h
+ st1 {v3.8h}, [x0], #16
+.endif
+ b.gt 2b
+0:
+ ret
+endfunc
+.endm
diff --git a/third_party/dav1d/src/arm/64/mc.S b/third_party/dav1d/src/arm/64/mc.S
new file mode 100644
index 0000000000..9f7b4e7a89
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -0,0 +1,3310 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
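+// The three bidirectional compositions below combine two int16 intermediates
+// read from x2/x3: avg computes (tmp1 + tmp2 + 16) >> 5, w_avg roughly
+// (w * tmp1 + (16 - w) * tmp2) >> 8 with the weight w passed in w6, and mask
+// the same blend with per-pixel weights (0-64) read from x6, roughly
+// (m * tmp1 + (64 - m) * tmp2) >> 10; all results saturate to 8-bit pixels.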
+.macro avg dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ add \t0\().8h, \t0\().8h, \t2\().8h
+ add \t1\().8h, \t1\().8h, \t3\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #5
+ sqrshrun2 \dst\().16b, \t1\().8h, #5
+.endm
+
+.macro w_avg dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sub \t0\().8h, \t2\().8h, \t0\().8h
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v30.8h
+ sqdmulh \t1\().8h, \t1\().8h, v30.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
+
+.macro mask dst, t0, t1, t2, t3
+ ld1 {v30.16b}, [x6], 16
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ mul v30.16b, v30.16b, v31.16b
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ shll v28.8h, v30.8b, #8
+ shll2 v29.8h, v30.16b, #8
+ sub \t0\().8h, \t2\().8h, \t0\().8h
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v28.8h
+ sqdmulh \t1\().8h, \t1\().8h, v29.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
+
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+ clz w4, w4
+.ifc \type, w_avg
+ dup v30.8h, w6
+ neg v30.8h, v30.8h
+ shl v30.8h, v30.8h, #11
+.endif
+.ifc \type, mask
+ movi v31.16b, #256-2
+.endif
+ adr x7, L(\type\()_tbl)
+ sub w4, w4, #24
+ ldrh w4, [x7, x4, lsl #1]
+ \type v4, v0, v1, v2, v3
+ sub x7, x7, w4, uxtw
+ br x7
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+4:
+ cmp w5, #4
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x7], x1
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x7], x1
+ b.eq 0f
+ \type v5, v0, v1, v2, v3
+ cmp w5, #8
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x7], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x7], x1
+ b.eq 0f
+ \type v4, v0, v1, v2, v3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x7], x1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.s}[2], [x0], x1
+ st1 {v4.s}[3], [x7], x1
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x7], x1
+ st1 {v5.s}[2], [x0], x1
+ st1 {v5.s}[3], [x7], x1
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+8:
+ st1 {v4.d}[0], [x0], x1
+ \type v5, v0, v1, v2, v3
+ st1 {v4.d}[1], [x7], x1
+ st1 {v5.d}[0], [x0], x1
+ subs w5, w5, #4
+ st1 {v5.d}[1], [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 8b
+16:
+ AARCH64_VALID_JUMP_TARGET
+ \type v5, v0, v1, v2, v3
+ st1 {v4.16b}, [x0], x1
+ \type v6, v0, v1, v2, v3
+ st1 {v5.16b}, [x0], x1
+ \type v7, v0, v1, v2, v3
+ st1 {v6.16b}, [x0], x1
+ subs w5, w5, #4
+ st1 {v7.16b}, [x0], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 16b
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+32:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b}, [x0], x1
+ \type v7, v0, v1, v2, v3
+ subs w5, w5, #2
+ st1 {v6.16b,v7.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 32b
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+64:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
+ subs w5, w5, #2
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 64b
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64
+128:
+ \type v5, v0, v1, v2, v3
+ \type v6, v0, v1, v2, v3
+ \type v7, v0, v1, v2, v3
+ \type v16, v0, v1, v2, v3
+ \type v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type v18, v0, v1, v2, v3
+ \type v19, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type v4, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+L(\type\()_tbl):
+ .hword L(\type\()_tbl) - 1280b
+ .hword L(\type\()_tbl) - 640b
+ .hword L(\type\()_tbl) - 320b
+ .hword L(\type\()_tbl) - 16b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+
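+// w_mask blends the two intermediates like mask above, but derives the
+// per-pixel weight from them directly, m = 64 - (sat(6903 - |tmp1 - tmp2|) >> 8)
+// (so 38 <= m <= 64), and also writes the mask out: at full resolution for
+// 444, halved horizontally for 422 and averaged over 2x2 blocks for 420 (the
+// subsampled variants additionally fold in the value passed in w7).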
+.macro w_mask_fn type
+function w_mask_\type\()_8bpc_neon, export=1
+ clz w8, w4
+ adr x9, L(w_mask_\type\()_tbl)
+ sub w8, w8, #24
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ mov w10, #6903
+ dup v0.8h, w10
+.if \type == 444
+ movi v1.16b, #64
+.elseif \type == 422
+ dup v2.8b, w7
+ movi v3.8b, #129
+ sub v3.8b, v3.8b, v2.8b
+.elseif \type == 420
+ dup v2.8h, w7
+ movi v3.8h, #1, lsl #8
+ sub v3.8h, v3.8h, v2.8h
+.endif
+ add x12, x0, x1
+ lsl x1, x1, #1
+ br x9
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+ subs w5, w5, #4
+ sub v16.8h, v6.8h, v4.8h
+ sub v17.8h, v7.8h, v5.8h
+ sabd v18.8h, v4.8h, v6.8h
+ sabd v19.8h, v5.8h, v7.8h
+ uqsub v18.8h, v0.8h, v18.8h
+ uqsub v19.8h, v0.8h, v19.8h
+ ushr v18.8h, v18.8h, #8
+ ushr v19.8h, v19.8h, #8
+ shl v20.8h, v18.8h, #9
+ shl v21.8h, v19.8h, #9
+ sqdmulh v20.8h, v20.8h, v16.8h
+ sqdmulh v21.8h, v21.8h, v17.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v5.8h
+ sqrshrun v22.8b, v20.8h, #4
+ sqrshrun v23.8b, v21.8h, #4
+.if \type == 444
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
+ sub v18.16b, v1.16b, v18.16b
+ st1 {v18.16b}, [x6], #16
+.elseif \type == 422
+ addp v18.8h, v18.8h, v19.8h
+ xtn v18.8b, v18.8h
+ uhsub v18.8b, v3.8b, v18.8b
+ st1 {v18.8b}, [x6], #8
+.elseif \type == 420
+ trn1 v24.2d, v18.2d, v19.2d
+ trn2 v25.2d, v18.2d, v19.2d
+ add v24.8h, v24.8h, v25.8h
+ addp v18.8h, v24.8h, v24.8h
+ sub v18.4h, v3.4h, v18.4h
+ rshrn v18.8b, v18.8h, #2
+ st1 {v18.s}[0], [x6], #4
+.endif
+ st1 {v22.s}[0], [x0], x1
+ st1 {v22.s}[1], [x12], x1
+ st1 {v23.s}[0], [x0], x1
+ st1 {v23.s}[1], [x12], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32
+ ld1 {v6.8h, v7.8h}, [x3], #32
+ subs w5, w5, #2
+ sub v16.8h, v6.8h, v4.8h
+ sub v17.8h, v7.8h, v5.8h
+ sabd v18.8h, v4.8h, v6.8h
+ sabd v19.8h, v5.8h, v7.8h
+ uqsub v18.8h, v0.8h, v18.8h
+ uqsub v19.8h, v0.8h, v19.8h
+ ushr v18.8h, v18.8h, #8
+ ushr v19.8h, v19.8h, #8
+ shl v20.8h, v18.8h, #9
+ shl v21.8h, v19.8h, #9
+ sqdmulh v20.8h, v20.8h, v16.8h
+ sqdmulh v21.8h, v21.8h, v17.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v5.8h
+ sqrshrun v22.8b, v20.8h, #4
+ sqrshrun v23.8b, v21.8h, #4
+.if \type == 444
+ uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2
+ sub v18.16b, v1.16b, v18.16b
+ st1 {v18.16b}, [x6], #16
+.elseif \type == 422
+ addp v18.8h, v18.8h, v19.8h
+ xtn v18.8b, v18.8h
+ uhsub v18.8b, v3.8b, v18.8b
+ st1 {v18.8b}, [x6], #8
+.elseif \type == 420
+ add v18.8h, v18.8h, v19.8h
+ addp v18.8h, v18.8h, v18.8h
+ sub v18.4h, v3.4h, v18.4h
+ rshrn v18.8b, v18.8h, #2
+ st1 {v18.s}[0], [x6], #4
+.endif
+ st1 {v22.8b}, [x0], x1
+ st1 {v23.8b}, [x12], x1
+ b.gt 8b
+ ret
+1280:
+640:
+320:
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov w11, w4
+ sub x1, x1, w4, uxtw
+.if \type == 444
+ add x10, x6, w4, uxtw
+.elseif \type == 422
+ add x10, x6, x11, lsr #1
+.endif
+ add x9, x3, w4, uxtw #1
+ add x7, x2, w4, uxtw #1
+161:
+ mov w8, w4
+16:
+ ld1 {v4.8h, v5.8h}, [x2], #32
+ ld1 {v6.8h, v7.8h}, [x3], #32
+ ld1 {v16.8h, v17.8h}, [x7], #32
+ ld1 {v18.8h, v19.8h}, [x9], #32
+ subs w8, w8, #16
+ sub v6.8h, v6.8h, v4.8h
+ sub v7.8h, v7.8h, v5.8h
+ sub v18.8h, v18.8h, v16.8h
+ sub v19.8h, v19.8h, v17.8h
+ abs v20.8h, v6.8h
+ abs v21.8h, v7.8h
+ abs v22.8h, v18.8h
+ abs v23.8h, v19.8h
+ uqsub v20.8h, v0.8h, v20.8h
+ uqsub v21.8h, v0.8h, v21.8h
+ uqsub v22.8h, v0.8h, v22.8h
+ uqsub v23.8h, v0.8h, v23.8h
+ ushr v20.8h, v20.8h, #8
+ ushr v21.8h, v21.8h, #8
+ ushr v22.8h, v22.8h, #8
+ ushr v23.8h, v23.8h, #8
+ shl v24.8h, v20.8h, #9
+ shl v25.8h, v21.8h, #9
+ shl v26.8h, v22.8h, #9
+ shl v27.8h, v23.8h, #9
+ sqdmulh v24.8h, v24.8h, v6.8h
+ sqdmulh v25.8h, v25.8h, v7.8h
+ sqdmulh v26.8h, v26.8h, v18.8h
+ sqdmulh v27.8h, v27.8h, v19.8h
+ add v24.8h, v24.8h, v4.8h
+ add v25.8h, v25.8h, v5.8h
+ add v26.8h, v26.8h, v16.8h
+ add v27.8h, v27.8h, v17.8h
+ sqrshrun v24.8b, v24.8h, #4
+ sqrshrun v25.8b, v25.8h, #4
+ sqrshrun v26.8b, v26.8h, #4
+ sqrshrun v27.8b, v27.8h, #4
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2
+ uzp1 v21.16b, v22.16b, v23.16b // Ditto
+ sub v20.16b, v1.16b, v20.16b
+ sub v21.16b, v1.16b, v21.16b
+ st1 {v20.16b}, [x6], #16
+ st1 {v21.16b}, [x10], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h
+ addp v21.8h, v22.8h, v23.8h
+ xtn v20.8b, v20.8h
+ xtn v21.8b, v21.8h
+ uhsub v20.8b, v3.8b, v20.8b
+ uhsub v21.8b, v3.8b, v21.8b
+ st1 {v20.8b}, [x6], #8
+ st1 {v21.8b}, [x10], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v22.8h
+ add v21.8h, v21.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h
+ sub v20.8h, v3.8h, v20.8h
+ rshrn v20.8b, v20.8h, #2
+ st1 {v20.8b}, [x6], #8
+.endif
+ st1 {v24.8b, v25.8b}, [x0], #16
+ st1 {v26.8b, v27.8b}, [x12], #16
+ b.gt 16b
+ subs w5, w5, #2
+ add x2, x2, w4, uxtw #1
+ add x3, x3, w4, uxtw #1
+ add x7, x7, w4, uxtw #1
+ add x9, x9, w4, uxtw #1
+.if \type == 444
+ add x6, x6, w4, uxtw
+ add x10, x10, w4, uxtw
+.elseif \type == 422
+ add x6, x6, x11, lsr #1
+ add x10, x10, x11, lsr #1
+.endif
+ add x0, x0, x1
+ add x12, x12, x1
+ b.gt 161b
+ ret
+L(w_mask_\type\()_tbl):
+ .hword L(w_mask_\type\()_tbl) - 1280b
+ .hword L(w_mask_\type\()_tbl) - 640b
+ .hword L(w_mask_\type\()_tbl) - 320b
+ .hword L(w_mask_\type\()_tbl) - 160b
+ .hword L(w_mask_\type\()_tbl) - 8b
+ .hword L(w_mask_\type\()_tbl) - 4b
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
+function blend_8bpc_neon, export=1
+ adr x6, L(blend_tbl)
+ clz w3, w3
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ movi v4.16b, #64
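+ // v4 = 64 so that 64 - m gives the complementary weight; each output is
+ // (tmp*m + dst*(64 - m) + 32) >> 6 via the umull/umlal/rshrn #6 pattern.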
+ add x8, x0, x1
+ lsl x1, x1, #1
+ br x6
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.8b}, [x5], #8
+ ld1 {v1.d}[0], [x2], #8
+ ld1 {v0.s}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v0.s}[1], [x8]
+ sub v3.8b, v4.8b, v2.8b
+ umull v5.8h, v1.8b, v2.8b
+ umlal v5.8h, v0.8b, v3.8b
+ rshrn v6.8b, v5.8h, #6
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v2.16b}, [x5], #16
+ ld1 {v1.16b}, [x2], #16
+ ld1 {v0.d}[0], [x0]
+ ld1 {v0.d}[1], [x8]
+ sub v3.16b, v4.16b, v2.16b
+ subs w4, w4, #2
+ umull v5.8h, v1.8b, v2.8b
+ umlal v5.8h, v0.8b, v3.8b
+ umull2 v6.8h, v1.16b, v2.16b
+ umlal2 v6.8h, v0.16b, v3.16b
+ rshrn v7.8b, v5.8h, #6
+ rshrn2 v7.16b, v6.8h, #6
+ st1 {v7.d}[0], [x0], x1
+ st1 {v7.d}[1], [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v1.16b, v2.16b}, [x5], #32
+ ld1 {v5.16b, v6.16b}, [x2], #32
+ ld1 {v0.16b}, [x0]
+ subs w4, w4, #2
+ sub v7.16b, v4.16b, v1.16b
+ sub v20.16b, v4.16b, v2.16b
+ ld1 {v3.16b}, [x8]
+ umull v16.8h, v5.8b, v1.8b
+ umlal v16.8h, v0.8b, v7.8b
+ umull2 v17.8h, v5.16b, v1.16b
+ umlal2 v17.8h, v0.16b, v7.16b
+ umull v21.8h, v6.8b, v2.8b
+ umlal v21.8h, v3.8b, v20.8b
+ umull2 v22.8h, v6.16b, v2.16b
+ umlal2 v22.8h, v3.16b, v20.16b
+ rshrn v18.8b, v16.8h, #6
+ rshrn2 v18.16b, v17.8h, #6
+ rshrn v19.8b, v21.8h, #6
+ rshrn2 v19.16b, v22.8h, #6
+ st1 {v18.16b}, [x0], x1
+ st1 {v19.16b}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+ ld1 {v20.16b, v21.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v22.16b, v23.16b}, [x8]
+ sub v5.16b, v4.16b, v0.16b
+ sub v6.16b, v4.16b, v1.16b
+ sub v30.16b, v4.16b, v2.16b
+ sub v31.16b, v4.16b, v3.16b
+ umull v24.8h, v16.8b, v0.8b
+ umlal v24.8h, v20.8b, v5.8b
+ umull2 v26.8h, v16.16b, v0.16b
+ umlal2 v26.8h, v20.16b, v5.16b
+ umull v28.8h, v17.8b, v1.8b
+ umlal v28.8h, v21.8b, v6.8b
+ umull2 v7.8h, v17.16b, v1.16b
+ umlal2 v7.8h, v21.16b, v6.16b
+ umull v27.8h, v18.8b, v2.8b
+ umlal v27.8h, v22.8b, v30.8b
+ umull2 v1.8h, v18.16b, v2.16b
+ umlal2 v1.8h, v22.16b, v30.16b
+ umull v29.8h, v19.8b, v3.8b
+ umlal v29.8h, v23.8b, v31.8b
+ umull2 v21.8h, v19.16b, v3.16b
+ umlal2 v21.8h, v23.16b, v31.16b
+ rshrn v24.8b, v24.8h, #6
+ rshrn2 v24.16b, v26.8h, #6
+ rshrn v25.8b, v28.8h, #6
+ rshrn2 v25.16b, v7.8h, #6
+ rshrn v27.8b, v27.8h, #6
+ rshrn2 v27.16b, v1.8h, #6
+ rshrn v28.8b, v29.8h, #6
+ rshrn2 v28.16b, v21.8h, #6
+ st1 {v24.16b, v25.16b}, [x0], x1
+ st1 {v27.16b, v28.16b}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_tbl):
+ .hword L(blend_tbl) - 32b
+ .hword L(blend_tbl) - 16b
+ .hword L(blend_tbl) - 8b
+ .hword L(blend_tbl) - 4b
+endfunc
+
+function blend_h_8bpc_neon, export=1
+ adr x6, L(blend_h_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w4, uxtw
+ sub w4, w4, w4, lsr #2
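+ // x5 points at obmc_masks[h]; only the top 3/4 of the rows are blended
+ // (h -= h >> 2), the remaining rows are left untouched.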
+ clz w7, w3
+ movi v4.16b, #64
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w7, w7, #24
+ ldrh w7, [x6, x7, lsl #1]
+ sub x6, x6, w7, uxtw
+ br x6
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.h}[0], [x5], #2
+ ld1 {v1.s}[0], [x2], #4
+ subs w4, w4, #2
+ ld1 {v2.h}[0], [x0]
+ zip1 v0.8b, v0.8b, v0.8b
+ sub v3.8b, v4.8b, v0.8b
+ ld1 {v2.h}[1], [x8]
+ umull v5.8h, v1.8b, v0.8b
+ umlal v5.8h, v2.8b, v3.8b
+ rshrn v5.8b, v5.8h, #6
+ st1 {v5.h}[0], [x0], x1
+ st1 {v5.h}[1], [x8], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.8b, v1.8b}, [x5], #2
+ ld1 {v2.8b}, [x2], #8
+ subs w4, w4, #2
+ ext v0.8b, v0.8b, v1.8b, #4
+ ld1 {v3.s}[0], [x0]
+ sub v5.8b, v4.8b, v0.8b
+ ld1 {v3.s}[1], [x8]
+ umull v6.8h, v2.8b, v0.8b
+ umlal v6.8h, v3.8b, v5.8b
+ rshrn v6.8b, v6.8h, #6
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ ld1 {v2.16b}, [x2], #16
+ ld1 {v3.d}[0], [x0]
+ ext v0.16b, v0.16b, v1.16b, #8
+ sub v5.16b, v4.16b, v0.16b
+ ld1 {v3.d}[1], [x8]
+ subs w4, w4, #2
+ umull v6.8h, v0.8b, v2.8b
+ umlal v6.8h, v3.8b, v5.8b
+ umull2 v7.8h, v0.16b, v2.16b
+ umlal2 v7.8h, v3.16b, v5.16b
+ rshrn v16.8b, v6.8h, #6
+ rshrn2 v16.16b, v7.8h, #6
+ st1 {v16.d}[0], [x0], x1
+ st1 {v16.d}[1], [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ ld1 {v2.16b, v3.16b}, [x2], #32
+ ld1 {v5.16b}, [x0]
+ sub v7.16b, v4.16b, v0.16b
+ sub v16.16b, v4.16b, v1.16b
+ ld1 {v6.16b}, [x8]
+ subs w4, w4, #2
+ umull v17.8h, v0.8b, v2.8b
+ umlal v17.8h, v5.8b, v7.8b
+ umull2 v18.8h, v0.16b, v2.16b
+ umlal2 v18.8h, v5.16b, v7.16b
+ umull v19.8h, v1.8b, v3.8b
+ umlal v19.8h, v6.8b, v16.8b
+ umull2 v20.8h, v1.16b, v3.16b
+ umlal2 v20.8h, v6.16b, v16.16b
+ rshrn v21.8b, v17.8h, #6
+ rshrn2 v21.16b, v18.8h, #6
+ rshrn v22.8b, v19.8h, #6
+ rshrn2 v22.16b, v20.8h, #6
+ st1 {v21.16b}, [x0], x1
+ st1 {v22.16b}, [x8], x1
+ b.gt 16b
+ ret
+1280:
+640:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ sub x1, x1, w3, uxtw
+ add x7, x2, w3, uxtw
+321:
+ ld2r {v0.16b, v1.16b}, [x5], #2
+ mov w6, w3
+ sub v20.16b, v4.16b, v0.16b
+ sub v21.16b, v4.16b, v1.16b
+32:
+ ld1 {v16.16b, v17.16b}, [x2], #32
+ ld1 {v2.16b, v3.16b}, [x0]
+ subs w6, w6, #32
+ umull v23.8h, v0.8b, v16.8b
+ umlal v23.8h, v2.8b, v20.8b
+ ld1 {v18.16b, v19.16b}, [x7], #32
+ umull2 v27.8h, v0.16b, v16.16b
+ umlal2 v27.8h, v2.16b, v20.16b
+ ld1 {v6.16b, v7.16b}, [x8]
+ umull v24.8h, v0.8b, v17.8b
+ umlal v24.8h, v3.8b, v20.8b
+ umull2 v28.8h, v0.16b, v17.16b
+ umlal2 v28.8h, v3.16b, v20.16b
+ umull v25.8h, v1.8b, v18.8b
+ umlal v25.8h, v6.8b, v21.8b
+ umull2 v5.8h, v1.16b, v18.16b
+ umlal2 v5.8h, v6.16b, v21.16b
+ rshrn v29.8b, v23.8h, #6
+ rshrn2 v29.16b, v27.8h, #6
+ umull v26.8h, v1.8b, v19.8b
+ umlal v26.8h, v7.8b, v21.8b
+ umull2 v31.8h, v1.16b, v19.16b
+ umlal2 v31.8h, v7.16b, v21.16b
+ rshrn v30.8b, v24.8h, #6
+ rshrn2 v30.16b, v28.8h, #6
+ rshrn v23.8b, v25.8h, #6
+ rshrn2 v23.16b, v5.8h, #6
+ rshrn v24.8b, v26.8h, #6
+ st1 {v29.16b, v30.16b}, [x0], #32
+ rshrn2 v24.16b, v31.8h, #6
+ st1 {v23.16b, v24.16b}, [x8], #32
+ b.gt 32b
+ subs w4, w4, #2
+ add x0, x0, x1
+ add x8, x8, x1
+ add x2, x2, w3, uxtw
+ add x7, x7, w3, uxtw
+ b.gt 321b
+ ret
+L(blend_h_tbl):
+ .hword L(blend_h_tbl) - 1280b
+ .hword L(blend_h_tbl) - 640b
+ .hword L(blend_h_tbl) - 320b
+ .hword L(blend_h_tbl) - 16b
+ .hword L(blend_h_tbl) - 8b
+ .hword L(blend_h_tbl) - 4b
+ .hword L(blend_h_tbl) - 2b
+endfunc
+
+function blend_v_8bpc_neon, export=1
+ adr x6, L(blend_v_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w3, uxtw
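+ // x5 points at obmc_masks[w]; only the leftmost 3*w/4 pixels of each row
+ // are blended and stored, as seen in the partial stores below.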
+ clz w3, w3
+ movi v4.16b, #64
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ br x6
+20:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.8b}, [x5]
+ sub v1.8b, v4.8b, v0.8b
+2:
+ ld1 {v2.h}[0], [x2], #2
+ ld1 {v3.b}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v2.b}[1], [x2]
+ ld1 {v3.b}[1], [x8]
+ umull v5.8h, v2.8b, v0.8b
+ umlal v5.8h, v3.8b, v1.8b
+ rshrn v5.8b, v5.8h, #6
+ add x2, x2, #2
+ st1 {v5.b}[0], [x0], x1
+ st1 {v5.b}[1], [x8], x1
+ b.gt 2b
+ ret
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2s}, [x5]
+ sub x1, x1, #2
+ sub v1.8b, v4.8b, v0.8b
+4:
+ ld1 {v2.8b}, [x2], #8
+ ld1 {v3.s}[0], [x0]
+ ld1 {v3.s}[1], [x8]
+ subs w4, w4, #2
+ umull v5.8h, v2.8b, v0.8b
+ umlal v5.8h, v3.8b, v1.8b
+ rshrn v5.8b, v5.8h, #6
+ st1 {v5.h}[0], [x0], #2
+ st1 {v5.h}[2], [x8], #2
+ st1 {v5.b}[2], [x0], x1
+ st1 {v5.b}[6], [x8], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v0.2d}, [x5]
+ sub x1, x1, #4
+ sub v1.16b, v4.16b, v0.16b
+8:
+ ld1 {v2.16b}, [x2], #16
+ ld1 {v3.d}[0], [x0]
+ ld1 {v3.d}[1], [x8]
+ subs w4, w4, #2
+ umull v5.8h, v0.8b, v2.8b
+ umlal v5.8h, v3.8b, v1.8b
+ umull2 v6.8h, v0.16b, v2.16b
+ umlal2 v6.8h, v3.16b, v1.16b
+ rshrn v7.8b, v5.8h, #6
+ rshrn2 v7.16b, v6.8h, #6
+ st1 {v7.s}[0], [x0], #4
+ st1 {v7.s}[2], [x8], #4
+ st1 {v7.h}[2], [x0], x1
+ st1 {v7.h}[6], [x8], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b}, [x5]
+ sub x1, x1, #8
+ sub v2.16b, v4.16b, v0.16b
+16:
+ ld1 {v5.16b, v6.16b}, [x2], #32
+ ld1 {v7.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v16.16b}, [x8]
+ umull v17.8h, v5.8b, v0.8b
+ umlal v17.8h, v7.8b, v2.8b
+ umull2 v18.8h, v5.16b, v0.16b
+ umlal2 v18.8h, v7.16b, v2.16b
+ umull v20.8h, v6.8b, v0.8b
+ umlal v20.8h, v16.8b, v2.8b
+ umull2 v21.8h, v6.16b, v0.16b
+ umlal2 v21.8h, v16.16b, v2.16b
+ rshrn v19.8b, v17.8h, #6
+ rshrn2 v19.16b, v18.8h, #6
+ rshrn v22.8b, v20.8h, #6
+ rshrn2 v22.16b, v21.8h, #6
+ st1 {v19.8b}, [x0], #8
+ st1 {v22.8b}, [x8], #8
+ st1 {v19.s}[2], [x0], x1
+ st1 {v22.s}[2], [x8], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.16b, v1.16b}, [x5]
+ sub x1, x1, #16
+ sub v2.16b, v4.16b, v0.16b
+ sub v3.8b, v4.8b, v1.8b
+32:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+ ld1 {v5.16b, v6.16b}, [x0]
+ subs w4, w4, #2
+ ld1 {v20.16b, v21.16b}, [x8]
+ umull v22.8h, v16.8b, v0.8b
+ umlal v22.8h, v5.8b, v2.8b
+ umull2 v23.8h, v16.16b, v0.16b
+ umlal2 v23.8h, v5.16b, v2.16b
+ umull v28.8h, v17.8b, v1.8b
+ umlal v28.8h, v6.8b, v3.8b
+ umull v30.8h, v18.8b, v0.8b
+ umlal v30.8h, v20.8b, v2.8b
+ umull2 v31.8h, v18.16b, v0.16b
+ umlal2 v31.8h, v20.16b, v2.16b
+ umull v25.8h, v19.8b, v1.8b
+ umlal v25.8h, v21.8b, v3.8b
+ rshrn v24.8b, v22.8h, #6
+ rshrn2 v24.16b, v23.8h, #6
+ rshrn v28.8b, v28.8h, #6
+ rshrn v30.8b, v30.8h, #6
+ rshrn2 v30.16b, v31.8h, #6
+ rshrn v27.8b, v25.8h, #6
+ st1 {v24.16b}, [x0], #16
+ st1 {v30.16b}, [x8], #16
+ st1 {v28.8b}, [x0], x1
+ st1 {v27.8b}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_v_tbl):
+ .hword L(blend_v_tbl) - 320b
+ .hword L(blend_v_tbl) - 160b
+ .hword L(blend_v_tbl) - 80b
+ .hword L(blend_v_tbl) - 40b
+ .hword L(blend_v_tbl) - 20b
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// and assumes that x8 is set to (clz(w)-24).
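+// x8 indexes L(put_tbl): e.g. w == 16 gives clz(16) - 24 == 3, selecting the
+// fourth .hword entry (the 160 label); each entry is a backwards offset from
+// the table to the corresponding width-specific loop.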
+function put_neon
+ adr x9, L(put_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.h}[0], [x2], x3
+ ld1 {v1.h}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.h}[0], [x0], x1
+ st1 {v1.h}[0], [x0], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ subs w5, w5, #2
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, x1
+ lsl x1, x1, #1
+ add x9, x2, x3
+ lsl x3, x3, #1
+16:
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x9], x3
+ subs w5, w5, #2
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 64b
+ ret
+128:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 128b
+ ret
+
+L(put_tbl):
+ .hword L(put_tbl) - 128b
+ .hword L(put_tbl) - 64b
+ .hword L(put_tbl) - 32b
+ .hword L(put_tbl) - 160b
+ .hword L(put_tbl) - 8b
+ .hword L(put_tbl) - 4b
+ .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
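+// x7 (w*2) is the byte stride of a row of 16-bit intermediates written to x0;
+// the 32-wide case below uses it as the store post-increment between rows.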
+function prep_neon
+ adr x9, L(prep_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x1], x2
+ ld1 {v1.s}[0], [x1], x2
+ subs w4, w4, #2
+ ushll v0.8h, v0.8b, #4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.4h, v1.4h}, [x0], #16
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x1], x2
+ subs w4, w4, #2
+ ushll v0.8h, v0.8b, #4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2
+ lsl x2, x2, #1
+16:
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x9], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, w3, uxtw
+32:
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x7
+ ushll2 v17.8h, v2.16b, #4
+ st1 {v6.8h, v7.8h}, [x8], x7
+ ushll v18.8h, v3.8b, #4
+ st1 {v16.8h, v17.8h}, [x0], x7
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v18.8h, v19.8h}, [x8], x7
+ b.gt 32b
+ ret
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, #32
+ mov x6, #64
+64:
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ldp q2, q3, [x1, #32]
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ add x1, x1, x2
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x6
+ ushll2 v17.8h, v2.16b, #4
+ ushll v18.8h, v3.8b, #4
+ st1 {v6.8h, v7.8h}, [x8], x6
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v16.8h, v17.8h}, [x0], x6
+ st1 {v18.8h, v19.8h}, [x8], x6
+ b.gt 64b
+ ret
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, #64
+ mov x6, #128
+128:
+ ldp q0, q1, [x1]
+ ldp q2, q3, [x1, #32]
+ ushll v16.8h, v0.8b, #4
+ ushll2 v17.8h, v0.16b, #4
+ ushll v18.8h, v1.8b, #4
+ ushll2 v19.8h, v1.16b, #4
+ ushll v20.8h, v2.8b, #4
+ ushll2 v21.8h, v2.16b, #4
+ ldp q4, q5, [x1, #64]
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
+ ushll v22.8h, v3.8b, #4
+ ushll2 v23.8h, v3.16b, #4
+ ushll v24.8h, v4.8b, #4
+ ushll2 v25.8h, v4.16b, #4
+ ushll v26.8h, v5.8b, #4
+ ushll2 v27.8h, v5.16b, #4
+ ldp q6, q7, [x1, #96]
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
+ ushll v28.8h, v6.8b, #4
+ ushll2 v29.8h, v6.16b, #4
+ ushll v30.8h, v7.8b, #4
+ ushll2 v31.8h, v7.16b, #4
+ subs w4, w4, #1
+ add x1, x1, x2
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
+ b.gt 128b
+ ret
+
+L(prep_tbl):
+ .hword L(prep_tbl) - 1280b
+ .hword L(prep_tbl) - 640b
+ .hword L(prep_tbl) - 320b
+ .hword L(prep_tbl) - 160b
+ .hword L(prep_tbl) - 8b
+ .hword L(prep_tbl) - 4b
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}[0], [\s0], \strd
+ ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}[0], [\s0], \strd
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}, [\s0], \strd
+ ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}, [\s0], \strd
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+ trn1 \r0\wd, \r0\wd, \r1\wd
+ trn1 \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+ trn1 \r2\wd, \r2\wd, \r3\wd
+ trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_h r0, r1, r2, r3, r4
+ interleave_1 .4h, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
+ trn1 \r0\wd, \r0\wd, \r2\wd
+ trn1 \r1\wd, \r1\wd, \r3\wd
+ trn1 \r2\wd, \r2\wd, \r4\wd
+ trn1 \r3\wd, \r3\wd, \r5\wd
+.endm
+.macro interleave_2_s r0, r1, r2, r3, r4, r5
+ interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5
+.endm
+.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
+ uxtl \r0\().8h, \r0\().8b
+ uxtl \r1\().8h, \r1\().8b
+.ifnb \r2
+ uxtl \r2\().8h, \r2\().8b
+ uxtl \r3\().8h, \r3\().8b
+.endif
+.ifnb \r4
+ uxtl \r4\().8h, \r4\().8b
+.endif
+.ifnb \r5
+ uxtl \r5\().8h, \r5\().8b
+.endif
+.ifnb \r6
+ uxtl \r6\().8h, \r6\().8b
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3, wd
+ mul \d\wd, \s0\wd, v0.h[0]
+ mla \d\wd, \s1\wd, v0.h[1]
+ mla \d\wd, \s2\wd, v0.h[2]
+ mla \d\wd, \s3\wd, v0.h[3]
+.endm
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, so the mul/mla operations are kept
+// tightly chained like this.
+.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().4h, \s0\().4h, v0.h[0]
+ mla \d0\().4h, \s1\().4h, v0.h[1]
+ mla \d0\().4h, \s2\().4h, v0.h[2]
+ mla \d0\().4h, \s3\().4h, v0.h[3]
+ mla \d0\().4h, \s4\().4h, v0.h[4]
+ mla \d0\().4h, \s5\().4h, v0.h[5]
+ mla \d0\().4h, \s6\().4h, v0.h[6]
+ mla \d0\().4h, \s7\().4h, v0.h[7]
+.endm
+.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+ mul \d0\().8h, \s0\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ mul \d0\().8h, \s0\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s1\().8h, v0.h[0]
+ mla \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+ mla \d1\().8h, \s8\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+ mul \d0\().8h, \s0\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mul \d1\().8h, \s2\().8h, v0.h[0]
+ mla \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+ mla \d1\().8h, \s9\().8h, v0.h[7]
+.endm
+.macro sqrshrun_b shift, r0, r1, r2, r3
+ sqrshrun \r0\().8b, \r0\().8h, #\shift
+.ifnb \r1
+ sqrshrun \r1\().8b, \r1\().8h, #\shift
+.endif
+.ifnb \r2
+ sqrshrun \r2\().8b, \r2\().8h, #\shift
+ sqrshrun \r3\().8b, \r3\().8h, #\shift
+.endif
+.endm
+.macro srshr_h shift, r0, r1, r2, r3
+ srshr \r0\().8h, \r0\().8h, #\shift
+.ifnb \r1
+ srshr \r1\().8h, \r1\().8h, #\shift
+.endif
+.ifnb \r2
+ srshr \r2\().8h, \r2\().8h, #\shift
+ srshr \r3\().8h, \r3\().8h, #\shift
+.endif
+.endm
+.macro st_h strd, reg, lanes
+ st1 {\reg\().h}[0], [x0], \strd
+ st1 {\reg\().h}[1], [x8], \strd
+.if \lanes > 2
+ st1 {\reg\().h}[2], [x0], \strd
+ st1 {\reg\().h}[3], [x8], \strd
+.endif
+.endm
+.macro st_s strd, r0, r1
+ st1 {\r0\().s}[0], [x0], \strd
+ st1 {\r0\().s}[1], [x8], \strd
+.ifnb \r1
+ st1 {\r1\().s}[0], [x0], \strd
+ st1 {\r1\().s}[1], [x8], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+ st1 {\r0\().d}[0], [x0], \strd
+ st1 {\r0\().d}[1], [x8], \strd
+.ifnb \r1
+ st1 {\r1\().d}[0], [x0], \strd
+ st1 {\r1\().d}[1], [x8], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1
+.ifc \type, put
+ sqrshrun_b 6, \r0, \r1
+ st_s \strd, \r0, \r1
+.else
+ srshr_h 2, \r0, \r1
+ st_d \strd, \r0, \r1
+.endif
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+ st1 {\r0\wd}, [x0], \strd
+ st1 {\r1\wd}, [x8], \strd
+.ifnb \r2
+ st1 {\r2\wd}, [x0], \strd
+ st1 {\r3\wd}, [x8], \strd
+.endif
+.ifnb \r4
+ st1 {\r4\wd}, [x0], \strd
+ st1 {\r5\wd}, [x8], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x8], \strd
+.endif
+.endm
+.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
+ st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
+ st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_b 6, \r0, \r1, \r2, \r3
+ st_8b \strd, \r0, \r1, \r2, \r3
+.else
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st_16b \strd, \r0, \r1, \r2, \r3
+.endif
+.endm
+.macro shift_store_16 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun \r0\().8b, \r0\().8h, #6
+ sqrshrun2 \r0\().16b, \r1\().8h, #6
+ sqrshrun \r2\().8b, \r2\().8h, #6
+ sqrshrun2 \r2\().16b, \r3\().8h, #6
+ st_16b \strd, \r0, \r2
+.else
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st1 {\r0\().8h, \r1\().8h}, [x0], \strd
+ st1 {\r2\().8h, \r3\().8h}, [x8], \strd
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+ mov x8, \type_h
+ mov x9, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+ mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w10
+ mul \my, \my, w10
+ add \mx, \mx, w8 // mx, 8tap_h, 4tap_h
+ add \my, \my, w9 // my, 8tap_v, 4tap_v
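+ // A sketch of the packing: mx/my (the 0..15 subpel position) times 0x4081
+ // replicates the value into bits 0-6, 7-13 and 14+; adding the
+ // REGULAR/SMOOTH/SHARP constants then biases bits 0-6 by 15*(4-tap filter
+ // set) and bits 7-13 by 15*(8-tap filter set), so either field indexes the
+ // 15-entry filter sets in mc_subpel_filters directly, while bits 14+ only
+ // serve to test for a nonzero subpel offset.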
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz w8, \w
+ tst \mx, #(0x7f << 14)
+ sub w8, w8, #24
+ movrel x10, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx w9, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ b.le 4f
+ mov \mx, w9
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x10, \mx, uxtw #3
+ b.ne L(\type\()_8tap_hv)
+
+ adr x9, L(\type\()_8tap_h_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+2:
+ ld1 {v4.8b}, [\src], \s_strd
+ ld1 {v6.8b}, [\sr2], \s_strd
+ uxtl v4.8h, v4.8b
+ uxtl v6.8h, v6.8b
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ mul v3.4h, v3.4h, v0.h[0]
+ mla v3.4h, v4.4h, v0.h[1]
+ mla v3.4h, v6.4h, v0.h[2]
+ mla v3.4h, v7.4h, v0.h[3]
+ srshr v3.4h, v3.4h, #2
+ sqrshrun v3.8b, v3.8h, #4
+ st1 {v3.h}[0], [\dst], \d_strd
+ st1 {v3.h}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8b}, [\src], \s_strd
+ ld1 {v20.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v20.8h, v20.8b
+ ext v17.16b, v16.16b, v16.16b, #2
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ mul v16.4h, v16.4h, v0.h[0]
+ mla v16.4h, v17.4h, v0.h[1]
+ mla v16.4h, v18.4h, v0.h[2]
+ mla v16.4h, v19.4h, v0.h[3]
+ mul v20.4h, v20.4h, v0.h[0]
+ mla v20.4h, v21.4h, v0.h[1]
+ mla v20.4h, v22.4h, v0.h[2]
+ mla v20.4h, v23.4h, v0.h[3]
+ srshr v16.4h, v16.4h, #2
+ srshr v20.4h, v20.4h, #2
+.ifc \type, put
+ sqrshrun v16.8b, v16.8h, #4
+ sqrshrun v20.8b, v20.8h, #4
+ st1 {v16.s}[0], [\dst], \d_strd
+ st1 {v20.s}[0], [\ds2], \d_strd
+.else
+ st1 {v16.4h}, [\dst], \d_strd
+ st1 {v20.4h}, [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+8:
+ ld1 {v16.8b, v17.8b}, [\src], \s_strd
+ ld1 {v20.8b, v21.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+
+ mul v18.8h, v16.8h, v0.h[0]
+ mul v22.8h, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v19.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+ subs \h, \h, #2
+ srshr v18.8h, v18.8h, #2
+ srshr v22.8h, v22.8h, #2
+.ifc \type, put
+ sqrshrun v18.8b, v18.8h, #4
+ sqrshrun v22.8b, v22.8h, #4
+ st1 {v18.8b}, [\dst], \d_strd
+ st1 {v22.8b}, [\ds2], \d_strd
+.else
+ st1 {v18.8h}, [\dst], \d_strd
+ st1 {v22.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw
+.endif
+161:
+ ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24
+ ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24
+ mov \mx, \w
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+
+16:
+ mul v24.8h, v16.8h, v0.h[0]
+ mul v25.8h, v17.8h, v0.h[0]
+ mul v26.8h, v20.8h, v0.h[0]
+ mul v27.8h, v21.8h, v0.h[0]
+.irpc i, 1234567
+ ext v28.16b, v16.16b, v17.16b, #(2*\i)
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ srshr v26.8h, v26.8h, #2
+ srshr v27.8h, v27.8h, #2
+ subs \mx, \mx, #16
+.ifc \type, put
+ sqrshrun v24.8b, v24.8h, #4
+ sqrshrun2 v24.16b, v25.8h, #4
+ sqrshrun v26.8b, v26.8h, #4
+ sqrshrun2 v26.16b, v27.8h, #4
+ st1 {v24.16b}, [\dst], #16
+ st1 {v26.16b}, [\ds2], #16
+.else
+ st1 {v24.8h, v25.8h}, [\dst], #32
+ st1 {v26.8h, v27.8h}, [\ds2], #32
+.endif
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v20.16b, v22.16b
+ ld1 {v17.8b, v18.8b}, [\src], #16
+ ld1 {v21.8b, v22.8b}, [\sr2], #16
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx w9, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w9
+4:
+ add \xmy, x10, \my, uxtw #3
+
+ adr x9, L(\type\()_8tap_v_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ b.gt 28f
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_h v1, v2, v3, v4, v5
+ b.gt 24f
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .4h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_h \sr2, \src, \s_strd, v6, v7
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 4
+ ret
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
+ interleave_1_h v1, v2, v3, v4, v5
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6
+ uxtl_b v1, v2, v3, v4
+216:
+ subs \h, \h, #4
+ load_h \sr2, \src, \s_strd, v16, v17, v18, v19
+ interleave_1_h v7, v16, v17, v18, v19
+ interleave_2_s v5, v6, v7, v16, v17, v18
+ uxtl_b v5, v6, v7, v16
+ mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_b 6, v30
+ st_h \d_strd, v30, 4
+ b.le 0f
+ cmp \h, #2
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+ mov v7.16b, v19.16b
+ b.eq 26f
+ b 216b
+26:
+ load_h \sr2, \src, \s_strd, v16, v17
+ interleave_1_h v7, v16, v17
+ uxtl_b v5, v6, v7, v16
+ mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_b 6, v30
+ st_h \d_strd, v30, 2
+0:
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ shift_store_4 \type, \d_strd, v6
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ uxtl_b v5, v6
+ mul_mla_4 v7, v3, v4, v5, v6, .8h
+ shift_store_4 \type, \d_strd, v7
+0:
+ ret
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+ interleave_1_s v16, v17, v18
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v16, v17
+ uxtl_b v18, v19, v20, v21
+
+48:
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v23, v24, v25, v26
+ interleave_1_s v22, v23, v24, v25, v26
+ uxtl_b v22, v23, v24, v25
+ mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ shift_store_4 \type, \d_strd, v1, v2
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v27, v16
+ subs \h, \h, #2
+ interleave_1_s v26, v27, v16
+ uxtl_b v26, v27
+ mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
+ shift_store_4 \type, \d_strd, v1
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v17, v18
+ subs \h, \h, #2
+ interleave_1_s v16, v17, v18
+ uxtl_b v16, v17
+ mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
+ shift_store_4 \type, \d_strd, v2
+ b.le 0f
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v19, v20, v21, v22
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v18, v19, v20, v21
+ mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ shift_store_4 \type, \d_strd, v1, v2
+ b.gt 48b
+0:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl_b v1, v2, v3, v4, v5
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4 v7, v2, v3, v4, v5, .8h
+ shift_store_8 \type, \d_strd, v6, v7
+ b.le 0f
+ load_8b \sr2, \src, \s_strd, v6, v7
+ uxtl_b v6, v7
+ mul_mla_4 v1, v3, v4, v5, v6, .8h
+ mul_mla_4 v2, v4, v5, v6, v7, .8h
+ shift_store_8 \type, \d_strd, v1, v2
+0:
+ ret
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmy]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+ uxtl_b v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v23, v24
+ uxtl_b v23, v24
+ mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v25, v26
+ uxtl_b v25, v26
+ mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v3, v4
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v27, v16
+ uxtl_b v27, v16
+ mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v17, v18
+ uxtl_b v17, v18
+ mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ shift_store_8 \type, \d_strd, v3, v4
+ b.le 9f
+ subs \h, \h, #4
+ load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
+ uxtl_b v19, v20, v21, v22
+ mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.gt 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 1680b
+
+ // 16x2, 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ cmp \h, #2
+ load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl v16.8h, v1.8b
+ uxtl v17.8h, v2.8b
+ uxtl v18.8h, v3.8b
+ uxtl v19.8h, v4.8b
+ uxtl v20.8h, v5.8b
+ uxtl2 v23.8h, v1.16b
+ uxtl2 v24.8h, v2.16b
+ uxtl2 v25.8h, v3.16b
+ uxtl2 v26.8h, v4.16b
+ uxtl2 v27.8h, v5.16b
+ mul_mla_4 v1, v16, v17, v18, v19, .8h
+ mul_mla_4 v16, v17, v18, v19, v20, .8h
+ mul_mla_4 v2, v23, v24, v25, v26, .8h
+ mul_mla_4 v17, v24, v25, v26, v27, .8h
+ shift_store_16 \type, \d_strd, v1, v2, v16, v17
+ b.le 0f
+ load_16b \sr2, \src, \s_strd, v6, v7
+ uxtl v21.8h, v6.8b
+ uxtl v22.8h, v7.8b
+ uxtl2 v28.8h, v6.16b
+ uxtl2 v29.8h, v7.16b
+ mul_mla_4 v1, v18, v19, v20, v21, .8h
+ mul_mla_4 v3, v19, v20, v21, v22, .8h
+ mul_mla_4 v2, v25, v26, v27, v28, .8h
+ mul_mla_4 v4, v26, v27, v28, v29, .8h
+ shift_store_16 \type, \d_strd, v1, v2, v3, v4
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx w9, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w9
+4:
+ add \xmy, x10, \my, uxtw #3
+
+ adr x9, L(\type\()_8tap_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20:
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 280f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h
+ mul v29.4h, v29.4h, v0.4h
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
+ bl L(\type\()_8tap_filter_2)
+
+ trn1 v16.2s, v16.2s, v28.2s
+ mov v17.8b, v28.8b
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ ext v18.8b, v17.8b, v28.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v28.4h, v1.h[3]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v28.8b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h
+ mul v29.4h, v29.4h, v0.4h
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
+
+ bl L(\type\()_8tap_filter_2)
+ trn1 v16.2s, v16.2s, v28.2s
+ mov v17.8b, v28.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v18.8b, v17.8b, v28.8b, #4
+ mov v19.8b, v28.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v20.8b, v19.8b, v28.8b, #4
+ mov v21.8b, v28.8b
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ ext v22.8b, v21.8b, v28.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v2.4s, v28.4h, v1.h[7]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v28.8b
+ b 28b
+
+0:
+ ret x15
+
+L(\type\()_8tap_filter_2):
+ ld1 {v28.8b}, [\sr2], \s_strd
+ ld1 {v30.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v30.8h, v30.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ ext v31.16b, v30.16b, v30.16b, #2
+ trn1 v27.2s, v28.2s, v30.2s
+ trn2 v30.2s, v28.2s, v30.2s
+ trn1 v28.2s, v29.2s, v31.2s
+ trn2 v31.2s, v29.2s, v31.2s
+ mul v27.4h, v27.4h, v0.h[0]
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v30.4h, v0.h[2]
+ mla v27.4h, v31.4h, v0.h[3]
+ srshr v28.4h, v27.4h, #2
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 480f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ // 4x2, 4x4 hv
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, so the mul/mla operations are kept
+ // tightly chained like this.
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v28.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v28.4h, v1.h[2]
+ smlal v3.4s, v29.4h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v28.8b
+ mov v20.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v28.8b
+ mov v22.8b, v29.8b
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v2.4s, v28.4h, v1.h[7]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v28.4h, v1.h[6]
+ smlal v3.4s, v29.4h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v28.8b
+ mov v22.8b, v29.8b
+ b 48b
+0:
+ ret x15
+
+L(\type\()_8tap_filter_4):
+ ld1 {v26.8b}, [\sr2], \s_strd
+ ld1 {v27.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ uxtl v27.8h, v27.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ ext v28.16b, v27.16b, v27.16b, #2
+ ext v29.16b, v27.16b, v27.16b, #4
+ ext v30.16b, v27.16b, v27.16b, #6
+ mul v27.4h, v27.4h, v0.h[0]
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v29.4h, v0.h[2]
+ mla v27.4h, v30.4h, v0.h[3]
+ srshr v28.4h, v31.4h, #2
+ srshr v29.4h, v27.4h, #2
+ ret
+
+80:
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.s}[0], [\xmy]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ bl L(\type\()_8tap_filter_8_first)
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v24.4h, v1.h[2]
+ smlal2 v5.4s, v24.8h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+ b 8b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ bl L(\type\()_8tap_filter_8_first)
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v24.16b
+ mov v20.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v24.16b
+ mov v22.16b, v25.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+ smlal v2.4s, v24.4h, v1.h[7]
+ smlal2 v3.4s, v24.8h, v1.h[7]
+ smlal v4.4s, v25.4h, v1.h[7]
+ smlal2 v5.4s, v25.8h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v24.16b
+ mov v22.16b, v25.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ ret x15
+
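+ // Horizontal 8-tap helpers for the hv paths: _first filters one 8-pixel row
+ // from \src into v16, the plain version filters one row each from \sr2 and
+ // \src into v24/v25; all results are rounded to the intermediate precision
+ // with srshr #2.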
+L(\type\()_8tap_filter_8_first):
+ ld1 {v28.8b, v29.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ mul v16.8h, v28.8h, v0.h[0]
+ ext v24.16b, v28.16b, v29.16b, #(2*1)
+ ext v25.16b, v28.16b, v29.16b, #(2*2)
+ ext v26.16b, v28.16b, v29.16b, #(2*3)
+ ext v27.16b, v28.16b, v29.16b, #(2*4)
+ mla v16.8h, v24.8h, v0.h[1]
+ mla v16.8h, v25.8h, v0.h[2]
+ mla v16.8h, v26.8h, v0.h[3]
+ mla v16.8h, v27.8h, v0.h[4]
+ ext v24.16b, v28.16b, v29.16b, #(2*5)
+ ext v25.16b, v28.16b, v29.16b, #(2*6)
+ ext v26.16b, v28.16b, v29.16b, #(2*7)
+ mla v16.8h, v24.8h, v0.h[5]
+ mla v16.8h, v25.8h, v0.h[6]
+ mla v16.8h, v26.8h, v0.h[7]
+ srshr v16.8h, v16.8h, #2
+ ret
+
+L(\type\()_8tap_filter_8):
+ ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
+ ld1 {v30.8b, v31.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ uxtl v30.8h, v30.8b
+ uxtl v31.8h, v31.8b
+ mul v24.8h, v28.8h, v0.h[0]
+ mul v25.8h, v30.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ ret
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1
+ dup v1.16b, \mx
+ dup v3.16b, \my
+ mov w9, #16
+ sub w8, w9, \mx
+ sub w9, w9, \my
+ dup v0.16b, w8
+ dup v2.16b, w9
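+ // Bilinear weights: v0/v2 hold 16-mx/16-my and v1/v3 hold mx/my; each pass
+ // computes a*(16-f) + b*f, with the h/v-only put paths rounding back to
+ // pixels via uqrshrn #4 while prep keeps the 16-bit intermediate.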
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz w8, \w
+ sub w8, w8, #24
+ cbnz \mx, L(\type\()_bilin_h)
+ cbnz \my, L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cbnz \my, L(\type\()_bilin_hv)
+
+ adr x9, L(\type\()_bilin_h_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.s}[0], [\src], \s_strd
+ ld1 {v6.s}[0], [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.4h, v4.4h, v6.4h
+ trn1 v5.4h, v5.4h, v7.4h
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8b}, [\src], \s_strd
+ ld1 {v6.8b}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.2s, v4.2s, v6.2s
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ ld1 {v4.16b}, [\src], \s_strd
+ ld1 {v6.16b}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v6.16b, v6.16b, #1
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umull v6.8h, v6.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+ umlal v6.8h, v7.8b, v1.8b
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn v6.8b, v6.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v6.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v6.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw
+.endif
+161:
+ ld1 {v16.d}[1], [\src], #8
+ ld1 {v20.d}[1], [\sr2], #8
+ mov \mx, \w
+
+16:
+ ld1 {v18.16b}, [\src], #16
+ ld1 {v22.16b}, [\sr2], #16
+ ext v17.16b, v16.16b, v18.16b, #8
+ ext v19.16b, v16.16b, v18.16b, #9
+ ext v21.16b, v20.16b, v22.16b, #8
+ ext v23.16b, v20.16b, v22.16b, #9
+ umull v16.8h, v17.8b, v0.8b
+ umull2 v17.8h, v17.16b, v0.16b
+ umull v20.8h, v21.8b, v0.8b
+ umull2 v21.8h, v21.16b, v0.16b
+ umlal v16.8h, v19.8b, v1.8b
+ umlal2 v17.8h, v19.16b, v1.16b
+ umlal v20.8h, v23.8b, v1.8b
+ umlal2 v21.8h, v23.16b, v1.16b
+ subs \mx, \mx, #16
+.ifc \type, put
+ uqrshrn v16.8b, v16.8h, #4
+ uqrshrn2 v16.16b, v17.8h, #4
+ uqrshrn v20.8b, v20.8h, #4
+ uqrshrn2 v20.16b, v21.8h, #4
+ st1 {v16.16b}, [\dst], #16
+ st1 {v20.16b}, [\ds2], #16
+.else
+ st1 {v16.8h, v17.8h}, [\dst], #32
+ st1 {v20.8h, v21.8h}, [\ds2], #32
+.endif
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v20.16b, v22.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl):
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr x9, L(\type\()_bilin_v_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.h}[0], [\src], \s_strd
+ b.gt 24f
+22:
+ ld1 {v17.h}[0], [\sr2], \s_strd
+ ld1 {v18.h}[0], [\src], \s_strd
+ trn1 v16.4h, v16.4h, v17.4h
+ trn1 v17.4h, v17.4h, v18.4h
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst]
+ st1 {v4.h}[1], [\ds2]
+ ret
+24: // 2x4, 2x6, 2x8, ... v
+ ld1 {v17.h}[0], [\sr2], \s_strd
+ ld1 {v18.h}[0], [\src], \s_strd
+ ld1 {v19.h}[0], [\sr2], \s_strd
+ ld1 {v20.h}[0], [\src], \s_strd
+ sub \h, \h, #4
+ trn1 v16.4h, v16.4h, v17.4h
+ trn1 v17.4h, v17.4h, v18.4h
+ trn1 v18.4h, v18.4h, v19.4h
+ trn1 v19.4h, v19.4h, v20.4h
+ trn1 v16.2s, v16.2s, v18.2s
+ trn1 v17.2s, v17.2s, v19.2s
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ cmp \h, #2
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ st1 {v4.h}[2], [\dst], \d_strd
+ st1 {v4.h}[3], [\ds2], \d_strd
+ b.lt 0f
+ mov v16.8b, v20.8b
+ b.eq 22b
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.s}[0], [\src], \s_strd
+4:
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8b}, [\src], \s_strd
+8:
+ ld1 {v17.8b}, [\sr2], \s_strd
+ ld1 {v18.8b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b
+ umull v5.8h, v17.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ umlal v5.8h, v18.8b, v3.8b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn v5.8b, v5.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.16b}, [\src], \s_strd
+2:
+ ld1 {v17.16b}, [\sr2], \s_strd
+ ld1 {v18.16b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b
+ umull2 v5.8h, v16.16b, v2.16b
+ umull v6.8h, v17.8b, v2.8b
+ umull2 v7.8h, v17.16b, v2.16b
+ umlal v4.8h, v17.8b, v3.8b
+ umlal2 v5.8h, v17.16b, v3.16b
+ umlal v6.8h, v18.8b, v3.8b
+ umlal2 v7.8h, v18.16b, v3.16b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn2 v4.16b, v5.8h, #4
+ uqrshrn v6.8b, v6.8h, #4
+ uqrshrn2 v6.16b, v7.8h, #4
+ st1 {v4.16b}, [\dst], \d_strd
+ st1 {v6.16b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #16
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+.ifc \type, put
+ add \dst, \dst, #16
+.else
+ add \dst, \dst, #32
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl):
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_bilin_hv):
+ uxtl v2.8h, v2.8b
+ uxtl v3.8h, v3.8b
+ adr x9, L(\type\()_bilin_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN hv
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.s}[0], [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.s}[0], [\sr2], \s_strd
+ ld1 {v30.s}[0], [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.4h, v28.4h, v30.4h
+ trn1 v29.4h, v29.4h, v31.4h
+ umull v17.8h, v28.8b, v0.8b
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2s, v16.2s, v17.2s
+
+ mul v4.4h, v16.4h, v2.4h
+ mla v4.4h, v17.4h, v3.4h
+ uqrshrn v4.8b, v4.8h, #8
+ subs \h, \h, #2
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ AARCH64_VALID_JUMP_TARGET
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.8b}, [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+4:
+ ld1 {v28.8b}, [\sr2], \s_strd
+ ld1 {v30.8b}, [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.2s, v28.2s, v30.2s
+ trn1 v29.2s, v29.2s, v31.2s
+ umull v17.8h, v28.8b, v0.8b
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2d, v16.2d, v17.2d
+
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.16b}, [\src], \s_strd
+ ext v29.16b, v28.16b, v28.16b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.16b}, [\sr2], \s_strd
+ ld1 {v30.16b}, [\src], \s_strd
+ ext v29.16b, v28.16b, v28.16b, #1
+ ext v31.16b, v30.16b, v30.16b, #1
+ umull v17.8h, v28.8b, v0.8b
+ umlal v17.8h, v29.8b, v1.8b
+ umull v18.8h, v30.8b, v0.8b
+ umlal v18.8h, v31.8b, v1.8b
+
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8
+ uqrshrn v5.8b, v5.8h, #8
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl):
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
+.macro load_filter_row dst, src, inc
+ asr w13, \src, #10
+ add \src, \src, \inc
+ ldr \dst, [x11, w13, sxtw #3]
+.endm
+
+function warp_filter_horz_neon
+ add w12, w5, #512
+
+ ld1 {v16.8b, v17.8b}, [x2], x3
+
+ load_filter_row d0, w12, w7
+ load_filter_row d1, w12, w7
+ load_filter_row d2, w12, w7
+ load_filter_row d3, w12, w7
+ load_filter_row d4, w12, w7
+ load_filter_row d5, w12, w7
+ load_filter_row d6, w12, w7
+ // subtract 128 (via eor with 0x80) to allow using smull
+ eor v16.8b, v16.8b, v22.8b
+ eor v17.8b, v17.8b, v22.8b
+ load_filter_row d7, w12, w7
+
+ ext v18.8b, v16.8b, v17.8b, #1
+ ext v19.8b, v16.8b, v17.8b, #2
+ smull v0.8h, v0.8b, v16.8b
+ smull v1.8h, v1.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #3
+ ext v20.8b, v16.8b, v17.8b, #4
+ smull v2.8h, v2.8b, v19.8b
+ smull v3.8h, v3.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #5
+ ext v19.8b, v16.8b, v17.8b, #6
+ smull v4.8h, v4.8b, v20.8b
+ smull v5.8h, v5.8b, v18.8b
+ ext v18.8b, v16.8b, v17.8b, #7
+ smull v6.8h, v6.8b, v19.8b
+ smull v7.8h, v7.8b, v18.8b
+
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+
+ addp v0.8h, v0.8h, v4.8h
+
+ add w5, w5, w8
+
+ ret
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my)
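+//
+// Rough outline of the code below: the horizontal pass filters 15 source rows
+// with 8-tap filters from mc_warp_filter, the filter for each pixel selected
+// by ((mx + 512) >> 10) with mx advancing by abcd[0] per pixel and abcd[1] per
+// row; the vertical pass then combines those rows with filters selected the
+// same way from my, advancing by abcd[2] per column and abcd[3] per output row.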
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+ ldr x4, [x4]
+ sbfx x7, x4, #0, #16
+ sbfx x8, x4, #16, #16
+ sbfx x9, x4, #32, #16
+ sbfx x4, x4, #48, #16
+ mov w10, #8
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ sub x2, x2, #3
+ movrel x11, X(mc_warp_filter), 64*8
+ mov x15, x30
+.ifnb \t
+ lsl x1, x1, #1
+.endif
+
+ movi v22.8b, #128
+.ifb \t
+ movi v23.8h, #128
+.else
+ movi v23.8h, #8, lsl #8
+.endif
+
+ bl warp_filter_horz_neon
+ srshr v24.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v25.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v26.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v27.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v28.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v29.8h, v0.8h, #3
+ bl warp_filter_horz_neon
+ srshr v30.8h, v0.8h, #3
+
+1:
+ add w14, w6, #512
+ bl warp_filter_horz_neon
+ srshr v31.8h, v0.8h, #3
+
+ load_filter_row d0, w14, w9
+ load_filter_row d1, w14, w9
+ load_filter_row d2, w14, w9
+ load_filter_row d3, w14, w9
+ load_filter_row d4, w14, w9
+ load_filter_row d5, w14, w9
+ load_filter_row d6, w14, w9
+ load_filter_row d7, w14, w9
+ transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
+
+ // This ordering of smull/smlal/smull2/smlal2 is highly
+ // beneficial for Cortex A53 here.
+ smull v16.4s, v24.4h, v0.4h
+ smlal v16.4s, v25.4h, v1.4h
+ smlal v16.4s, v26.4h, v2.4h
+ smlal v16.4s, v27.4h, v3.4h
+ smlal v16.4s, v28.4h, v4.4h
+ smlal v16.4s, v29.4h, v5.4h
+ smlal v16.4s, v30.4h, v6.4h
+ smlal v16.4s, v31.4h, v7.4h
+ smull2 v17.4s, v24.8h, v0.8h
+ smlal2 v17.4s, v25.8h, v1.8h
+ smlal2 v17.4s, v26.8h, v2.8h
+ smlal2 v17.4s, v27.8h, v3.8h
+ smlal2 v17.4s, v28.8h, v4.8h
+ smlal2 v17.4s, v29.8h, v5.8h
+ smlal2 v17.4s, v30.8h, v6.8h
+ smlal2 v17.4s, v31.8h, v7.8h
+
+ mov v24.16b, v25.16b
+ mov v25.16b, v26.16b
+ sqrshrn v16.4h, v16.4s, #\shift
+ mov v26.16b, v27.16b
+ sqrshrn2 v16.8h, v17.4s, #\shift
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+ add v16.8h, v16.8h, v23.8h
+.ifb \t
+ sqxtun v16.8b, v16.8h
+.endif
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ subs w10, w10, #1
+.ifnb \t
+ st1 {v16.8h}, [x0], x1
+.else
+ st1 {v16.8b}, [x0], x1
+.endif
+
+ add w6, w6, w4
+ b.gt 1b
+
+ ret x15
+endfunc
+.endm
+
+warp , 11
+warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
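+//
+// Conceptually: produce a bw x bh block at dst from an iw x ih reference,
+// copying the overlapping region and filling anything outside the reference
+// by replicating the nearest edge pixel (left/right/top/bottom extensions).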
+function emu_edge_8bpc_neon, export=1
+ ldp x8, x9, [sp]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ ld1r {v0.16b}, [x8]
+ mov x12, x6 // out = dst
+ mov x3, x4
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16
+ b.gt 1b
+.endif
+ mov x13, x8
+ add x12, x6, x4 // out = dst + left_ext
+ mov x3, x2
+1:
+ ld1 {v0.16b, v1.16b}, [x13], #32
+ subs x3, x3, #32
+ st1 {v0.16b, v1.16b}, [x12], #32
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2 // in + center_w
+ sub x3, x3, #1 // in + center_w - 1
+ add x12, x6, x4 // dst + left_ext
+ ld1r {v0.16b}, [x3]
+ add x12, x12, x2 // out = dst + left_ext + center_w
+ mov x3, x11
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7
+ add x8, x8, x9
+ b.gt 0b
+.endm
+
+ cbz x4, 2f
+ // need_left
+ cbz x11, 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
+ cbz x10, 3f
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0
+1:
+ ld1 {v0.16b, v1.16b}, [x8], #32
+ mov x3, x10
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f
+ // need_top
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.16b, v1.16b}, [x14], #32
+ mov x3, x5
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/mc16.S b/third_party/dav1d/src/arm/64/mc16.S
new file mode 100644
index 0000000000..1bfb12ebb3
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -0,0 +1,3611 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
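+// avg: per output pixel, roughly
+//   dst = iclip_pixel((tmp1 + tmp2 + 2*PREP_BIAS + (1 << intermediate_bits))
+//                      >> (intermediate_bits + 1))
+// implemented with saturating adds/subtracts plus a negative sshl for the
+// final shift (see the constant setup in bidir_fn below).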
+.macro avg d0, d1, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sqadd \t0\().8h, \t0\().8h, \t2\().8h
+ sqadd \t1\().8h, \t1\().8h, \t3\().8h
+ smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+ smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+ sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+ sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
+ sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
+ sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
+.endm
+
+.macro w_avg d0, d1, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ // This difference requires a 17-bit range, and all bits are
+ // significant for the following multiplication.
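+ // Roughly: d = tmp2 + (((tmp2 - tmp1) * -weight) >> 4)
+ //            ~= (tmp1*weight + tmp2*(16 - weight)) >> 4,
+ // followed by the rounding shift, bias and clamp shared with the mask macro.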
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v27.4s
+ mul \t0\().4s, \t0\().4s, v27.4s
+ mul \d1\().4s, \d1\().4s, v27.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #4
+ sshr \t0\().4s, \t0\().4s, #4
+ sshr \d1\().4s, \d1\().4s, #4
+ sshr \t1\().4s, \t1\().4s, #4
+ saddw \d0\().4s, \d0\().4s, \t2\().4h
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
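+// mask: like w_avg, but with a per-pixel weight m (0..64) loaded from the
+// mask buffer; roughly dst = (tmp1*m + tmp2*(64 - m)) >> 6 before the same
+// rounding/bias/clamp as above.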
+.macro mask d0, d1, t0, t1, t2, t3
+ ld1 {v27.16b}, [x6], 16
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ neg v27.16b, v27.16b
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sxtl v26.8h, v27.8b
+ sxtl2 v27.8h, v27.16b
+ sxtl v24.4s, v26.4h
+ sxtl2 v25.4s, v26.8h
+ sxtl v26.4s, v27.4h
+ sxtl2 v27.4s, v27.8h
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v24.4s
+ mul \t0\().4s, \t0\().4s, v25.4s
+ mul \d1\().4s, \d1\().4s, v26.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #6
+ sshr \t0\().4s, \t0\().4s, #6
+ sshr \d1\().4s, \d1\().4s, #6
+ sshr \t1\().4s, \t1\().4s, #6
+ saddw \d0\().4s, \d0\().4s, \t2\().4h
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2
+ uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+ clz w4, w4
+.ifnc \type, avg
+ dup v31.8h, \bdmax // bitdepth_max
+ movi v30.8h, #0
+.endif
+ clz w7, \bdmax
+ sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
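+ // e.g. 10 bpc: clz(1023) == 22 -> intermediate_bits == 4,
+ // 12 bpc: clz(4095) == 20 -> intermediate_bits == 2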
+.ifc \type, avg
+ mov w9, #1
+ mov w8, #-2*PREP_BIAS
+ lsl w9, w9, w7 // 1 << intermediate_bits
+ add w7, w7, #1
+ sub w8, w8, w9 // -2*PREP_BIAS - (1 << intermediate_bits)
+ neg w7, w7 // -(intermediate_bits+1)
+ dup v28.8h, w8 // -2*PREP_BIAS - (1 << intermediate_bits)
+ dup v29.8h, w7 // -(intermediate_bits+1)
+.else
+ mov w8, #PREP_BIAS
+ lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits
+ neg w7, w7 // -intermediate_bits
+ dup v28.8h, w8 // PREP_BIAS >> intermediate_bits
+ dup v29.8h, w7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ dup v27.4s, w6
+ neg v27.4s, v27.4s
+.endif
+ adr x7, L(\type\()_tbl)
+ sub w4, w4, #24
+ \type v4, v5, v0, v1, v2, v3
+ ldrh w4, [x7, x4, lsl #1]
+ sub x7, x7, w4, uxtw
+ br x7
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+4:
+ subs w5, w5, #4
+ st1 {v4.d}[0], [x0], x1
+ st1 {v4.d}[1], [x7], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v5.d}[1], [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 4b
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, x1
+ lsl x1, x1, #1
+8:
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 8b
+16:
+ AARCH64_VALID_JUMP_TARGET
+ \type v6, v7, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v6.8h, v7.8h}, [x0], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 16b
+32:
+ AARCH64_VALID_JUMP_TARGET
+ \type v6, v7, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 32b
+640:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64
+64:
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 64b
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ add x7, x0, #64
+ mov x8, #128
+ sub x1, x1, #128
+128:
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
+ \type v4, v5, v0, v1, v2, v3
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+L(\type\()_tbl):
+ .hword L(\type\()_tbl) - 1280b
+ .hword L(\type\()_tbl) - 640b
+ .hword L(\type\()_tbl) - 32b
+ .hword L(\type\()_tbl) - 16b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
+
+bidir_fn avg, w6
+bidir_fn w_avg, w7
+bidir_fn mask, w7
+
+
+.macro w_mask_fn type
+function w_mask_\type\()_16bpc_neon, export=1
+ ldr w8, [sp]
+ clz w9, w4
+ adr x10, L(w_mask_\type\()_tbl)
+ dup v31.8h, w8 // bitdepth_max
+ sub w9, w9, #24
+ clz w8, w8 // clz(bitdepth_max)
+ ldrh w9, [x10, x9, lsl #1]
+ sub x10, x10, w9, uxtw
+ sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
+ mov w9, #PREP_BIAS*64
+ neg w8, w8 // -sh
+ mov w11, #27615 // ((64 + 1 - 38) << mask_sh) - 1 - mask_rnd
+ dup v30.4s, w9 // PREP_BIAS*64
+ dup v29.4s, w8 // -sh
+ dup v0.8h, w11
+.if \type == 444
+ movi v1.16b, #64
+.elseif \type == 422
+ dup v2.8b, w7
+ movi v3.8b, #129
+ sub v3.8b, v3.8b, v2.8b
+.elseif \type == 420
+ dup v2.8h, w7
+ movi v3.8h, #1, lsl #8
+ sub v3.8h, v3.8h, v2.8h
+.endif
+ add x12, x0, x1
+ lsl x1, x1, #1
+ br x10
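+ // Rough per-pixel sketch: a weight m in [38, 64] is derived from
+ // |tmp1 - tmp2| (it grows with the difference), the output is
+ // iclip_pixel((tmp1*m + tmp2*(64-m) + PREP_BIAS*64 + (1 << (sh-1))) >> sh),
+ // and the blend mask is written out as well (full resolution for 444,
+ // halved horizontally for 422, halved in both directions for 420).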
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
+ subs w5, w5, #4
+ sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v7.8h
+ ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v6.8h, v4.8h
+ ssubl v18.4s, v7.4h, v5.4h
+ ssubl2 v19.4s, v7.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
+ sshll v6.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ uxtl v22.4s, v20.4h
+ uxtl2 v23.4s, v20.8h
+ uxtl v24.4s, v21.4h
+ uxtl2 v25.4s, v21.8h
+ mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
+ mla v5.4s, v17.4s, v23.4s
+ mla v6.4s, v18.4s, v24.4s
+ mla v7.4s, v19.4s, v25.4s
+ srshl v4.4s, v4.4s, v29.4s // ((tmp1<<6) + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v6.4s
+ sqxtun2 v5.8h, v7.4s
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ sub v20.16b, v1.16b, v20.16b // m
+ st1 {v20.16b}, [x6], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ xtn v20.8b, v20.8h
+ uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ st1 {v20.8b}, [x6], #8
+.elseif \type == 420
+ trn1 v24.2d, v20.2d, v21.2d
+ trn2 v25.2d, v20.2d, v21.2d
+ add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition)
+ addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.s}[0], [x6], #4
+.endif
+ st1 {v4.d}[0], [x0], x1
+ st1 {v4.d}[1], [x12], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v5.d}[1], [x12], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
+ ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2
+ subs w5, w5, #2
+ sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v7.8h
+ ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v6.8h, v4.8h
+ ssubl v18.4s, v7.4h, v5.4h
+ ssubl2 v19.4s, v7.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v7.4s, v5.8h, #6 // tmp1 << 6
+ sshll v6.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v6.4s, v6.4s, v30.4s
+ add v7.4s, v7.4s, v30.4s
+ uxtl v22.4s, v20.4h
+ uxtl2 v23.4s, v20.8h
+ uxtl v24.4s, v21.4h
+ uxtl2 v25.4s, v21.8h
+ mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
+ mla v5.4s, v17.4s, v23.4s
+ mla v6.4s, v18.4s, v24.4s
+ mla v7.4s, v19.4s, v25.4s
+ srshl v4.4s, v4.4s, v29.4s // ((tmp1<<6) + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v6.4s, v6.4s, v29.4s
+ srshl v7.4s, v7.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v6.4s
+ sqxtun2 v5.8h, v7.4s
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ sub v20.16b, v1.16b, v20.16b // m
+ st1 {v20.16b}, [x6], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ xtn v20.8b, v20.8h
+ uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ st1 {v20.8b}, [x6], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition)
+ addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.s}[0], [x6], #4
+.endif
+ st1 {v4.8h}, [x0], x1
+ st1 {v5.8h}, [x12], x1
+ b.gt 8b
+ ret
+1280:
+640:
+320:
+160:
+ AARCH64_VALID_JUMP_TARGET
+ mov w11, w4
+ sub x1, x1, w4, uxtw #1
+.if \type == 444
+ add x10, x6, w4, uxtw
+.elseif \type == 422
+ add x10, x6, x11, lsr #1
+.endif
+ add x9, x3, w4, uxtw #1
+ add x7, x2, w4, uxtw #1
+161:
+ mov w8, w4
+16:
+ ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1
+ ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2
+ ld1 {v6.8h, v7.8h}, [x7], #32
+ ld1 {v18.8h, v19.8h}, [x9], #32
+ subs w8, w8, #16
+ sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2)
+ sabd v21.8h, v5.8h, v17.8h
+ ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v23.4s, v16.8h, v4.8h
+ ssubl v24.4s, v17.4h, v5.4h
+ ssubl2 v25.4s, v17.8h, v5.8h
+ uqsub v20.8h, v0.8h, v20.8h // 27615 - abs()
+ uqsub v21.8h, v0.8h, v21.8h
+ sshll2 v27.4s, v5.8h, #6 // tmp1 << 6
+ sshll v26.4s, v5.4h, #6
+ sshll2 v5.4s, v4.8h, #6
+ sshll v4.4s, v4.4h, #6
+ ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v21.8h, v21.8h, #10
+ add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
+ add v5.4s, v5.4s, v30.4s
+ add v26.4s, v26.4s, v30.4s
+ add v27.4s, v27.4s, v30.4s
+ uxtl v16.4s, v20.4h
+ uxtl2 v17.4s, v20.8h
+ uxtl v28.4s, v21.4h
+ mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m)
+ uxtl2 v16.4s, v21.8h
+ mla v5.4s, v23.4s, v17.4s
+ mla v26.4s, v24.4s, v28.4s
+ mla v27.4s, v25.4s, v16.4s
+ srshl v4.4s, v4.4s, v29.4s // ((tmp1<<6) + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v5.4s, v5.4s, v29.4s
+ srshl v26.4s, v26.4s, v29.4s
+ srshl v27.4s, v27.4s, v29.4s
+ sqxtun v4.4h, v4.4s // iclip_pixel
+ sqxtun2 v4.8h, v5.4s
+ sqxtun v5.4h, v26.4s
+ sqxtun2 v5.8h, v27.4s
+
+ // Start of other half
+ sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2)
+ sabd v23.8h, v7.8h, v19.8h
+
+ umin v4.8h, v4.8h, v31.8h // iclip_pixel
+ umin v5.8h, v5.8h, v31.8h
+
+ ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit)
+ ssubl2 v17.4s, v18.8h, v6.8h
+ ssubl v18.4s, v19.4h, v7.4h
+ ssubl2 v19.4s, v19.8h, v7.8h
+ uqsub v22.8h, v0.8h, v22.8h // 27615 - abs()
+ uqsub v23.8h, v0.8h, v23.8h
+ sshll v24.4s, v6.4h, #6 // tmp1 << 6
+ sshll2 v25.4s, v6.8h, #6
+ sshll v26.4s, v7.4h, #6
+ sshll2 v27.4s, v7.8h, #6
+ ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh
+ ushr v23.8h, v23.8h, #10
+ add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64
+ add v25.4s, v25.4s, v30.4s
+ add v26.4s, v26.4s, v30.4s
+ add v27.4s, v27.4s, v30.4s
+ uxtl v6.4s, v22.4h
+ uxtl2 v7.4s, v22.8h
+ uxtl v28.4s, v23.4h
+ mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m)
+ uxtl2 v6.4s, v23.8h
+ mla v25.4s, v17.4s, v7.4s
+ mla v26.4s, v18.4s, v28.4s
+ mla v27.4s, v19.4s, v6.4s
+ srshl v24.4s, v24.4s, v29.4s // ((tmp1<<6) + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
+ srshl v25.4s, v25.4s, v29.4s
+ srshl v26.4s, v26.4s, v29.4s
+ srshl v27.4s, v27.4s, v29.4s
+ sqxtun v6.4h, v24.4s // iclip_pixel
+ sqxtun2 v6.8h, v25.4s
+ sqxtun v7.4h, v26.4s
+ sqxtun2 v7.8h, v27.4s
+ umin v6.8h, v6.8h, v31.8h // iclip_pixel
+ umin v7.8h, v7.8h, v31.8h
+.if \type == 444
+ uzp1 v20.16b, v20.16b, v21.16b // 64 - m
+ uzp1 v21.16b, v22.16b, v23.16b
+ sub v20.16b, v1.16b, v20.16b // m
+ sub v21.16b, v1.16b, v21.16b
+ st1 {v20.16b}, [x6], #16
+ st1 {v21.16b}, [x10], #16
+.elseif \type == 422
+ addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
+ addp v21.8h, v22.8h, v23.8h
+ xtn v20.8b, v20.8h
+ xtn v21.8b, v21.8h
+ uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
+ uhsub v21.8b, v3.8b, v21.8b
+ st1 {v20.8b}, [x6], #8
+ st1 {v21.8b}, [x10], #8
+.elseif \type == 420
+ add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
+ add v21.8h, v21.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
+ sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
+ rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
+ st1 {v20.8b}, [x6], #8
+.endif
+ st1 {v4.8h, v5.8h}, [x0], #32
+ st1 {v6.8h, v7.8h}, [x12], #32
+ b.gt 16b
+ subs w5, w5, #2
+ add x2, x2, w4, uxtw #1
+ add x3, x3, w4, uxtw #1
+ add x7, x7, w4, uxtw #1
+ add x9, x9, w4, uxtw #1
+.if \type == 444
+ add x6, x6, w4, uxtw
+ add x10, x10, w4, uxtw
+.elseif \type == 422
+ add x6, x6, x11, lsr #1
+ add x10, x10, x11, lsr #1
+.endif
+ add x0, x0, x1
+ add x12, x12, x1
+ b.gt 161b
+ ret
+L(w_mask_\type\()_tbl):
+ .hword L(w_mask_\type\()_tbl) - 1280b
+ .hword L(w_mask_\type\()_tbl) - 640b
+ .hword L(w_mask_\type\()_tbl) - 320b
+ .hword L(w_mask_\type\()_tbl) - 160b
+ .hword L(w_mask_\type\()_tbl) - 8b
+ .hword L(w_mask_\type\()_tbl) - 4b
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
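+// blend (and blend_h/blend_v below) compute, per pixel, roughly
+//   dst = dst + (((dst - tmp)*-m + 32) >> 6)
+//       = (dst*(64 - m) + tmp*m + 32) >> 6
+// The multiply/round/shift is done in one sqrdmulh against (-m << 9), since
+// sqrdmulh(a, b) = (2*a*b + (1 << 15)) >> 16.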
+function blend_16bpc_neon, export=1
+ adr x6, L(blend_tbl)
+ clz w3, w3
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ add x8, x0, x1
+ br x6
+40:
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1
+4:
+ ld1 {v2.8b}, [x5], #8
+ ld1 {v1.8h}, [x2], #16
+ ld1 {v0.d}[0], [x0]
+ neg v2.8b, v2.8b // -m
+ subs w4, w4, #2
+ ld1 {v0.d}[1], [x8]
+ sxtl v2.8h, v2.8b
+ shl v2.8h, v2.8h, #9 // -m << 9
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h
+ st1 {v0.d}[0], [x0], x1
+ st1 {v0.d}[1], [x8], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1
+8:
+ ld1 {v4.16b}, [x5], #16
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ neg v5.16b, v4.16b // -m
+ ld1 {v0.8h}, [x0]
+ ld1 {v1.8h}, [x8]
+ sxtl v4.8h, v5.8b
+ sxtl2 v5.8h, v5.16b
+ shl v4.8h, v4.8h, #9 // -m << 9
+ shl v5.8h, v5.8h, #9
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ subs w4, w4, #2
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v5.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ lsl x1, x1, #1
+16:
+ ld1 {v16.16b, v17.16b}, [x5], #32
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ subs w4, w4, #2
+ neg v18.16b, v16.16b // -m
+ neg v19.16b, v17.16b
+ ld1 {v0.8h, v1.8h}, [x0]
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ ld1 {v2.8h, v3.8h}, [x8]
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ shl v18.8h, v18.8h, #9
+ shl v19.8h, v19.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v17.8h
+ sqrdmulh v6.8h, v6.8h, v18.8h
+ sqrdmulh v7.8h, v7.8h, v19.8h
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v2.8h, v3.8h}, [x8], x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b, v17.16b}, [x5], #32
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ subs w4, w4, #1
+ neg v18.16b, v16.16b // -m
+ neg v19.16b, v17.16b
+ sxtl v16.8h, v18.8b
+ sxtl2 v17.8h, v18.16b
+ sxtl v18.8h, v19.8b
+ sxtl2 v19.8h, v19.16b
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ shl v18.8h, v18.8h, #9
+ shl v19.8h, v19.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v17.8h
+ sqrdmulh v6.8h, v6.8h, v18.8h
+ sqrdmulh v7.8h, v7.8h, v19.8h
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ b.gt 32b
+ ret
+L(blend_tbl):
+ .hword L(blend_tbl) - 32b
+ .hword L(blend_tbl) - 160b
+ .hword L(blend_tbl) - 80b
+ .hword L(blend_tbl) - 40b
+endfunc
+
+function blend_h_16bpc_neon, export=1
+ adr x6, L(blend_h_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w4, uxtw
+ sub w4, w4, w4, lsr #2
+ clz w7, w3
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w7, w7, #24
+ ldrh w7, [x6, x7, lsl #1]
+ sub x6, x6, w7, uxtw
+ br x6
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v2.8b, v3.8b}, [x5], #2
+ ld1 {v1.4h}, [x2], #8
+ ext v2.8b, v2.8b, v3.8b, #6
+ subs w4, w4, #2
+ neg v2.8b, v2.8b // -m
+ ld1 {v0.s}[0], [x0]
+ ld1 {v0.s}[1], [x8]
+ sxtl v2.8h, v2.8b
+ shl v2.4h, v2.4h, #9 // -m << 9
+ sub v1.4h, v0.4h, v1.4h // a - b
+ sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
+ add v0.4h, v0.4h, v1.4h
+ st1 {v0.s}[0], [x0], x1
+ st1 {v0.s}[1], [x8], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v2.8b, v3.8b}, [x5], #2
+ ld1 {v1.8h}, [x2], #16
+ ext v2.8b, v2.8b, v3.8b, #4
+ subs w4, w4, #2
+ neg v2.8b, v2.8b // -m
+ ld1 {v0.d}[0], [x0]
+ ld1 {v0.d}[1], [x8]
+ sxtl v2.8h, v2.8b
+ shl v2.8h, v2.8h, #9 // -m << 9
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h
+ st1 {v0.d}[0], [x0], x1
+ st1 {v0.d}[1], [x8], x1
+ b.gt 4b
+ ret
+8:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v4.8b, v5.8b}, [x5], #2
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ neg v4.8b, v4.8b // -m
+ neg v5.8b, v5.8b
+ ld1 {v0.8h}, [x0]
+ subs w4, w4, #2
+ sxtl v4.8h, v4.8b
+ sxtl v5.8h, v5.8b
+ ld1 {v1.8h}, [x8]
+ shl v4.8h, v4.8h, #9 // -m << 9
+ shl v5.8h, v5.8h, #9
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v5.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ld2r {v16.8b, v17.8b}, [x5], #2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ neg v16.8b, v16.8b // -m
+ neg v17.8b, v17.8b
+ ld1 {v0.8h, v1.8h}, [x0]
+ ld1 {v2.8h, v3.8h}, [x8]
+ subs w4, w4, #2
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.8h, v17.8h, #9
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.8h, v1.8h, v5.8h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.8h, v3.8h, v7.8h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.8h, v5.8h, v16.8h
+ sqrdmulh v6.8h, v6.8h, v17.8h
+ sqrdmulh v7.8h, v7.8h, v17.8h
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v2.8h, v3.8h}, [x8], x1
+ b.gt 16b
+ ret
+1280:
+640:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ sub x1, x1, w3, uxtw #1
+ add x7, x2, w3, uxtw #1
+321:
+ ld2r {v24.8b, v25.8b}, [x5], #2
+ mov w6, w3
+ neg v24.8b, v24.8b // -m
+ neg v25.8b, v25.8b
+ sxtl v24.8h, v24.8b
+ sxtl v25.8h, v25.8b
+ shl v24.8h, v24.8h, #9 // -m << 9
+ shl v25.8h, v25.8h, #9
+32:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+ subs w6, w6, #32
+ sub v16.8h, v0.8h, v16.8h // a - b
+ sub v17.8h, v1.8h, v17.8h
+ sub v18.8h, v2.8h, v18.8h
+ sub v19.8h, v3.8h, v19.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8]
+ sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v17.8h, v17.8h, v24.8h
+ sqrdmulh v18.8h, v18.8h, v24.8h
+ sqrdmulh v19.8h, v19.8h, v24.8h
+ sub v20.8h, v4.8h, v20.8h // a - b
+ sub v21.8h, v5.8h, v21.8h
+ sub v22.8h, v6.8h, v22.8h
+ sub v23.8h, v7.8h, v23.8h
+ add v0.8h, v0.8h, v16.8h
+ add v1.8h, v1.8h, v17.8h
+ add v2.8h, v2.8h, v18.8h
+ add v3.8h, v3.8h, v19.8h
+ sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v21.8h, v21.8h, v25.8h
+ sqrdmulh v22.8h, v22.8h, v25.8h
+ sqrdmulh v23.8h, v23.8h, v25.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v4.8h, v4.8h, v20.8h
+ add v5.8h, v5.8h, v21.8h
+ add v6.8h, v6.8h, v22.8h
+ add v7.8h, v7.8h, v23.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64
+ b.gt 32b
+ subs w4, w4, #2
+ add x0, x0, x1
+ add x8, x8, x1
+ add x2, x2, w3, uxtw #1
+ add x7, x7, w3, uxtw #1
+ b.gt 321b
+ ret
+L(blend_h_tbl):
+ .hword L(blend_h_tbl) - 1280b
+ .hword L(blend_h_tbl) - 640b
+ .hword L(blend_h_tbl) - 320b
+ .hword L(blend_h_tbl) - 16b
+ .hword L(blend_h_tbl) - 8b
+ .hword L(blend_h_tbl) - 4b
+ .hword L(blend_h_tbl) - 2b
+endfunc
+
+function blend_v_16bpc_neon, export=1
+ adr x6, L(blend_v_tbl)
+ movrel x5, X(obmc_masks)
+ add x5, x5, w3, uxtw
+ clz w3, w3
+ add x8, x0, x1
+ lsl x1, x1, #1
+ sub w3, w3, #26
+ ldrh w3, [x6, x3, lsl #1]
+ sub x6, x6, w3, uxtw
+ br x6
+20:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v2.8b}, [x5]
+ neg v2.8b, v2.8b // -m
+ sxtl v2.8h, v2.8b
+ shl v2.4h, v2.4h, #9 // -m << 9
+2:
+ ld1 {v1.s}[0], [x2], #4
+ ld1 {v0.h}[0], [x0]
+ subs w4, w4, #2
+ ld1 {v1.h}[1], [x2]
+ ld1 {v0.h}[1], [x8]
+ add x2, x2, #4
+ sub v1.4h, v0.4h, v1.4h // a - b
+ sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6
+ add v0.4h, v0.4h, v1.4h
+ st1 {v0.h}[0], [x0], x1
+ st1 {v0.h}[1], [x8], x1
+ b.gt 2b
+ ret
+40:
+ AARCH64_VALID_JUMP_TARGET
+ ld1r {v2.2s}, [x5]
+ sub x1, x1, #4
+ neg v2.8b, v2.8b // -m
+ sxtl v2.8h, v2.8b
+ shl v2.8h, v2.8h, #9 // -m << 9
+4:
+ ld1 {v1.8h}, [x2], #16
+ ld1 {v0.d}[0], [x0]
+ ld1 {v0.d}[1], [x8]
+ subs w4, w4, #2
+ sub v1.8h, v0.8h, v1.8h // a - b
+ sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
+ add v0.8h, v0.8h, v1.8h
+ st1 {v0.s}[0], [x0], #4
+ st1 {v0.s}[2], [x8], #4
+ st1 {v0.h}[2], [x0], x1
+ st1 {v0.h}[6], [x8], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v4.8b}, [x5]
+ sub x1, x1, #8
+ neg v4.8b, v4.8b // -m
+ sxtl v4.8h, v4.8b
+ shl v4.8h, v4.8h, #9 // -m << 9
+8:
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ ld1 {v0.8h}, [x0]
+ ld1 {v1.8h}, [x8]
+ subs w4, w4, #2
+ sub v2.8h, v0.8h, v2.8h // a - b
+ sub v3.8h, v1.8h, v3.8h
+ sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v3.8h, v3.8h, v4.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ st1 {v0.d}[0], [x0], #8
+ st1 {v1.d}[0], [x8], #8
+ st1 {v0.s}[2], [x0], x1
+ st1 {v1.s}[2], [x8], x1
+ b.gt 8b
+ ret
+160:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v16.16b}, [x5]
+ sub x1, x1, #16
+ neg v17.16b, v16.16b // -m
+ sxtl v16.8h, v17.8b
+ sxtl2 v17.8h, v17.16b
+ shl v16.8h, v16.8h, #9 // -m << 9
+ shl v17.4h, v17.4h, #9
+16:
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w4, w4, #2
+ ld1 {v2.8h, v3.8h}, [x8]
+ sub v4.8h, v0.8h, v4.8h // a - b
+ sub v5.4h, v1.4h, v5.4h
+ sub v6.8h, v2.8h, v6.8h
+ sub v7.4h, v3.4h, v7.4h
+ sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v5.4h, v5.4h, v17.4h
+ sqrdmulh v6.8h, v6.8h, v16.8h
+ sqrdmulh v7.4h, v7.4h, v17.4h
+ add v0.8h, v0.8h, v4.8h
+ add v1.4h, v1.4h, v5.4h
+ add v2.8h, v2.8h, v6.8h
+ add v3.4h, v3.4h, v7.4h
+ st1 {v0.8h}, [x0], #16
+ st1 {v2.8h}, [x8], #16
+ st1 {v1.4h}, [x0], x1
+ st1 {v3.4h}, [x8], x1
+ b.gt 16b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v24.16b, v25.16b}, [x5]
+ neg v26.16b, v24.16b // -m
+ neg v27.8b, v25.8b
+ sxtl v24.8h, v26.8b
+ sxtl2 v25.8h, v26.16b
+ sxtl v26.8h, v27.8b
+ shl v24.8h, v24.8h, #9 // -m << 9
+ shl v25.8h, v25.8h, #9
+ shl v26.8h, v26.8h, #9
+32:
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h, v2.8h}, [x0]
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
+ ld1 {v4.8h, v5.8h, v6.8h}, [x8]
+ subs w4, w4, #2
+ sub v16.8h, v0.8h, v16.8h // a - b
+ sub v17.8h, v1.8h, v17.8h
+ sub v18.8h, v2.8h, v18.8h
+ sub v20.8h, v4.8h, v20.8h
+ sub v21.8h, v5.8h, v21.8h
+ sub v22.8h, v6.8h, v22.8h
+ sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6
+ sqrdmulh v17.8h, v17.8h, v25.8h
+ sqrdmulh v18.8h, v18.8h, v26.8h
+ sqrdmulh v20.8h, v20.8h, v24.8h
+ sqrdmulh v21.8h, v21.8h, v25.8h
+ sqrdmulh v22.8h, v22.8h, v26.8h
+ add v0.8h, v0.8h, v16.8h
+ add v1.8h, v1.8h, v17.8h
+ add v2.8h, v2.8h, v18.8h
+ add v4.8h, v4.8h, v20.8h
+ add v5.8h, v5.8h, v21.8h
+ add v6.8h, v6.8h, v22.8h
+ st1 {v0.8h, v1.8h, v2.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
+ b.gt 32b
+ ret
+L(blend_v_tbl):
+ .hword L(blend_v_tbl) - 320b
+ .hword L(blend_v_tbl) - 160b
+ .hword L(blend_v_tbl) - 80b
+ .hword L(blend_v_tbl) - 40b
+ .hword L(blend_v_tbl) - 20b
+endfunc
+
+
+// This has the same signature as the put_8tap functions,
+// and assumes that x9 is set to (clz(w)-24).
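+// It is a plain width x height copy, dispatched on the width via L(put_tbl).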
+function put_neon
+ adr x10, L(put_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+ sub x10, x10, w9, uxtw
+ br x10
+
+2:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 2b
+ ret
+4:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.4h}, [x2], x3
+ ld1 {v1.4h}, [x2], x3
+ subs w5, w5, #2
+ st1 {v0.4h}, [x0], x1
+ st1 {v1.4h}, [x0], x1
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x8, x0, x1
+ lsl x1, x1, #1
+ add x9, x2, x3
+ lsl x3, x3, #1
+8:
+ ld1 {v0.8h}, [x2], x3
+ ld1 {v1.8h}, [x9], x3
+ subs w5, w5, #2
+ st1 {v0.8h}, [x0], x1
+ st1 {v1.8h}, [x8], x1
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 64b
+ ret
+128:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ ldp q16, q17, [x2, #128]
+ stp q6, q7, [x0, #96]
+ ldp q18, q19, [x2, #160]
+ stp q16, q17, [x0, #128]
+ ldp q20, q21, [x2, #192]
+ stp q18, q19, [x0, #160]
+ ldp q22, q23, [x2, #224]
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 128b
+ ret
+
+L(put_tbl):
+ .hword L(put_tbl) - 128b
+ .hword L(put_tbl) - 64b
+ .hword L(put_tbl) - 32b
+ .hword L(put_tbl) - 16b
+ .hword L(put_tbl) - 80b
+ .hword L(put_tbl) - 4b
+ .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has the same signature as the prep_8tap functions,
+// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
+// x8 to w*2.
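+// Each size simply converts pixels to the intermediate format:
+//   out = (in << intermediate_bits) - PREP_BIAS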
+function prep_neon
+ adr x10, L(prep_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+ dup v31.8h, w7 // intermediate_bits
+ movi v30.8h, #(PREP_BIAS >> 8), lsl #8
+ sub x10, x10, w9, uxtw
+ br x10
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2
+ lsl x2, x2, #1
+4:
+ ld1 {v0.d}[0], [x1], x2
+ ld1 {v0.d}[1], [x9], x2
+ subs w4, w4, #2
+ sshl v0.8h, v0.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ st1 {v0.8h}, [x0], #16
+ b.gt 4b
+ ret
+80:
+ AARCH64_VALID_JUMP_TARGET
+ add x9, x1, x2
+ lsl x2, x2, #1
+8:
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x9], x2
+ subs w4, w4, #2
+ sshl v0.8h, v0.8h, v31.8h
+ sshl v1.8h, v1.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+16:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ add x1, x1, x2
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1]
+ add x1, x1, x2
+ subs w4, w4, #2
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 16b
+ ret
+32:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ add x1, x1, x2
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ subs w4, w4, #1
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 32b
+ ret
+64:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ add x1, x1, x2
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x0, x0, x8
+ b.gt 64b
+ ret
+128:
+ AARCH64_VALID_JUMP_TARGET
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ ldp q16, q17, [x1, #128]
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ ldp q18, q19, [x1, #160]
+ sshl v16.8h, v16.8h, v31.8h
+ sshl v17.8h, v17.8h, v31.8h
+ ldp q20, q21, [x1, #192]
+ sshl v18.8h, v18.8h, v31.8h
+ sshl v19.8h, v19.8h, v31.8h
+ ldp q22, q23, [x1, #224]
+ add x1, x1, x2
+ sshl v20.8h, v20.8h, v31.8h
+ sshl v21.8h, v21.8h, v31.8h
+ sshl v22.8h, v22.8h, v31.8h
+ sshl v23.8h, v23.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ sub v16.8h, v16.8h, v30.8h
+ sub v17.8h, v17.8h, v30.8h
+ stp q6, q7, [x0, #96]
+ sub v18.8h, v18.8h, v30.8h
+ sub v19.8h, v19.8h, v30.8h
+ stp q16, q17, [x0, #128]
+ sub v20.8h, v20.8h, v30.8h
+ sub v21.8h, v21.8h, v30.8h
+ stp q18, q19, [x0, #160]
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x0, x0, x8
+ b.gt 128b
+ ret
+
+L(prep_tbl):
+ .hword L(prep_tbl) - 128b
+ .hword L(prep_tbl) - 64b
+ .hword L(prep_tbl) - 32b
+ .hword L(prep_tbl) - 16b
+ .hword L(prep_tbl) - 80b
+ .hword L(prep_tbl) - 40b
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}[0], [\s0], \strd
+ ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}[0], [\s0], \strd
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}, [\s0], \strd
+ ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}, [\s0], \strd
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
+ ld1 {\d0\wd, \d1\wd}, [\s0], \strd
+.ifnb \d2
+ ld1 {\d2\wd, \d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd, \d5\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
+ load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+ trn1 \r0\wd, \r0\wd, \r1\wd
+ trn1 \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+ trn1 \r2\wd, \r2\wd, \r3\wd
+ trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro umin_h c, wd, r0, r1, r2, r3
+ umin \r0\wd, \r0\wd, \c\wd
+.ifnb \r1
+ umin \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ umin \r2\wd, \r2\wd, \c\wd
+ umin \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro sub_h c, wd, r0, r1, r2, r3
+ sub \r0\wd, \r0\wd, \c\wd
+.ifnb \r1
+ sub \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ sub \r2\wd, \r2\wd, \c\wd
+ sub \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro smull_smlal_4 d, s0, s1, s2, s3
+ smull \d\().4s, \s0\().4h, v0.h[0]
+ smlal \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+.endm
+.macro smull2_smlal2_4 d, s0, s1, s2, s3
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+.endm
+.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull \d\().4s, \s0\().4h, v0.h[0]
+ smlal \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+ smlal \d\().4s, \s7\().4h, v0.h[7]
+.endm
+.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+ smlal2 \d\().4s, \s7\().8h, v0.h[7]
+.endm
+.macro sqrshrun_h shift, r0, r1, r2, r3
+ sqrshrun \r0\().4h, \r0\().4s, #\shift
+.ifnb \r1
+ sqrshrun2 \r0\().8h, \r1\().4s, #\shift
+.endif
+.ifnb \r2
+ sqrshrun \r2\().4h, \r2\().4s, #\shift
+ sqrshrun2 \r2\().8h, \r3\().4s, #\shift
+.endif
+.endm
+.macro xtn_h r0, r1, r2, r3
+ uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
+.ifnb \r2
+ uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto
+.endif
+.endm
+.macro srshl_s shift, r0, r1, r2, r3
+ srshl \r0\().4s, \r0\().4s, \shift\().4s
+ srshl \r1\().4s, \r1\().4s, \shift\().4s
+.ifnb \r2
+ srshl \r2\().4s, \r2\().4s, \shift\().4s
+ srshl \r3\().4s, \r3\().4s, \shift\().4s
+.endif
+.endm
+.macro st_s strd, reg, lanes
+ st1 {\reg\().s}[0], [x0], \strd
+ st1 {\reg\().s}[1], [x9], \strd
+.if \lanes > 2
+ st1 {\reg\().s}[2], [x0], \strd
+ st1 {\reg\().s}[3], [x9], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+ st1 {\r0\().d}[0], [x0], \strd
+ st1 {\r0\().d}[1], [x9], \strd
+.ifnb \r1
+ st1 {\r1\().d}[0], [x0], \strd
+ st1 {\r1\().d}[1], [x9], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin_h v31, .8h, \r0, \r2
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_d \strd, \r0, \r2
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+ st1 {\r0\wd}, [x0], \strd
+ st1 {\r1\wd}, [x9], \strd
+.ifnb \r2
+ st1 {\r2\wd}, [x0], \strd
+ st1 {\r3\wd}, [x9], \strd
+.endif
+.ifnb \r4
+ st1 {\r4\wd}, [x0], \strd
+ st1 {\r5\wd}, [x9], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x9], \strd
+.endif
+.endm
+.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
+ st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin_h v31, .8h, \r0, \r2
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_8h \strd, \r0, \r2
+.endm
+.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin \r0\().8h, \r0\().8h, v31.8h
+ umin \r1\().8h, \r2\().8h, v31.8h
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub \r0\().8h, \r0\().8h, v29.8h
+ sub \r1\().8h, \r2\().8h, v29.8h
+.endif
+ st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ mov w9, \type_h
+ mov w10, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
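+// Each constant packs two 7-bit offsets into mc_subpel_filters (15 entries
+// per filter set): bits 7..13 select the set used for w > 4, bits 0..6 the
+// set used for w <= 4 (see the cmp \w, #4 below).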
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+.ifc \bdmax, w8
+ ldr w8, [sp]
+.endif
+ mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w11
+ mul \my, \my, w11
+ add \mx, \mx, w9 // mx, 8tap_h, 4tap_h
+ add \my, \my, w10 // my, 8tap_v, 4tap_v
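+ // Multiplying by 0x4081 copies the 4-bit subpel offset to bits 0, 7 and 14,
+ // so the tst against (0x7f << 14) below checks for a nonzero mx/my while the
+ // two low 7-bit fields still hold the combined filter-set offsets.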
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ dup v31.8h, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz w9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w12, #6
+ tst \mx, #(0x7f << 14)
+ sub w9, w9, #24
+ add w13, w12, \bdmax // 6 + intermediate_bits
+ sub w12, w12, \bdmax // 6 - intermediate_bits
+ movrel x11, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx w10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ b.le 4f
+ mov \mx, w10
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x11, \mx, uxtw #3
+ b.ne L(\type\()_8tap_hv)
+
+ adr x10, L(\type\()_8tap_h_tbl)
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.8h, \bdmax // intermediate_bits
+.else
+ movi v28.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v29.8h, v29.8h // -intermediate_bits
+.endif
+ br x10
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+2:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ smull v3.4s, v3.4h, v0.h[0]
+ smlal v3.4s, v4.4h, v0.h[1]
+ smlal v3.4s, v6.4h, v0.h[2]
+ smlal v3.4s, v7.4h, v0.h[3]
+ srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ srshl v3.4h, v3.4h, v29.4h // -intermediate_bits
+ umin v3.4h, v3.4h, v31.4h
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8h}, [\src], \s_strd
+ ld1 {v20.8h}, [\sr2], \s_strd
+ ext v17.16b, v16.16b, v16.16b, #2
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ smull v16.4s, v16.4h, v0.h[0]
+ smlal v16.4s, v17.4h, v0.h[1]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[3]
+ smull v20.4s, v20.4h, v0.h[0]
+ smlal v20.4s, v21.4h, v0.h[1]
+ smlal v20.4s, v22.4h, v0.h[2]
+ smlal v20.4s, v23.4h, v0.h[3]
+ srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits)
+ srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v16.4h, v16.4s
+ sqxtun2 v16.8h, v20.4s
+ srshl v16.8h, v16.8h, v29.8h // -intermediate_bits
+ umin v16.8h, v16.8h, v31.8h
+.else
+ uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2
+ sub v16.8h, v16.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v16.d}[0], [\dst], \d_strd
+ st1 {v16.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
+81:
+ ld1 {v16.8h, v17.8h}, [\src], #32
+ ld1 {v20.8h, v21.8h}, [\sr2], #32
+ mov \mx, \w
+
+8:
+ smull v18.4s, v16.4h, v0.h[0]
+ smull2 v19.4s, v16.8h, v0.h[0]
+ smull v22.4s, v20.4h, v0.h[0]
+ smull2 v23.4s, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+ subs \mx, \mx, #8
+ srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
+ srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
+ srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
+ srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v18.4h, v18.4s
+ sqxtun2 v18.8h, v19.4s
+ sqxtun v22.4h, v22.4s
+ sqxtun2 v22.8h, v23.4s
+ srshl v18.8h, v18.8h, v29.8h // -intermediate_bits
+ srshl v22.8h, v22.8h, v29.8h // -intermediate_bits
+ umin v18.8h, v18.8h, v31.8h
+ umin v22.8h, v22.8h, v31.8h
+.else
+ uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2
+ uzp1 v22.8h, v22.8h, v23.8h // Ditto
+ sub v18.8h, v18.8h, v28.8h // PREP_BIAS
+ sub v22.8h, v22.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v18.8h}, [\dst], #16
+ st1 {v22.8h}, [\ds2], #16
+ b.le 9f
+
+ mov v16.16b, v17.16b
+ mov v20.16b, v21.16b
+ ld1 {v17.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 81b
+ ret
+
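+// Jump table indexed by clz(w)-24 (largest width first); each .hword holds
+// the distance from the table back to the corresponding entry point, which
+// the dispatch code subtracts from the table address.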
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx w10, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w10
+4:
+ add \xmy, x11, \my, uxtw #3
+
+.ifc \type, prep
+ dup v30.4s, w12 // 6 - intermediate_bits
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ adr x10, L(\type\()_8tap_v_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+.ifc \type, prep
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw
+ br x10
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ b.gt 28f
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5
+ b.gt 24f
+ smull_smlal_4 v6, v1, v2, v3, v4
+ sqrshrun_h 6, v6
+ umin_h v31, .8h, v6
+ st_s \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull_smlal_4 v17, v3, v4, v5, v6
+ sqrshrun_h 6, v16, v17
+ umin_h v31, .8h, v16
+ st_s \d_strd, v16, 4
+ ret
+
+28: // 2x6, 2x8, 2x12, 2x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
+ interleave_1_s v1, v2, v3, v4, v5
+ interleave_1_s v5, v6, v7
+216:
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v16, v17, v18, v19
+ interleave_1_s v7, v16, v17, v18, v19
+ smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ sqrshrun_h 6, v24, v25
+ umin_h v31, .8h, v24
+ st_s \d_strd, v24, 4
+ b.le 0f
+ cmp \h, #2
+ mov v1.16b, v5.16b
+ mov v2.16b, v6.16b
+ mov v3.16b, v7.16b
+ mov v4.16b, v16.16b
+ mov v5.16b, v17.16b
+ mov v6.16b, v18.16b
+ mov v7.16b, v19.16b
+ b.eq 26f
+ b 216b
+26:
+ load_s \sr2, \src, \s_strd, v16, v17
+ interleave_1_s v7, v16, v17
+ smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ sqrshrun_h 6, v24
+ umin_h v31, .4h, v24
+ st_s \d_strd, v24, 2
+0:
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4 v7, v2, v3, v4, v5
+ shift_store_4 \type, \d_strd, v6, v7
+ b.le 0f
+ load_4h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v1, v3, v4, v5, v6
+ smull_smlal_4 v2, v4, v5, v6, v7
+ shift_store_4 \type, \d_strd, v1, v2
+0:
+ ret
+
+480: // 4x6, 4x8, 4x12, 4x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+48:
+ subs \h, \h, #4
+ load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_4 \type, \d_strd, v1, v2, v3, v4
+ b.le 0f
+ cmp \h, #2
+ mov v16.8b, v20.8b
+ mov v17.8b, v21.8b
+ mov v18.8b, v22.8b
+ mov v19.8b, v23.8b
+ mov v20.8b, v24.8b
+ mov v21.8b, v25.8b
+ mov v22.8b, v26.8b
+ b.eq 46f
+ b 48b
+46:
+ load_4h \sr2, \src, \s_strd, v23, v24
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_4 \type, \d_strd, v1, v2
+0:
+ ret
+
+80:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull2_smlal2_4 v17, v1, v2, v3, v4
+ smull_smlal_4 v18, v2, v3, v4, v5
+ smull2_smlal2_4 v19, v2, v3, v4, v5
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+ b.le 0f
+ load_8h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v16, v3, v4, v5, v6
+ smull2_smlal2_4 v17, v3, v4, v5, v6
+ smull_smlal_4 v18, v4, v5, v6, v7
+ smull2_smlal2_4 v19, v4, v5, v6, v7
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+0:
+ ret
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmy]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v23, v24
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v25, v26
+ smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ ret
+
+160:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 1680b
+
+ // 16x2, 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+
+ load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
+16:
+ load_16h \src, \src, \s_strd, v22, v23
+ subs \h, \h, #1
+ smull_smlal_4 v1, v16, v18, v20, v22
+ smull2_smlal2_4 v2, v16, v18, v20, v22
+ smull_smlal_4 v3, v17, v19, v21, v23
+ smull2_smlal2_4 v4, v17, v19, v21, v23
+ shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
+ b.le 0f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ b 16b
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
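+// Combined horizontal + vertical 8tap: the horizontal pass runs first and
+// keeps its results as 16 bit intermediates, so the vertical pass only
+// needs the final -(6+intermediate_bits) shift and clamp for "put", or a
+// rounding shift by 6 plus the PREP_BIAS offset for "prep".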
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx w10, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w10
+4:
+ add \xmy, x11, \my, uxtw #3
+
+ adr x10, L(\type\()_8tap_hv_tbl)
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.4s, w13 // 6 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v29.4s, v29.4s // -(6+intermediate_bits)
+.endif
+ br x10
+
+20:
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 280f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ bl L(\type\()_8tap_filter_2)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ ext v18.8b, v17.8b, v24.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ umin v2.4h, v2.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v2.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+
+ bl L(\type\()_8tap_filter_2)
+ xtn v16.4h, v16.4s
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v18.8b, v17.8b, v24.8b, #4
+ mov v19.8b, v24.8b
+ bl L(\type\()_8tap_filter_2)
+ ext v20.8b, v19.8b, v24.8b, #4
+ mov v21.8b, v24.8b
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ ext v22.8b, v21.8b, v24.8b, #4
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
+
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ umin v3.4h, v3.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
+ b 28b
+
+0:
+ ret x15
+
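+// Horizontally filter two new 2-pixel rows (one from \sr2, one from \src)
+// with the 4-tap filter in v0, returning both rows packed in v24.4h.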
+L(\type\()_8tap_filter_2):
+ ld1 {v25.8h}, [\sr2], \s_strd
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v28.16b, v27.16b, v27.16b, #2
+ trn1 v24.2s, v25.2s, v27.2s
+ trn2 v27.2s, v25.2s, v27.2s
+ trn1 v25.2s, v26.2s, v28.2s
+ trn2 v28.2s, v26.2s, v28.2s
+ smull v24.4s, v24.4h, v0.h[0]
+ smlal v24.4s, v25.4h, v0.h[1]
+ smlal v24.4s, v27.4h, v0.h[2]
+ smlal v24.4s, v28.4h, v0.h[3]
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s
+ ret
+.endif
+
+40:
+ AARCH64_VALID_JUMP_TARGET
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 480f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ // 4x2, 4x4 hv
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v24.4h, v1.h[2]
+ smlal v3.4s, v25.4h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ umin v2.8h, v2.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ st1 {v2.d}[0], [\dst], \d_strd
+ st1 {v2.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v24.8b
+ mov v20.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
+ smull v4.4s, v17.4h, v1.h[0]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[7]
+.ifc \type, put
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ sqxtun2 v3.8h, v4.4s
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v3.4h, v3.4s, #6
+ rshrn2 v3.8h, v4.4s, #6
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v3.d}[0], [\dst], \d_strd
+ st1 {v3.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+ b 48b
+0:
+ ret x15
+
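+// Horizontally filter two new 4-pixel rows with the 4-tap filter in v0;
+// the narrowed results are returned in v24.4h (row from \sr2) and
+// v25.4h (row from \src).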
+L(\type\()_8tap_filter_4):
+ ld1 {v24.8h}, [\sr2], \s_strd
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v24.16b, v24.16b, #2
+ ext v27.16b, v24.16b, v24.16b, #4
+ ext v28.16b, v24.16b, v24.16b, #6
+ smull v24.4s, v24.4h, v0.h[0]
+ smlal v24.4s, v26.4h, v0.h[1]
+ smlal v24.4s, v27.4h, v0.h[2]
+ smlal v24.4s, v28.4h, v0.h[3]
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s
+ xtn v25.4h, v25.4s
+ ret
+
+80:
+160:
+320:
+ AARCH64_VALID_JUMP_TARGET
+ b.gt 880f
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.s}[0], [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v23.4h, v1.h[2]
+ smlal2 v5.4s, v23.8h, v1.h[2]
+ smlal v2.4s, v23.4h, v1.h[3]
+ smlal2 v3.4s, v23.8h, v1.h[3]
+ smlal v4.4s, v24.4h, v1.h[3]
+ smlal2 v5.4s, v24.8h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ b 8b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v23.4h, v1.h[6]
+ smlal2 v5.4s, v23.8h, v1.h[6]
+ smlal v2.4s, v23.4h, v1.h[7]
+ smlal2 v3.4s, v23.8h, v1.h[7]
+ smlal v4.4s, v24.4h, v1.h[7]
+ smlal2 v5.4s, v24.8h, v1.h[7]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ ret x15
+
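+// Horizontally filter two new 8-pixel rows with the 8-tap filter in v0;
+// the narrowed results are returned in v23.8h (row from \sr2) and
+// v24.8h (row from \src).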
+L(\type\()_8tap_filter_8):
+ ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
+ ld1 {v6.8h, v7.8h}, [\src], \s_strd
+ smull v25.4s, v4.4h, v0.h[0]
+ smull2 v26.4s, v4.8h, v0.h[0]
+ smull v27.4s, v6.4h, v0.h[0]
+ smull2 v28.4s, v6.8h, v0.h[0]
+.irpc i, 1234567
+ ext v23.16b, v4.16b, v5.16b, #(2*\i)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i)
+ smlal v25.4s, v23.4h, v0.h[\i]
+ smlal2 v26.4s, v23.8h, v0.h[\i]
+ smlal v27.4s, v24.4h, v0.h[\i]
+ smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
+ srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
+ srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
+ uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2
+ uzp1 v24.8h, v27.8h, v28.8h // Ditto
+ ret
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
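+// Bilinear put/prep: v0/v1 hold the horizontal weights (16-mx, mx) and
+// v2/v3 the vertical weights (16-my, my). The h-only and v-only passes stay
+// in 16 bit arithmetic; the hv pass widens to 32 bit for the second stage.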
+function \type\()_bilin_16bpc_neon, export=1
+.ifc \bdmax, w8
+ ldr w8, [sp]
+.endif
+ dup v1.8h, \mx
+ dup v3.8h, \my
+ mov w10, #16
+ sub w9, w10, \mx
+ sub w10, w10, \my
+ dup v0.8h, w9
+ dup v2.8h, w10
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz \bdmax, \bdmax // bitdepth_max
+ clz w9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w11, #4
+ sub w9, w9, #24
+ sub w11, w11, \bdmax // 4 - intermediate_bits
+ add w12, \bdmax, #4 // 4 + intermediate_bits
+ cbnz \mx, L(\type\()_bilin_h)
+ cbnz \my, L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cbnz \my, L(\type\()_bilin_hv)
+
+ adr x10, L(\type\()_bilin_h_tbl)
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.8h, \bdmax // intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v30.8h, v30.8h // -intermediate_bits
+.endif
+ br x10
+
+20: // 2xN h
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.4h}, [\src], \s_strd
+ ld1 {v6.4h}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #2
+ ext v7.8b, v6.8b, v6.8b, #2
+ trn1 v4.2s, v4.2s, v6.2s
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ mul v4.4h, v4.4h, v0.4h
+ mla v4.4h, v5.4h, v1.4h
+ urshl v4.4h, v4.4h, v31.4h
+ urshl v4.4h, v4.4h, v30.4h
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ trn1 v4.2d, v4.2d, v6.2d
+ trn1 v5.2d, v5.2d, v7.2d
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ ldr h5, [\src, #16]
+ ldr h7, [\sr2, #16]
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v5.16b, #2
+ ext v7.16b, v6.16b, v7.16b, #2
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ mul v6.8h, v6.8h, v0.8h
+ mla v6.8h, v7.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+ urshl v6.8h, v6.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v6.8h}, [\ds2], \d_strd
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
+161:
+ ld1 {v16.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ mov \mx, \w
+
+16:
+ ld1 {v17.8h, v18.8h}, [\src], #32
+ ld1 {v22.8h, v23.8h}, [\sr2], #32
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v20.16b, v17.16b, v18.16b, #2
+ ext v24.16b, v21.16b, v22.16b, #2
+ ext v25.16b, v22.16b, v23.16b, #2
+ mul v16.8h, v16.8h, v0.8h
+ mla v16.8h, v19.8h, v1.8h
+ mul v17.8h, v17.8h, v0.8h
+ mla v17.8h, v20.8h, v1.8h
+ mul v21.8h, v21.8h, v0.8h
+ mla v21.8h, v24.8h, v1.8h
+ mul v22.8h, v22.8h, v0.8h
+ mla v22.8h, v25.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v21.8h, v21.8h, v31.8h
+ urshl v22.8h, v22.8h, v31.8h
+ subs \mx, \mx, #16
+.ifc \type, put
+ urshl v16.8h, v16.8h, v30.8h
+ urshl v17.8h, v17.8h, v30.8h
+ urshl v21.8h, v21.8h, v30.8h
+ urshl v22.8h, v22.8h, v30.8h
+.else
+ sub v16.8h, v16.8h, v29.8h
+ sub v17.8h, v17.8h, v29.8h
+ sub v21.8h, v21.8h, v29.8h
+ sub v22.8h, v22.8h, v29.8h
+.endif
+ st1 {v16.8h, v17.8h}, [\dst], #32
+ st1 {v21.8h, v22.8h}, [\ds2], #32
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v21.16b, v23.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl):
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr x10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+ dup v31.8h, w11 // 4 - intermediate_bits
+.endif
+ ldrh w9, [x10, x9, lsl #1]
+.ifc \type, prep
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw
+ br x10
+
+20: // 2xN v
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.s}[0], [\src], \s_strd
+ b.gt 24f
+22:
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ mul v4.4h, v16.4h, v2.4h
+ mla v4.4h, v17.4h, v3.4h
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst]
+ st1 {v4.s}[1], [\ds2]
+ ret
+24: // 2x4, 2x6, 2x8, ... v
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ ld1 {v19.s}[0], [\sr2], \s_strd
+ ld1 {v20.s}[0], [\src], \s_strd
+ sub \h, \h, #4
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ trn1 v18.2s, v18.2s, v19.2s
+ trn1 v19.2s, v19.2s, v20.2s
+ trn1 v16.2d, v16.2d, v18.2d
+ trn1 v17.2d, v17.2d, v19.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ cmp \h, #2
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ st1 {v4.s}[2], [\dst], \d_strd
+ st1 {v4.s}[3], [\ds2], \d_strd
+ b.lt 0f
+ mov v16.8b, v20.8b
+ b.eq 22b
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.4h}, [\src], \s_strd
+4:
+ ld1 {v17.4h}, [\sr2], \s_strd
+ ld1 {v18.4h}, [\src], \s_strd
+ trn1 v16.2d, v16.2d, v17.2d
+ trn1 v17.2d, v17.2d, v18.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ AARCH64_VALID_JUMP_TARGET
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8h}, [\src], \s_strd
+8:
+ ld1 {v17.8h}, [\sr2], \s_strd
+ ld1 {v18.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 0f
+ mov v16.16b, v18.16b
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.8h, v17.8h}, [\src], \s_strd
+2:
+ ld1 {v18.8h, v19.8h}, [\sr2], \s_strd
+ ld1 {v20.8h, v21.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v18.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v19.8h, v3.8h
+ mul v6.8h, v18.8h, v2.8h
+ mla v6.8h, v20.8h, v3.8h
+ mul v7.8h, v19.8h, v2.8h
+ mla v7.8h, v21.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+ urshr v6.8h, v6.8h, #4
+ urshr v7.8h, v7.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+ urshl v7.8h, v7.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+ sub v7.8h, v7.8h, v29.8h
+.endif
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ b 2b
+9:
+ subs \w, \w, #16
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #32
+ add \dst, \dst, #32
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl):
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_bilin_hv):
+ adr x10, L(\type\()_bilin_hv_tbl)
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.4s, w12 // 4 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v30.4s, v30.4s // -(4+intermediate_bits)
+.endif
+ br x10
+
+20: // 2xN hv
+ AARCH64_VALID_JUMP_TARGET
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.4h}, [\src], \s_strd
+ ext v21.8b, v20.8b, v20.8b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+2:
+ ld1 {v22.4h}, [\sr2], \s_strd
+ ld1 {v24.4h}, [\src], \s_strd
+ ext v23.8b, v22.8b, v22.8b, #2
+ ext v25.8b, v24.8b, v24.8b, #2
+ trn1 v22.2s, v22.2s, v24.2s
+ trn1 v23.2s, v23.2s, v25.2s
+ mul v17.4h, v22.4h, v0.4h
+ mla v17.4h, v23.4h, v1.4h
+ urshl v17.4h, v17.4h, v31.4h
+
+ trn1 v16.2s, v16.2s, v17.2s
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ urshl v4.4s, v4.4s, v30.4s
+ xtn v4.4h, v4.4s
+ subs \h, \h, #2
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ AARCH64_VALID_JUMP_TARGET
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v20.16b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+4:
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v22.16b, #2
+ ext v25.16b, v24.16b, v24.16b, #2
+ trn1 v22.2d, v22.2d, v24.2d
+ trn1 v23.2d, v23.2d, v25.2d
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+
+ trn1 v16.2d, v16.2d, v17.2d
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ AARCH64_VALID_JUMP_TARGET
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ldr h21, [\src, #16]
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v21.16b, #2
+ mul v16.8h, v20.8h, v0.8h
+ mla v16.8h, v21.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+
+2:
+ ldr h23, [\sr2, #16]
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ldr h25, [\src, #16]
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v23.16b, #2
+ ext v25.16b, v24.16b, v25.16b, #2
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ mul v18.8h, v24.8h, v0.8h
+ mla v18.8h, v25.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v18.8h, v18.8h, v31.8h
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+ umull v6.4s, v17.4h, v2.4h
+ umlal v6.4s, v18.4h, v3.4h
+ umull2 v7.4s, v17.8h, v2.8h
+ umlal2 v7.4s, v18.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ urshl v6.4s, v6.4s, v30.4s
+ urshl v7.4s, v7.4s, v30.4s
+ uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
+ uzp1 v5.8h, v6.8h, v7.8h // Ditto
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ rshrn v5.4h, v6.4s, #4
+ rshrn2 v5.8h, v7.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl):
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
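+// put: C arguments arrive in x0-x7 with bitdepth_max on the stack; prep has
+// no destination stride argument, so x8 is used for the computed tmp stride.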
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
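+// Load one 8-tap warp filter row: the filter index is the accumulated
+// position \src >> 10, looked up in mc_warp_filter via x11 (8 bytes per
+// row), and \src is then advanced by \inc.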
+.macro load_filter_row dst, src, inc
+ asr w13, \src, #10
+ add \src, \src, \inc
+ ldr \dst, [x11, w13, sxtw #3]
+.endm
+
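+// Filter one horizontal line of 8 output pixels for warp_affine: each pixel
+// gets its own filter row, and the eight sums are pairwise-added down into
+// v16.4s/v17.4s, already shifted by -(7 - intermediate_bits) (v14).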
+function warp_filter_horz_neon
+ add w12, w5, #512
+
+ ld1 {v16.8h, v17.8h}, [x2], x3
+
+ load_filter_row d0, w12, w7
+ load_filter_row d1, w12, w7
+ load_filter_row d2, w12, w7
+ sxtl v0.8h, v0.8b
+ load_filter_row d3, w12, w7
+ sxtl v1.8h, v1.8b
+ load_filter_row d4, w12, w7
+ sxtl v2.8h, v2.8b
+ load_filter_row d5, w12, w7
+ sxtl v3.8h, v3.8b
+ load_filter_row d6, w12, w7
+ sxtl v4.8h, v4.8b
+ load_filter_row d7, w12, w7
+ sxtl v5.8h, v5.8b
+ ext v18.16b, v16.16b, v17.16b, #2*1
+ smull v8.4s, v16.4h, v0.4h
+ smull2 v9.4s, v16.8h, v0.8h
+ sxtl v6.8h, v6.8b
+ ext v19.16b, v16.16b, v17.16b, #2*2
+ smull v10.4s, v18.4h, v1.4h
+ smull2 v11.4s, v18.8h, v1.8h
+ sxtl v7.8h, v7.8b
+ ext v20.16b, v16.16b, v17.16b, #2*3
+ smull v0.4s, v19.4h, v2.4h
+ smull2 v1.4s, v19.8h, v2.8h
+ ext v21.16b, v16.16b, v17.16b, #2*4
+ addp v8.4s, v8.4s, v9.4s
+ smull v2.4s, v20.4h, v3.4h
+ smull2 v3.4s, v20.8h, v3.8h
+ ext v22.16b, v16.16b, v17.16b, #2*5
+ addp v9.4s, v10.4s, v11.4s
+ smull v10.4s, v21.4h, v4.4h
+ smull2 v11.4s, v21.8h, v4.8h
+ ext v23.16b, v16.16b, v17.16b, #2*6
+ addp v0.4s, v0.4s, v1.4s
+ smull v18.4s, v22.4h, v5.4h
+ smull2 v19.4s, v22.8h, v5.8h
+ ext v16.16b, v16.16b, v17.16b, #2*7
+ addp v1.4s, v2.4s, v3.4s
+ addp v2.4s, v10.4s, v11.4s
+ smull v20.4s, v23.4h, v6.4h
+ smull2 v21.4s, v23.8h, v6.8h
+ addp v3.4s, v18.4s, v19.4s
+ smull v22.4s, v16.4h, v7.4h
+ smull2 v23.4s, v16.8h, v7.8h
+ addp v4.4s, v20.4s, v21.4s
+ addp v5.4s, v22.4s, v23.4s
+
+ addp v8.4s, v8.4s, v9.4s
+ addp v0.4s, v0.4s, v1.4s
+ addp v2.4s, v2.4s, v3.4s
+ addp v4.4s, v4.4s, v5.4s
+
+ addp v16.4s, v8.4s, v0.4s
+ addp v17.4s, v2.4s, v4.4s
+
+ add w5, w5, w8
+
+ srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits)
+ srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits)
+
+ ret
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *const abcd, int mx, int my,
+// const int bitdepth_max)
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+
+.ifb \t
+ dup v15.8h, w7 // bitdepth_max
+.else
+ movi v15.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ clz w7, w7
+ // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+ sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+ sub w7, w7, #25 // -(7 - intermediate_bits)
+.ifb \t
+ neg w8, w8 // -(7 + intermediate_bits)
+.endif
+ dup v14.4s, w7 // -(7 - intermediate_bits)
+.ifb \t
+ dup v13.4s, w8 // -(7 + intermediate_bits)
+.endif
+
+ ldr x4, [x4]
+ sbfx x7, x4, #0, #16
+ sbfx x8, x4, #16, #16
+ sbfx x9, x4, #32, #16
+ sbfx x4, x4, #48, #16
+ mov w10, #8
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ sub x2, x2, #6
+ movrel x11, X(mc_warp_filter), 64*8
+ mov x15, x30
+.ifnb \t
+ lsl x1, x1, #1
+.endif
+
+ bl warp_filter_horz_neon
+ uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2
+ bl warp_filter_horz_neon
+ uzp1 v25.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v26.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v27.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v28.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v29.8h, v16.8h, v17.8h // Ditto
+ bl warp_filter_horz_neon
+ uzp1 v30.8h, v16.8h, v17.8h // Ditto
+
+1:
+ add w14, w6, #512
+ bl warp_filter_horz_neon
+ uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2
+
+ load_filter_row d0, w14, w9
+ load_filter_row d1, w14, w9
+ load_filter_row d2, w14, w9
+ load_filter_row d3, w14, w9
+ load_filter_row d4, w14, w9
+ load_filter_row d5, w14, w9
+ load_filter_row d6, w14, w9
+ load_filter_row d7, w14, w9
+ transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
+
+ // This ordering of smull/smlal/smull2/smlal2 is highly
+ // beneficial for Cortex A53 here.
+ smull v16.4s, v24.4h, v0.4h
+ smlal v16.4s, v25.4h, v1.4h
+ smlal v16.4s, v26.4h, v2.4h
+ smlal v16.4s, v27.4h, v3.4h
+ smlal v16.4s, v28.4h, v4.4h
+ smlal v16.4s, v29.4h, v5.4h
+ smlal v16.4s, v30.4h, v6.4h
+ smlal v16.4s, v31.4h, v7.4h
+ smull2 v17.4s, v24.8h, v0.8h
+ smlal2 v17.4s, v25.8h, v1.8h
+ smlal2 v17.4s, v26.8h, v2.8h
+ smlal2 v17.4s, v27.8h, v3.8h
+ smlal2 v17.4s, v28.8h, v4.8h
+ smlal2 v17.4s, v29.8h, v5.8h
+ smlal2 v17.4s, v30.8h, v6.8h
+ smlal2 v17.4s, v31.8h, v7.8h
+
+ mov v24.16b, v25.16b
+ mov v25.16b, v26.16b
+.ifb \t
+ srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits)
+ srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits)
+.else
+ rshrn v16.4h, v16.4s, #7
+ rshrn2 v16.8h, v17.4s, #7
+.endif
+ mov v26.16b, v27.16b
+.ifb \t
+ sqxtun v16.4h, v16.4s
+ sqxtun2 v16.8h, v17.4s
+.else
+ sub v16.8h, v16.8h, v15.8h // PREP_BIAS
+.endif
+ mov v27.16b, v28.16b
+ mov v28.16b, v29.16b
+.ifb \t
+ umin v16.8h, v16.8h, v15.8h // bitdepth_max
+.endif
+ mov v29.16b, v30.16b
+ mov v30.16b, v31.16b
+ subs w10, w10, #1
+ st1 {v16.8h}, [x0], x1
+
+ add w6, w6, w4
+ b.gt 1b
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+
+ ret x15
+endfunc
+.endm
+
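+// Instantiate the "put" variant (clamped against bitdepth_max) and the
+// "t" (prep) variant, which stores PREP_BIAS-offset 16 bit intermediates.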
+warp
+warp t
+
+// void dav1d_emu_edge_16bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1
+ ldp x8, x9, [sp]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13, lsl #1 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
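+// Fill center_h rows: replicate the leftmost source pixel across left_ext,
+// copy center_w pixels, then replicate the rightmost pixel across right_ext.
+// The loops store 16/32 pixels at a time and may overshoot the exact widths,
+// which the caller's edge buffer is assumed to allow.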
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ ld1r {v0.8h}, [x8]
+ mov x12, x6 // out = dst
+ mov x3, x4
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+ mov x13, x8
+ add x12, x6, x4, lsl #1 // out = dst + left_ext
+ mov x3, x2
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
+ subs x3, x3, #32
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2, lsl #1 // in + center_w
+ sub x3, x3, #2 // in + center_w - 1
+ add x12, x6, x4, lsl #1 // dst + left_ext
+ ld1r {v0.8h}, [x3]
+ add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w
+ mov x3, x11
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7
+ add x8, x8, x9
+ b.gt 0b
+.endm
+
+ cbz x4, 2f
+ // need_left
+ cbz x11, 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
+ cbz x10, 3f
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
+ mov x3, x10
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f
+ // need_top
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
+ mov x3, x5
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
new file mode 100644
index 0000000000..3a6cf900a9
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 8
+#define DIF 16
+#define RNG 24
+#define CNT 28
+#define ALLOW_UPDATE_CDF 32
+
+const coeffs
+ .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+ .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits
+ .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+ .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
+.macro ld1_n d0, d1, src, sz, n
+.if \n <= 8
+ ld1 {\d0\sz}, [\src]
+.else
+ ld1 {\d0\sz, \d1\sz}, [\src]
+.endif
+.endm
+
+.macro st1_n s0, s1, dst, sz, n
+.if \n <= 8
+ st1 {\s0\sz}, [\dst]
+.else
+ st1 {\s0\sz, \s1\sz}, [\dst]
+.endif
+.endm
+
+.macro ushr_n d0, d1, s0, s1, shift, sz, n
+ ushr \d0\sz, \s0\sz, \shift
+.if \n == 16
+ ushr \d1\sz, \s1\sz, \shift
+.endif
+.endm
+
+.macro add_n d0, d1, s0, s1, s2, s3, sz, n
+ add \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ add \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
+ sub \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sub \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro and_n d0, d1, s0, s1, s2, s3, sz, n
+ and \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ and \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
+ cmhs \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ cmhs \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
+ urhadd \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ urhadd \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
+ sshl \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sshl \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
+ sqdmulh \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+ sqdmulh \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro str_n idx0, idx1, dstreg, dstoff, n
+ str \idx0, [\dstreg, \dstoff]
+.if \n == 16
+ str \idx1, [\dstreg, \dstoff + 16]
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+// size_t n_symbols);
+
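+// Vectorized symbol decode: the candidate v values for all symbols are
+// computed at once, compared against the top 16 bits of dif, and the
+// compare mask is turned into the returned symbol index via the per-lane
+// "bits" constants, a horizontal add and clz. The cdf update is likewise
+// done on the whole vector.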
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update sz, szb, n
+ sub sp, sp, #48
+ add x8, x0, #RNG
+ ld1_n v0, v1, x1, \sz, \n // cdf
+ ld1r {v4\sz}, [x8] // rng
+ movrel x9, coeffs, 30
+ movi v31\sz, #0x7f, lsl #8 // 0x7f00
+ sub x9, x9, x2, lsl #1
+ mvni v30\sz, #0x3f // 0xffc0
+ and v7\szb, v4\szb, v31\szb // rng & 0x7f00
+ str h4, [sp, #14] // store original u = s->rng
+ and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
+
+ ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
+ sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add x8, x0, #DIF + 6
+
+ add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+ ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
+ movrel x8, bits
+ str_n q4, q5, sp, #16, \n // store v values to allow indexed access
+
+ ld1_n v16, v17, x8, .8h, \n
+
+ cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v
+
+ and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask
+.if \n == 16
+ add v6.8h, v6.8h, v7.8h
+.endif
+ addv h6, v6.8h // Aggregate mask bits
+ ldr w4, [x0, #ALLOW_UPDATE_CDF]
+ umov w3, v6.h[0]
+ rbit w3, w3
+ clz w15, w3 // ret
+
+ cbz w4, L(renorm)
+ // update_cdf
+ ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
+ movi v5\szb, #0xff
+.if \n == 16
+ mov w4, #-5
+.else
+ mvn w14, w2
+ mov w4, #-4
+ cmn w14, #3 // set C if n_symbols <= 2
+.endif
+ urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
+.if \n == 16
+ sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
+.else
+ lsr w14, w3, #4 // count >> 4
+ sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+ sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
+ dup v6\sz, w4 // -rate
+
+ sub w3, w3, w3, lsr #5 // count - (count == 32)
+ sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
+ sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
+ add w3, w3, #1 // count + (count < 32)
+ add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate
+ st1_n v0, v1, x1, \sz, \n
+ strh w3, [x1, x2, lsl #1]
+.endm
+
+ decode_update .4h, .8b, 4
+
+L(renorm):
+ add x8, sp, #16
+ add x8, x8, w15, uxtw #1
+ ldrh w3, [x8] // v
+ ldurh w4, [x8, #-2] // u
+ ldr w6, [x0, #CNT]
+ ldr x7, [x0, #DIF]
+ sub w4, w4, w3 // rng = u - v
+ clz w5, w4 // clz(rng)
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ mvn x7, x7 // ~dif
+ add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+L(renorm2):
+ lsl w4, w4, w5 // rng << d
+ subs w6, w6, w5 // cnt -= d
+ lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ str w4, [x0, #RNG]
+ mvn x7, x7 // ~dif
+ b.hs 9f
+
+ // refill
+ ldp x3, x4, [x0] // BUF_POS, BUF_END
+ add x5, x3, #8
+ cmp x5, x4
+ b.gt 2f
+
+ ldr x3, [x3] // next_bits
+ add w8, w6, #23 // shift_bits = cnt + 23
+ add w6, w6, #16 // cnt += 16
+ rev x3, x3 // next_bits = bswap(next_bits)
+ sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
+ and w8, w8, #24 // shift_bits &= 24
+ lsr x3, x3, x8 // next_bits >>= shift_bits
+ sub w8, w8, w6 // shift_bits -= 16 + cnt
+ str x5, [x0, #BUF_POS]
+ lsl x3, x3, x8 // next_bits <<= shift_bits
+ mov w4, #48
+ sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
+ eor x7, x7, x3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ mov w14, #40
+ sub w5, w14, w6 // c = 40 - cnt
+3:
+ cmp x3, x4
+ b.ge 4f
+ ldrb w8, [x3], #1
+ lsl x8, x8, x5
+ eor x7, x7, x8
+ subs w5, w5, #8
+ b.ge 3b
+
+4: // refill_eob_end
+ str x3, [x0, #BUF_POS]
+ sub w6, w14, w5 // cnt = 40 - c
+
+9:
+ str w6, [x0, #CNT]
+ str x7, [x0, #DIF]
+
+ mov w0, w15
+ add sp, sp, #48
+ ret
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+ decode_update .8h, .16b, 8
+ b L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+ decode_update .8h, .16b, 16
+ b L(renorm)
+endfunc
+
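+// unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
+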
+function msac_decode_hi_tok_neon, export=1
+ ld1 {v0.4h}, [x1] // cdf
+ add x16, x0, #RNG
+ movi v31.4h, #0x7f, lsl #8 // 0x7f00
+ movrel x17, coeffs, 30-2*3
+ mvni v30.4h, #0x3f // 0xffc0
+ ldrh w9, [x1, #6] // count = cdf[n_symbols]
+ ld1r {v3.4h}, [x16] // rng
+ movrel x16, bits
+ ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
+ add x17, x0, #DIF + 6
+ ld1 {v16.8h}, [x16]
+ mov w13, #-24
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ ldr w10, [x0, #ALLOW_UPDATE_CDF]
+ ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16)
+ sub sp, sp, #48
+ ldr w6, [x0, #CNT]
+ ldr x7, [x0, #DIF]
+1:
+ and v7.8b, v3.8b, v31.8b // rng & 0x7f00
+ sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+ add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+ str h3, [sp, #14] // store original u = s->rng
+ cmhs v2.8h, v1.8h, v4.8h // c >= v
+ str q4, [sp, #16] // store v values to allow indexed access
+ and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask
+ addv h6, v6.8h // Aggregate mask bits
+ umov w3, v6.h[0]
+ add w13, w13, #5
+ rbit w3, w3
+ add x8, sp, #16
+ clz w15, w3 // ret
+
+ cbz w10, 2f
+ // update_cdf
+ movi v5.8b, #0xff
+ mov w4, #-5
+ urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768
+ sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
+ sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
+ dup v6.4h, w4 // -rate
+
+ sub w9, w9, w9, lsr #5 // count - (count == 32)
+ sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0)
+ sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
+ add w9, w9, #1 // count + (count < 32)
+ add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate
+ st1 {v0.4h}, [x1]
+ and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
+ strh w9, [x1, #6]
+
+2:
+ add x8, x8, w15, uxtw #1
+ ldrh w3, [x8] // v
+ ldurh w4, [x8, #-2] // u
+ sub w4, w4, w3 // rng = u - v
+ clz w5, w4 // clz(rng)
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ mvn x7, x7 // ~dif
+ add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+ lsl w4, w4, w5 // rng << d
+ subs w6, w6, w5 // cnt -= d
+ lsl x7, x7, x5 // (~dif + (v << 48)) << d
+ str w4, [x0, #RNG]
+ dup v3.4h, w4
+ mvn x7, x7 // ~dif
+ b.hs 9f
+
+ // refill
+ ldp x3, x4, [x0] // BUF_POS, BUF_END
+ add x5, x3, #8
+ cmp x5, x4
+ b.gt 2f
+
+ ldr x3, [x3] // next_bits
+ add w8, w6, #23 // shift_bits = cnt + 23
+ add w6, w6, #16 // cnt += 16
+ rev x3, x3 // next_bits = bswap(next_bits)
+ sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3
+ and w8, w8, #24 // shift_bits &= 24
+ lsr x3, x3, x8 // next_bits >>= shift_bits
+ sub w8, w8, w6 // shift_bits -= 16 + cnt
+ str x5, [x0, #BUF_POS]
+ lsl x3, x3, x8 // next_bits <<= shift_bits
+ mov w4, #48
+ sub w6, w4, w8 // cnt = cnt + 64 - shift_bits
+ eor x7, x7, x3 // dif ^= next_bits
+ b 9f
+
+2: // refill_eob
+ mov w14, #40
+ sub w5, w14, w6 // c = 40 - cnt
+3:
+ cmp x3, x4
+ b.ge 4f
+ ldrb w8, [x3], #1
+ lsl x8, x8, x5
+ eor x7, x7, x8
+ subs w5, w5, #8
+ b.ge 3b
+
+4: // refill_eob_end
+ str x3, [x0, #BUF_POS]
+ sub w6, w14, w5 // cnt = 40 - c
+
+9:
+ lsl w15, w15, #1
+ sub w15, w15, #5
+ lsr x12, x7, #48
+ adds w13, w13, w15 // carry = tok_br < 3 || tok == 15
+ dup v1.8h, w12
+ b.cc 1b // loop if !carry
+ add w13, w13, #30
+ str w6, [x0, #CNT]
+ add sp, sp, #48
+ str x7, [x0, #DIF]
+ lsr w0, w13, #1
+ ret
+endfunc
+
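+// unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+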
+function msac_decode_bool_equi_neon, export=1
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48
+ ldr x7, [x0, #DIF]
+ bic w4, w5, #0xff // r &= 0xff00
+ add w4, w4, #8
+ subs x8, x7, x4, lsl #47 // dif - vw
+ lsr w4, w4, #1 // v
+ sub w5, w5, w4 // r - v
+ cset w15, lo
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2)
+endfunc
+
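+// unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+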
+function msac_decode_bool_neon, export=1
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48
+ ldr x7, [x0, #DIF]
+ lsr w4, w5, #8 // r >> 8
+ bic w1, w1, #0x3f // f &= ~63
+ mul w4, w4, w1
+ lsr w4, w4, #7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw
+ sub w5, w5, w4 // r - v
+ cset w15, lo
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2)
+endfunc
+
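+// unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+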
+function msac_decode_bool_adapt_neon, export=1
+ ldr w9, [x1] // cdf[0-1]
+ ldp w5, w6, [x0, #RNG] // + CNT
+ sub sp, sp, #48
+ ldr x7, [x0, #DIF]
+ lsr w4, w5, #8 // r >> 8
+ and w2, w9, #0xffc0 // f &= ~63
+ mul w4, w4, w2
+ lsr w4, w4, #7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw
+ sub w5, w5, w4 // r - v
+ cset w15, lo
+ csel w4, w5, w4, hs // if (ret) v = r - v;
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ ldr w10, [x0, #ALLOW_UPDATE_CDF]
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+
+ cbz w10, L(renorm2)
+
+ lsr w2, w9, #16 // count = cdf[1]
+ and w9, w9, #0xffff // cdf[0]
+
+ sub w3, w2, w2, lsr #5 // count - (count >= 32)
+ lsr w2, w2, #4 // count >> 4
+ add w10, w3, #1 // count + (count < 32)
+ add w2, w2, #4 // rate = (count >> 4) | 4
+
+ sub w9, w9, w15 // cdf[0] -= bit
+ sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub w9, w9, w11 // cdf[0]
+
+ strh w9, [x1]
+ strh w10, [x1, #2]
+
+ b L(renorm2)
+endfunc
diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S
new file mode 100644
index 0000000000..becd4c08f6
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/refmvs.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
+// int bx4, int bw4, int bh4)
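+//
+// Equivalent scalar behaviour (a sketch; each refmvs_block entry is 12 bytes,
+// and the bw4-specific store sequence is picked via a clz-based jump table):
+//
+//   do {
+//       refmvs_block *row = *rr++ + bx4;
+//       for (int x = 0; x < bw4; x++)
+//           row[x] = *rmv;
+//   } while (--bh4);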
+
+function splat_mv_neon, export=1
+ ld1 {v3.16b}, [x1]
+ clz w3, w3
+ adr x5, L(splat_tbl)
+ sub w3, w3, #26
+ ext v2.16b, v3.16b, v3.16b, #12
+ ldrh w3, [x5, w3, uxtw #1]
+ add w2, w2, w2, lsl #1
+ ext v0.16b, v2.16b, v3.16b, #4
+ sub x3, x5, w3, uxtw
+ ext v1.16b, v2.16b, v3.16b, #8
+ lsl w2, w2, #2
+ ext v2.16b, v2.16b, v3.16b, #12
+1:
+ ldr x1, [x0], #8
+ subs w4, w4, #1
+ add x1, x1, x2
+ br x3
+
+10:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.8b}, [x1]
+ str s2, [x1, #8]
+ b.gt 1b
+ ret
+20:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b}, [x1]
+ str d1, [x1, #16]
+ b.gt 1b
+ ret
+320:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+160:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+80:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1], #48
+40:
+ AARCH64_VALID_JUMP_TARGET
+ st1 {v0.16b, v1.16b, v2.16b}, [x1]
+ b.gt 1b
+ ret
+
+L(splat_tbl):
+ .hword L(splat_tbl) - 320b
+ .hword L(splat_tbl) - 160b
+ .hword L(splat_tbl) - 80b
+ .hword L(splat_tbl) - 40b
+ .hword L(splat_tbl) - 20b
+ .hword L(splat_tbl) - 10b
+endfunc
diff --git a/third_party/dav1d/src/arm/64/util.S b/third_party/dav1d/src/arm/64/util.S
new file mode 100644
index 0000000000..9013fd4b1e
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/util.S
@@ -0,0 +1,229 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2015 Martin Storsjo
+ * Copyright © 2015 Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef DAV1D_SRC_ARM_64_UTIL_S
+#define DAV1D_SRC_ARM_64_UTIL_S
+
+#include "config.h"
+#include "src/arm/asm.S"
+
+.macro movrel rd, val, offset=0
+#if defined(__APPLE__)
+ .if \offset < 0
+ adrp \rd, \val@PAGE
+ add \rd, \rd, \val@PAGEOFF
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)@PAGE
+ add \rd, \rd, \val+(\offset)@PAGEOFF
+ .endif
+#elif defined(PIC) && defined(_WIN32)
+ .if \offset < 0
+ adrp \rd, \val
+ add \rd, \rd, :lo12:\val
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+ .endif
+#elif defined(PIC)
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+#else
+ ldr \rd, =\val+\offset
+#endif
+.endm
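+
+// Example: "movrel x9, coeffs, 30" (as used by the NEON msac code) materializes
+// the address coeffs+30 in x9, via adrp/add on PIC targets or a literal-pool
+// load otherwise.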
+
+.macro sub_sp space
+#ifdef _WIN32
+.if \space > 8192
+ // Here, we'd need to touch two (or more) pages while decrementing
+ // the stack pointer.
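+ // (On Windows the stack guard page must be hit one page at a time, so an
+ // allocation this large would need explicit probing of each 4 KB step.)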
+ .error "sub_sp doesn't support values over 8K at the moment"
+.elseif \space > 4096
+ sub x16, sp, #4096
+ ldr xzr, [x16]
+ sub sp, x16, #(\space - 4096)
+.else
+ sub sp, sp, #\space
+.endif
+#else
+.if \space >= 4096
+ sub sp, sp, #(\space)/4096*4096
+.endif
+.if (\space % 4096) != 0
+ sub sp, sp, #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
+ // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
+ zip1 \r0\().16b, \r0\().16b, \r1\().16b
+ // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
+ zip1 \r2\().16b, \r2\().16b, \r3\().16b
+ // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
+ zip1 \r4\().16b, \r4\().16b, \r5\().16b
+ // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
+ zip1 \r6\().16b, \r6\().16b, \r7\().16b
+
+ // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
+ trn1 \r1\().8h, \r0\().8h, \r2\().8h
+ // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
+ trn2 \r3\().8h, \r0\().8h, \r2\().8h
+ // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
+ trn1 \r5\().8h, \r4\().8h, \r6\().8h
+ // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
+ trn2 \r7\().8h, \r4\().8h, \r6\().8h
+
+ // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
+ trn1 \r0\().4s, \r1\().4s, \r5\().4s
+ // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
+ trn2 \r2\().4s, \r1\().4s, \r5\().4s
+ // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
+ trn1 \r1\().4s, \r3\().4s, \r7\().4s
+ // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
+ trn2 \r3\().4s, \r3\().4s, \r7\().4s
+
+ \xtl\()2 \r4\().8h, \r0\().16b
+ \xtl \r0\().8h, \r0\().8b
+ \xtl\()2 \r6\().8h, \r2\().16b
+ \xtl \r2\().8h, \r2\().8b
+ \xtl\()2 \r5\().8h, \r1\().16b
+ \xtl \r1\().8h, \r1\().8b
+ \xtl\()2 \r7\().8h, \r3\().16b
+ \xtl \r3\().8h, \r3\().8b
+.endm
+
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
+ trn1 \t8\().8h, \r0\().8h, \r1\().8h
+ trn2 \t9\().8h, \r0\().8h, \r1\().8h
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h
+
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s
+ trn1 \r5\().4s, \t9\().4s, \r3\().4s
+ trn2 \t9\().4s, \t9\().4s, \r3\().4s
+ trn1 \r3\().4s, \t8\().4s, \r1\().4s
+ trn2 \t8\().4s, \t8\().4s, \r1\().4s
+
+ trn1 \r0\().2d, \r3\().2d, \r4\().2d
+ trn2 \r4\().2d, \r3\().2d, \r4\().2d
+ trn1 \r1\().2d, \r5\().2d, \r6\().2d
+ trn2 \r5\().2d, \r5\().2d, \r6\().2d
+ trn2 \r6\().2d, \t8\().2d, \r2\().2d
+ trn1 \r2\().2d, \t8\().2d, \r2\().2d
+ trn1 \r3\().2d, \t9\().2d, \r7\().2d
+ trn2 \r7\().2d, \t9\().2d, \r7\().2d
+.endm
+
+.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
+ trn1 \t8\().16b, \r0\().16b, \r1\().16b
+ trn2 \t9\().16b, \r0\().16b, \r1\().16b
+ trn1 \r1\().16b, \r2\().16b, \r3\().16b
+ trn2 \r3\().16b, \r2\().16b, \r3\().16b
+ trn1 \r0\().16b, \r4\().16b, \r5\().16b
+ trn2 \r5\().16b, \r4\().16b, \r5\().16b
+ trn1 \r2\().16b, \r6\().16b, \r7\().16b
+ trn2 \r7\().16b, \r6\().16b, \r7\().16b
+
+ trn1 \r4\().8h, \r0\().8h, \r2\().8h
+ trn2 \r2\().8h, \r0\().8h, \r2\().8h
+ trn1 \r6\().8h, \r5\().8h, \r7\().8h
+ trn2 \r7\().8h, \r5\().8h, \r7\().8h
+ trn1 \r5\().8h, \t9\().8h, \r3\().8h
+ trn2 \t9\().8h, \t9\().8h, \r3\().8h
+ trn1 \r3\().8h, \t8\().8h, \r1\().8h
+ trn2 \t8\().8h, \t8\().8h, \r1\().8h
+
+ trn1 \r0\().4s, \r3\().4s, \r4\().4s
+ trn2 \r4\().4s, \r3\().4s, \r4\().4s
+ trn1 \r1\().4s, \r5\().4s, \r6\().4s
+ trn2 \r5\().4s, \r5\().4s, \r6\().4s
+ trn2 \r6\().4s, \t8\().4s, \r2\().4s
+ trn1 \r2\().4s, \t8\().4s, \r2\().4s
+ trn1 \r3\().4s, \t9\().4s, \r7\().4s
+ trn2 \r7\().4s, \t9\().4s, \r7\().4s
+.endm
+
+.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().16b, \r0\().16b, \r1\().16b
+ trn2 \t5\().16b, \r0\().16b, \r1\().16b
+ trn1 \t6\().16b, \r2\().16b, \r3\().16b
+ trn2 \t7\().16b, \r2\().16b, \r3\().16b
+
+ trn1 \r0\().8h, \t4\().8h, \t6\().8h
+ trn2 \r2\().8h, \t4\().8h, \t6\().8h
+ trn1 \r1\().8h, \t5\().8h, \t7\().8h
+ trn2 \r3\().8h, \t5\().8h, \t7\().8h
+.endm
+
+.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().4h, \r0\().4h, \r1\().4h
+ trn2 \t5\().4h, \r0\().4h, \r1\().4h
+ trn1 \t6\().4h, \r2\().4h, \r3\().4h
+ trn2 \t7\().4h, \r2\().4h, \r3\().4h
+
+ trn1 \r0\().2s, \t4\().2s, \t6\().2s
+ trn2 \r2\().2s, \t4\().2s, \t6\().2s
+ trn1 \r1\().2s, \t5\().2s, \t7\().2s
+ trn2 \r3\().2s, \t5\().2s, \t7\().2s
+.endm
+
+.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().4s, \r0\().4s, \r1\().4s
+ trn2 \t5\().4s, \r0\().4s, \r1\().4s
+ trn1 \t6\().4s, \r2\().4s, \r3\().4s
+ trn2 \t7\().4s, \r2\().4s, \r3\().4s
+
+ trn1 \r0\().2d, \t4\().2d, \t6\().2d
+ trn2 \r2\().2d, \t4\().2d, \t6\().2d
+ trn1 \r1\().2d, \t5\().2d, \t7\().2d
+ trn2 \r3\().2d, \t5\().2d, \t7\().2d
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \r0\().4s, \t4\().4s, \t6\().4s
+ trn2 \r2\().4s, \t4\().4s, \t6\().4s
+ trn1 \r1\().4s, \t5\().4s, \t7\().4s
+ trn2 \r3\().4s, \t5\().4s, \t7\().4s
+.endm
+
+#endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/third_party/dav1d/src/arm/asm-offsets.h b/third_party/dav1d/src/arm/asm-offsets.h
new file mode 100644
index 0000000000..2f3c3caa1f
--- /dev/null
+++ b/third_party/dav1d/src/arm/asm-offsets.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ARM_ASM_OFFSETS_H
+#define ARM_ASM_OFFSETS_H
+
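+/* Byte offsets of selected Dav1dFilmGrainData fields, for use from the
+ * NEON film grain assembly; src/arm/filmgrain.h verifies them against the
+ * actual struct layout with CHECK_OFFSET(). */
+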
+#define FGD_SEED 0
+#define FGD_AR_COEFF_LAG 92
+#define FGD_AR_COEFFS_Y 96
+#define FGD_AR_COEFFS_UV 120
+#define FGD_AR_COEFF_SHIFT 176
+#define FGD_GRAIN_SCALE_SHIFT 184
+
+#define FGD_SCALING_SHIFT 88
+#define FGD_UV_MULT 188
+#define FGD_UV_LUMA_MULT 196
+#define FGD_UV_OFFSET 204
+#define FGD_CLIP_TO_RESTRICTED_RANGE 216
+
+#endif /* ARM_ASM_OFFSETS_H */
diff --git a/third_party/dav1d/src/arm/asm.S b/third_party/dav1d/src/arm/asm.S
new file mode 100644
index 0000000000..dc50415f1f
--- /dev/null
+++ b/third_party/dav1d/src/arm/asm.S
@@ -0,0 +1,291 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_ASM_S
+#define DAV1D_SRC_ARM_ASM_S
+
+#include "config.h"
+
+#if ARCH_AARCH64
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+
+/* Support macros for
+ * - Armv8.3-A Pointer Authentication and
+ * - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ * .global f
+ * f:
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ * .globl return_zero
+ * return_zero:
+ * AARCH64_VALID_CALL_TARGET
+ * mov x0, #0
+ * ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ * .globl with_early_jump
+ * with_early_jump:
+ * AARCH64_VALID_CALL_TARGET
+ * cmp x0, #128
+ * b.lt .Lwith_early_jump_128
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * .Lwith_early_jump_128:
+ * ...
+ * ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing pads that serve as targets of both
+ * jumps and calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ *   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+ */
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc'
+#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c'
+#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET
+#define AARCH64_VALID_CALL_TARGET
+#define AARCH64_VALID_JUMP_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
+#define AARCH64_SIGN_LINK_REGISTER paciasp
+#define AARCH64_VALIDATE_LINK_REGISTER autiasp
+#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+#else
+#error Pointer authentication defines no valid key!
+#endif
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
+#error Authentication of leaf functions is enabled but not supported in dav1d!
+#endif
+#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+
+#elif defined(__APPLE__) && defined(__arm64e__)
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+
+#else /* __ARM_FEATURE_PAC_DEFAULT */
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER
+#define AARCH64_VALIDATE_LINK_REGISTER
+
+#endif /* !__ARM_FEATURE_PAC_DEFAULT */
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
+ .pushsection .note.gnu.property, "a"
+ .balign 8
+ .long 4
+ .long 0x10
+ .long 0x5
+ .asciz "GNU"
+ .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+ .long 4
+ .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
+ .long 0
+ .popsection
+#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
+#endif /* ARCH_AARCH64 */
+
+#if ARCH_ARM
+ .syntax unified
+#ifdef __ELF__
+ .arch armv7-a
+ .fpu neon
+ .eabi_attribute 10, 0 // suppress Tag_FP_arch
+ .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch
+ .section .note.GNU-stack,"",%progbits // Mark stack as non-executable
+#endif /* __ELF__ */
+
+#ifdef _WIN32
+#define CONFIG_THUMB 1
+#else
+#define CONFIG_THUMB 0
+#endif
+
+#if CONFIG_THUMB
+ .thumb
+#define A @
+#define T
+#else
+#define A
+#define T @
+#endif /* CONFIG_THUMB */
+#endif /* ARCH_ARM */
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX dav1d_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
+.macro function name, export=0, align=2
+ .macro endfunc
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+#if HAVE_AS_FUNC
+ .endfunc
+#endif
+ .purgem endfunc
+ .endm
+ .text
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .type EXTERN\name, %function
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+#if HAVE_AS_FUNC
+ .func EXTERN\name
+#endif
+EXTERN\name:
+ .else
+#ifdef __ELF__
+ .type \name, %function
+#endif
+#if HAVE_AS_FUNC
+ .func \name
+#endif
+ .endif
+\name:
+#if ARCH_AARCH64
+ .if \export
+ AARCH64_VALID_CALL_TARGET
+ .endif
+#endif
+.endm
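+
+/* Typical use (e.g. the NEON msac functions):
+ *
+ *   function msac_decode_bool_neon, export=1
+ *       ...
+ *       ret
+ *   endfunc
+ */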
+
+.macro const name, export=0, align=2
+ .macro endconst
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .purgem endconst
+ .endm
+#if defined(_WIN32)
+ .section .rdata
+#elif !defined(__MACH__)
+ .section .rodata
+#else
+ .const_data
+#endif
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .endif
+\name:
+.endm
+
+#ifdef __APPLE__
+#define L(x) L ## x
+#else
+#define L(x) .L ## x
+#endif
+
+#define X(x) CONCAT(EXTERN, x)
+
+
+#endif /* DAV1D_SRC_ARM_ASM_S */
diff --git a/third_party/dav1d/src/arm/cdef.h b/third_party/dav1d/src/arm/cdef.h
new file mode 100644
index 0000000000..2e8c8ab6fb
--- /dev/null
+++ b/third_party/dav1d/src/arm/cdef.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
+
+void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
+ ptrdiff_t src_stride, const pixel (*left)[2],
+ const pixel *const top,
+ const pixel *const bottom, int h,
+ enum CdefEdgeFlags edges);
+
+// The edges value is passed to these functions so they can switch to a more
+// optimized code path for the fully-edged case. It is typed as size_t to
+// avoid ABI differences when passing more than one argument on the stack.
+void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
+ const uint16_t *tmp, int pri_strength,
+ int sec_strength, int dir, int damping, int h,
+ size_t edges HIGHBD_DECL_SUFFIX);
+
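+// Each generated wrapper below chains the two NEON stages: the padding
+// function copies the source block and its available borders into a 16-bit
+// temporary with a 2-row/8-element margin (hence tmp_buf + 2 * tmp_stride + 8),
+// and the filter function then reads from that padded buffer while writing
+// the filtered pixels back to dst.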
+#define DEFINE_FILTER(w, h, tmp_stride) \
+static void \
+cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, const int sec_strength, \
+ const int dir, const int damping, \
+ const enum CdefEdgeFlags edges \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
+ BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \
+ left, top, bottom, h, edges); \
+ BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \
+ sec_strength, dir, damping, h, edges \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+DEFINE_FILTER(8, 8, 16)
+DEFINE_FILTER(4, 8, 8)
+DEFINE_FILTER(4, 4, 8)
+
+static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->dir = BF(dav1d_cdef_find_dir, neon);
+ c->fb[0] = cdef_filter_8x8_neon;
+ c->fb[1] = cdef_filter_4x8_neon;
+ c->fb[2] = cdef_filter_4x4_neon;
+}
diff --git a/third_party/dav1d/src/arm/cpu.c b/third_party/dav1d/src/arm/cpu.c
new file mode 100644
index 0000000000..b7a0d3adbc
--- /dev/null
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/arm/cpu.h"
+
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+// NEON is always available; runtime tests are not needed.
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#include <sys/auxv.h>
+
+#ifndef HWCAP_ARM_NEON
+#define HWCAP_ARM_NEON (1 << 12)
+#endif
+#define NEON_HWCAP HWCAP_ARM_NEON
+
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+#include <sys/auxv.h>
+
+#define NEON_HWCAP HWCAP_NEON
+
+#elif defined(__ANDROID__)
+#include <stdio.h>
+#include <string.h>
+
+static unsigned parse_proc_cpuinfo(const char *flag) {
+ FILE *file = fopen("/proc/cpuinfo", "r");
+ if (!file)
+ return 0;
+
+ char line_buffer[120];
+ const char *line;
+
+ while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
+ if (strstr(line, flag)) {
+ fclose(file);
+ return 1;
+ }
+ // if the line is incomplete, seek back to avoid splitting the
+ // search string across two buffers
+ if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+ // use fseek, since the 64-bit fseeko is only available since
+ // Android API level 24 and meson defines _FILE_OFFSET_BITS
+ // to 64 by default
+ if (fseek(file, -strlen(flag), SEEK_CUR))
+ break;
+ }
+ }
+
+ fclose(file);
+
+ return 0;
+}
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+ unsigned flags = 0;
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+ flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+ flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+ unsigned long hw_cap = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+ flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(__ANDROID__)
+ flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#endif
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/arm/cpu.h b/third_party/dav1d/src/arm/cpu.h
new file mode 100644
index 0000000000..8c10a1b6b0
--- /dev/null
+++ b/third_party/dav1d/src/arm/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_CPU_H
+#define DAV1D_SRC_ARM_CPU_H
+
+enum CpuFlags {
+ DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_arm(void);
+
+#endif /* DAV1D_SRC_ARM_CPU_H */
diff --git a/third_party/dav1d/src/arm/filmgrain.h b/third_party/dav1d/src/arm/filmgrain.h
new file mode 100644
index 0000000000..48776ac852
--- /dev/null
+++ b/third_party/dav1d/src/arm/filmgrain.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2021, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+#include "asm-offsets.h"
+
+CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV);
+CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT);
+
+CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT);
+CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET);
+CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE);
+
+void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX);
+
+#define GEN_GRAIN_UV(suff) \
+void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, \
+ const intptr_t uv \
+ HIGHBD_DECL_SUFFIX)
+
+GEN_GRAIN_UV(420);
+GEN_GRAIN_UV(422);
+GEN_GRAIN_UV(444);
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// same layout of parameters on the stack across platforms.
+void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
+ const pixel *const src,
+ const ptrdiff_t stride,
+ const uint8_t scaling[SCALING_SIZE],
+ const int scaling_shift,
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[][2],
+ const int h, const ptrdiff_t clip,
+ const ptrdiff_t type
+ HIGHBD_DECL_SUFFIX);
+
+static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride,
+ const Dav1dFilmGrainData *const data, const size_t pw,
+ const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in BLOCK_SIZE^2 blocks
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ int type = 0;
+ if (data->overlap_flag && row_num)
+ type |= 1; /* overlap y */
+ if (data->overlap_flag && bx)
+ type |= 2; /* overlap x */
+
+ BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride,
+ scaling, data->scaling_shift,
+ grain_lut, offsets, bh,
+ data->clip_to_restricted_range, type
+ HIGHBD_TAIL_SUFFIX);
+ }
+}
+
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// same layout of parameters on the stack across platforms.
+#define FGUV(nm, sx, sy) \
+void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
+ const pixel *const src, \
+ const ptrdiff_t stride, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const Dav1dFilmGrainData *const data, \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, \
+ const int offsets[][2], \
+ const ptrdiff_t h, const ptrdiff_t uv, \
+ const ptrdiff_t is_id, \
+ const ptrdiff_t type \
+ HIGHBD_DECL_SUFFIX); \
+static void \
+fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
+ const size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], const int bh, \
+ const int row_num, const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, const int uv, const int is_id \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ const int rows = 1 + (data->overlap_flag && row_num > 0); \
+ \
+ /* seed[0] contains the current row, seed[1] contains the previous */ \
+ unsigned seed[2]; \
+ for (int i = 0; i < rows; i++) { \
+ seed[i] = data->seed; \
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \
+ } \
+ \
+ int offsets[2 /* col offset */][2 /* row offset */]; \
+ \
+ /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
+ if (data->overlap_flag && bx) { \
+ /* shift previous offsets left */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[1][i] = offsets[0][i]; \
+ } \
+ \
+ /* update current offsets */ \
+ for (int i = 0; i < rows; i++) \
+ offsets[0][i] = get_random_number(8, &seed[i]); \
+ \
+ int type = 0; \
+ if (data->overlap_flag && row_num) \
+ type |= 1; /* overlap y */ \
+ if (data->overlap_flag && bx) \
+ type |= 2; /* overlap x */ \
+ if (data->chroma_scaling_from_luma) \
+ type |= 4; \
+ \
+ BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \
+ scaling, data, grain_lut, \
+ luma_row + (bx << sx), luma_stride, \
+ offsets, bh, uv, is_id, type \
+ HIGHBD_TAIL_SUFFIX); \
+ } \
+}
+
+FGUV(420, 1, 1);
+FGUV(422, 1, 0);
+FGUV(444, 0, 0);
+
+static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon);
+
+ c->fgy_32x32xn = fgy_32x32xn_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
+}
diff --git a/third_party/dav1d/src/arm/ipred.h b/third_party/dav1d/src/arm/ipred.h
new file mode 100644
index 0000000000..e849d4998b
--- /dev/null
+++ b/third_party/dav1d/src/arm/ipred.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
+
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
+
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
+
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
+
+#if ARCH_AARCH64
+void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz,
+ const pixel *const in,
+ const int end HIGHBD_DECL_SUFFIX);
+void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
+ const pixel *const in,
+ const int end, const int strength);
+void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
+ const int n);
+void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top, const int width,
+ const int height, const int dx,
+ const int max_base_x);
+void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const top, const int width,
+ const int height, const int dx,
+ const int max_base_x);
+
+static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ int dx = dav1d_dr_intra_derivative[angle >> 1];
+ pixel top_out[64 + 64 + (64+15)*2 + 16];
+ int max_base_x;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, 90 - angle, is_sm) : 0;
+ if (upsample_above) {
+ BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height,
+ topleft_in,
+ width + imin(width, height)
+ HIGHBD_TAIL_SUFFIX);
+ max_base_x = 2 * (width + height) - 2;
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 90 - angle, is_sm) : 0;
+ if (filter_strength) {
+ BF(dav1d_ipred_z1_filter_edge, neon)(top_out, width + height,
+ topleft_in,
+ width + imin(width, height),
+ filter_strength);
+ max_base_x = width + height - 1;
+ } else {
+ max_base_x = width + imin(width, height) - 1;
+ memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel));
+ }
+ }
+ const int base_inc = 1 + upsample_above;
+ int pad_pixels = width + 15; // max(dx >> 6) == 15
+ BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
+ top_out[max_base_x], pad_pixels * base_inc);
+ if (upsample_above)
+ BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
+ dx, max_base_x);
+ else
+ BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height,
+ dx, max_base_x);
+}
+
+void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const left, const int width,
+ const int height, const int dy,
+ const int max_base_y);
+void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride,
+ const pixel *const left, const int width,
+ const int height, const int dy,
+ const int max_base_y);
+
+void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src,
+ const int n);
+
+static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 180);
+ int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
+ pixel flipped[64 + 64 + 16];
+ pixel left_out[64 + 64 + (64+15)*2];
+ int max_base_y;
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 180, is_sm) : 0;
+ if (upsample_left) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height + imax(width, height));
+ BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height,
+ flipped,
+ height + imin(width, height)
+ HIGHBD_TAIL_SUFFIX);
+ max_base_y = 2 * (width + height) - 2;
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 180, is_sm) : 0;
+
+ if (filter_strength) {
+ flipped[0] = topleft_in[0];
+ BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
+ height + imax(width, height));
+ BF(dav1d_ipred_z1_filter_edge, neon)(left_out, width + height,
+ flipped,
+ height + imin(width, height),
+ filter_strength);
+ max_base_y = width + height - 1;
+ } else {
+ BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0],
+ height + imin(width, height));
+ max_base_y = height + imin(width, height) - 1;
+ }
+ }
+ const int base_inc = 1 + upsample_left;
+ // The tbx-based implementation needs left[] to have 64 bytes initialized,
+ // while the other implementation can read height + max(dy >> 6) past the end.
+ int pad_pixels = imax(64 - max_base_y - 1, height + 15);
+
+ BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1],
+ left_out[max_base_y], pad_pixels * base_inc);
+ if (upsample_left)
+ BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
+ dy, max_base_y);
+ else
+ BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height,
+ dy, max_base_y);
+}
+#endif
+
+static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
+ c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
+ c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
+ c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon);
+ c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon);
+ c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon);
+ c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon);
+ c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
+ c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+ c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+#if ARCH_AARCH64
+ c->intra_pred[Z1_PRED] = ipred_z1_neon;
+ c->intra_pred[Z3_PRED] = ipred_z3_neon;
+#endif
+ c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon);
+
+ c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon);
+ c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon);
+ c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon);
+ c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon);
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
+
+ c->pal_pred = BF(dav1d_pal_pred, neon);
+}
diff --git a/third_party/dav1d/src/arm/itx.h b/third_party/dav1d/src/arm/itx.h
new file mode 100644
index 0000000000..2ecd086b3b
--- /dev/null
+++ b/third_party/dav1d/src/arm/itx.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+decl_itx17_fns( 4, 4, neon);
+decl_itx16_fns( 4, 8, neon);
+decl_itx16_fns( 4, 16, neon);
+decl_itx16_fns( 8, 4, neon);
+decl_itx16_fns( 8, 8, neon);
+decl_itx16_fns( 8, 16, neon);
+decl_itx2_fns ( 8, 32, neon);
+decl_itx16_fns(16, 4, neon);
+decl_itx16_fns(16, 8, neon);
+decl_itx12_fns(16, 16, neon);
+decl_itx2_fns (16, 32, neon);
+decl_itx2_fns (32, 8, neon);
+decl_itx2_fns (32, 16, neon);
+decl_itx2_fns (32, 32, neon);
+
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
+
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ if (BITDEPTH == 16 && bpc != 10) return;
+
+ assign_itx17_fn( , 4, 4, neon);
+ assign_itx16_fn(R, 4, 8, neon);
+ assign_itx16_fn(R, 4, 16, neon);
+ assign_itx16_fn(R, 8, 4, neon);
+ assign_itx16_fn( , 8, 8, neon);
+ assign_itx16_fn(R, 8, 16, neon);
+ assign_itx2_fn (R, 8, 32, neon);
+ assign_itx16_fn(R, 16, 4, neon);
+ assign_itx16_fn(R, 16, 8, neon);
+ assign_itx12_fn( , 16, 16, neon);
+ assign_itx2_fn (R, 16, 32, neon);
+ assign_itx1_fn (R, 16, 64, neon);
+ assign_itx2_fn (R, 32, 8, neon);
+ assign_itx2_fn (R, 32, 16, neon);
+ assign_itx2_fn ( , 32, 32, neon);
+ assign_itx1_fn (R, 32, 64, neon);
+ assign_itx1_fn (R, 64, 16, neon);
+ assign_itx1_fn (R, 64, 32, neon);
+ assign_itx1_fn ( , 64, 64, neon);
+}
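
The assign_itx*_fn() macros above are shorthand for filling a two-dimensional dispatch table indexed by transform size and transform type. Below is a condensed sketch of that pattern; the type, enum, and function names are simplified placeholders, not dav1d's real declarations.

#include <stddef.h>
#include <stdint.h>

typedef void (*itxfm_fn_ex)(uint8_t *dst, ptrdiff_t stride,
                            int16_t *coeff, int eob);

enum { EX_TX_4X4, EX_N_TX_SIZES };
enum { EX_DCT_DCT, EX_IDTX, EX_N_TX_TYPES };

typedef struct { itxfm_fn_ex itxfm_add[EX_N_TX_SIZES][EX_N_TX_TYPES]; } ExItxDsp;

/* stand-in for an assembly kernel such as the 4x4 dct_dct NEON function */
static void ex_add_dct_dct_4x4(uint8_t *dst, ptrdiff_t stride,
                               int16_t *coeff, int eob) {
    (void)dst; (void)stride; (void)coeff; (void)eob;
}

#define ex_assign_itx_fn(c, size, type, fn) ((c)->itxfm_add[size][type] = (fn))

static void ex_itx_dsp_init(ExItxDsp *const c) {
    /* corresponds to one assign_itx_fn(..., dct_dct, DCT_DCT, neon) line */
    ex_assign_itx_fn(c, EX_TX_4X4, EX_DCT_DCT, ex_add_dct_dct_4x4);
}
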
diff --git a/third_party/dav1d/src/arm/loopfilter.h b/third_party/dav1d/src/arm/loopfilter.h
new file mode 100644
index 0000000000..9ac08d94d2
--- /dev/null
+++ b/third_party/dav1d/src/arm/loopfilter.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
+
+static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
+}
diff --git a/third_party/dav1d/src/arm/looprestoration.h b/third_party/dav1d/src/arm/looprestoration.h
new file mode 100644
index 0000000000..7993dbff68
--- /dev/null
+++ b/third_party/dav1d/src/arm/looprestoration.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if ARCH_AARCH64
+void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+#else
+
+// The 8bpc version calculates things slightly differently than the reference
+// C version. The 8bpc NEON version calculates roughly this:
+// int16_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += src[idx] * fh[i];
+// int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
+// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
+// sum += 1 << (bitdepth + 6 - round_bits_h);
+// Compared to the reference C version, this is the output of the first pass
+// _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
+// with round_offset precompensated.
+// The 16bpc version calculates things pretty much the same way as the
+// reference C version, but with the end result subtracted by
+// 1 << (bitdepth + 6 - round_bits_h).
+void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[8], intptr_t w,
+ int h, enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX);
+// This calculates things slightly differently than the reference C version.
+// This version calculates roughly this:
+// int32_t sum = 0;
+// for (int i = 0; i < 7; i++)
+// sum += mid[idx] * fv[i];
+// sum = (sum + rounding_off_v) >> round_bits_v;
+// This function assumes that the width is a multiple of 8.
+void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[8], enum LrEdgeFlags edges,
+ ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
+
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+ ALIGN_STK_16(int16_t, mid, 68 * 384,);
+ int mid_stride = (w + 7) & ~7;
+
+ // Horizontal filter
+ BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
+ filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
+ filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
+ lpf + 6 * PXSTRIDE(stride),
+ stride, filter[0], w, 2, edges
+ HIGHBD_TAIL_SUFFIX);
+
+ // Vertical filter
+ BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
+ w, h, filter[1], edges,
+ mid_stride * sizeof(*mid)
+ HIGHBD_TAIL_SUFFIX);
+}
+#endif
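
As a cross-check of the two comments above, here is a scalar rendering of the 8 bpc horizontal pass for a single output sample. It assumes bitdepth = 8 and round_bits_h = 3 (which makes 1 << (bitdepth + 6 - round_bits_h) the 2048 mentioned above); the 7-tap indexing and the helper names are schematic, not dav1d API.

#include <stdint.h>

static int iclip_ex(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

static int wiener_h_one_sample(const uint8_t *src, int x, const int16_t fh[7]) {
    const int bitdepth = 8, round_bits_h = 3;
    const int rounding_off_h = 1 << (round_bits_h - 1);
    int sum = 0;
    for (int i = 0; i < 7; i++)               /* 7-tap horizontal filter */
        sum += src[x + i - 3] * fh[i];
    const int sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
    sum = iclip_ex(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
    sum += 1 << (bitdepth + 6 - round_bits_h);
    /* per the comment above, this equals the reference C first-pass output
     * minus 2048, i.e. with round_offset precompensated */
    return sum;
}
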
+
+void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 3x3 box (radius=1) */
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+ BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
+}
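
The sumsq/sum buffers above use a fixed row stride of 384 + 16 (width capacity plus edge padding), and the working pointers start two rows plus a small alignment pad into the backing arrays, so the two extra rows written when LR_HAVE_TOP is set land at negative row offsets. A minimal sketch of that layout, with names local to this example:

#include <stdint.h>

#define EX_SGR_STRIDE (384 + 16)   /* one row of box sums, incl. padding */

static int32_t sumsq_mem_ex[EX_SGR_STRIDE * 68 + 8];
/* start two rows in (plus the 8-element alignment pad), as above */
static int32_t *const sumsq_ex = &sumsq_mem_ex[EX_SGR_STRIDE * 2 + 8];

/* row r in -2..h+1, column x: rows -2/-1 hold the LR_HAVE_TOP data */
static int32_t *sumsq_at(int r, int x) {
    return &sumsq_ex[r * EX_SGR_STRIDE + x];
}
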
+
+void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
+ const pixel (*left)[4],
+ const pixel *src, const ptrdiff_t stride,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+ const int w, const int h,
+ const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+ const int w, const int h, const int strength,
+ const int bitdepth_max);
+void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const int32_t *a, const int16_t *b,
+ const int w, const int h);
+
+/* filter with a 5x5 box (radius=2) */
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
+ const pixel *src, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
+ const int w, const int h, const int strength,
+ const enum LrEdgeFlags edges
+ HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
+ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
+ ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
+ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
+
+ BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
+ NULL, lpf, stride, w, 2, edges);
+
+ if (edges & LR_HAVE_BOTTOM)
+ BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
+
+ dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+ dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
+ BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
+}
+
+void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int w, const int h,
+ const int wt HIGHBD_DECL_SUFFIX);
+void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *t1, const int16_t *t2,
+ const int w, const int h,
+ const int16_t wt[2] HIGHBD_DECL_SUFFIX);
+
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
+}
+
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
+ dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+ tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
+}
+
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
+ dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
+ w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
+ dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
+ w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
+ const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
+ BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
+ tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
+}
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if ARCH_AARCH64
+ c->wiener[0] = BF(dav1d_wiener_filter7, neon);
+ c->wiener[1] = BF(dav1d_wiener_filter5, neon);
+#else
+ c->wiener[0] = c->wiener[1] = wiener_filter_neon;
+#endif
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = sgr_filter_5x5_neon;
+ c->sgr[1] = sgr_filter_3x3_neon;
+ c->sgr[2] = sgr_filter_mix_neon;
+ }
+}
diff --git a/third_party/dav1d/src/arm/mc.h b/third_party/dav1d/src/arm/mc.h
new file mode 100644
index 0000000000..06cd533a9b
--- /dev/null
+++ b/third_party/dav1d/src/arm/mc.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/cpu.h"
+
+decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
+decl_mc_fn(BF(dav1d_put_bilin, neon));
+
+decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
+decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+decl_mct_fn(BF(dav1d_prep_bilin, neon));
+
+decl_avg_fn(BF(dav1d_avg, neon));
+decl_w_avg_fn(BF(dav1d_w_avg, neon));
+decl_mask_fn(BF(dav1d_mask, neon));
+decl_blend_fn(BF(dav1d_blend, neon));
+decl_blend_dir_fn(BF(dav1d_blend_h, neon));
+decl_blend_dir_fn(BF(dav1d_blend_v, neon));
+
+decl_w_mask_fn(BF(dav1d_w_mask_444, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_422, neon));
+decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
+
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
+static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+
+ c->avg = BF(dav1d_avg, neon);
+ c->w_avg = BF(dav1d_w_avg, neon);
+ c->mask = BF(dav1d_mask, neon);
+ c->blend = BF(dav1d_blend, neon);
+ c->blend_h = BF(dav1d_blend_h, neon);
+ c->blend_v = BF(dav1d_blend_v, neon);
+ c->w_mask[0] = BF(dav1d_w_mask_444, neon);
+ c->w_mask[1] = BF(dav1d_w_mask_422, neon);
+ c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+ c->emu_edge = BF(dav1d_emu_edge, neon);
+}
diff --git a/third_party/dav1d/src/arm/msac.h b/third_party/dav1d/src/arm/msac.h
new file mode 100644
index 0000000000..9db0bf86ae
--- /dev/null
+++ b/third_party/dav1d/src/arm/msac.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ARM_MSAC_H
+#define DAV1D_SRC_ARM_MSAC_H
+
+unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
+
+#if ARCH_AARCH64 || defined(__ARM_NEON)
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
+#endif
+
+#endif /* DAV1D_SRC_ARM_MSAC_H */
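
The #define block above routes the generic dav1d_msac_* entry points to the NEON implementations whenever NEON is guaranteed at compile time (always on AArch64, or on 32-bit ARM built with __ARM_NEON), so the entropy decoder needs no runtime dispatch. The same pattern in isolation, with purely hypothetical function names:

unsigned my_decode_bool_c(void *ctx);
unsigned my_decode_bool_neon(void *ctx);

#if defined(__aarch64__) || defined(__ARM_NEON)
#define my_decode_bool my_decode_bool_neon  /* call sites compile straight to NEON */
#else
#define my_decode_bool my_decode_bool_c
#endif
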
diff --git a/third_party/dav1d/src/arm/refmvs.h b/third_party/dav1d/src/arm/refmvs.h
new file mode 100644
index 0000000000..4c96fc5095
--- /dev/null
+++ b/third_party/dav1d/src/arm/refmvs.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_splat_mv_fn(dav1d_splat_mv_neon);
+
+static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+ c->splat_mv = dav1d_splat_mv_neon;
+}
diff --git a/third_party/dav1d/src/cdef.h b/third_party/dav1d/src/cdef.h
new file mode 100644
index 0000000000..07c84d9ff5
--- /dev/null
+++ b/third_party/dav1d/src/cdef.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDEF_H
+#define DAV1D_SRC_CDEF_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+enum CdefEdgeFlags {
+ CDEF_HAVE_LEFT = 1 << 0,
+ CDEF_HAVE_RIGHT = 1 << 1,
+ CDEF_HAVE_TOP = 1 << 2,
+ CDEF_HAVE_BOTTOM = 1 << 3,
+};
+
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row_2px)[2];
+#else
+typedef const void *const_left_pixel_row_2px;
+#endif
+
+// CDEF operates entirely on pre-filter data; if bottom/right edges are
+// present (according to $edges), then the pre-filter data is located in
+// $dst. However, the edge pixels above $dst may be post-filter, so in
+// order to get access to pre-filter top pixels, use $top.
+#define decl_cdef_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
+ const pixel *top, const pixel *bottom, \
+ int pri_strength, int sec_strength, \
+ int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
+typedef decl_cdef_fn(*cdef_fn);
+
+#define decl_cdef_dir_fn(name) \
+int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX)
+typedef decl_cdef_dir_fn(*cdef_dir_fn);
+
+typedef struct Dav1dCdefDSPContext {
+ cdef_dir_fn dir;
+ cdef_fn fb[3 /* 444/luma, 422, 420 */];
+} Dav1dCdefDSPContext;
+
+bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
+
+#endif /* DAV1D_SRC_CDEF_H */
diff --git a/third_party/dav1d/src/cdef_apply.h b/third_party/dav1d/src/cdef_apply.h
new file mode 100644
index 0000000000..a9748ee4f6
--- /dev/null
+++ b/third_party/dav1d/src/cdef_apply.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDEF_APPLY_H
+#define DAV1D_SRC_CDEF_APPLY_H
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+
+void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *tc, pixel *const p[3],
+ const Av1Filter *lflvl, int by_start, int by_end,
+ int sbrow_start, int sby);
+
+#endif /* DAV1D_SRC_CDEF_APPLY_H */
diff --git a/third_party/dav1d/src/cdef_apply_tmpl.c b/third_party/dav1d/src/cdef_apply_tmpl.c
new file mode 100644
index 0000000000..e2d8b83fc7
--- /dev/null
+++ b/third_party/dav1d/src/cdef_apply_tmpl.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+
+enum Backup2x8Flags {
+ BACKUP_2X8_Y = 1 << 0,
+ BACKUP_2X8_UV = 1 << 1,
+};
+
+static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
+ const ptrdiff_t stride[2],
+ const enum Dav1dPixelLayout layout)
+{
+ const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
+ if (y_stride < 0)
+ pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
+ else
+ pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
+
+ if (layout != DAV1D_PIXEL_LAYOUT_I400) {
+ const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
+ if (uv_stride < 0) {
+ const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
+ pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
+ pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
+ } else {
+ const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
+ pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
+ pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
+ }
+ }
+}
+
+static void backup2x8(pixel dst[3][8][2],
+ /*const*/ pixel *const src[3],
+ const ptrdiff_t src_stride[2], int x_off,
+ const enum Dav1dPixelLayout layout,
+ const enum Backup2x8Flags flag)
+{
+ ptrdiff_t y_off = 0;
+ if (flag & BACKUP_2X8_Y) {
+ for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
+ pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+ }
+
+ if (layout == DAV1D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV))
+ return;
+
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+ x_off >>= ss_hor;
+ y_off = 0;
+ for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
+ pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
+ pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
+ }
+}
+
+static int adjust_strength(const int strength, const unsigned var) {
+ if (!var) return 0;
+ const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
+ return (strength * (4 + i) + 8) >> 4;
+}
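
adjust_strength() scales the primary CDEF strength by the block's directional variance: zero variance disables primary filtering, low variance cuts the strength to roughly a quarter, and only variances around 2^18 and above (where ulog2(var >> 6) saturates at 12) restore full strength. A self-contained check of a few values, reimplementing the same formula with local helpers (ulog2 here is floor(log2), as in dav1d's intops):

#include <stdio.h>

static int imin_ex(int a, int b) { return a < b ? a : b; }
static int ulog2_ex(unsigned v) { int n = -1; while (v) { v >>= 1; n++; } return n; }

static int adjust_strength_ex(int strength, unsigned var) {
    if (!var) return 0;
    const int i = var >> 6 ? imin_ex(ulog2_ex(var >> 6), 12) : 0;
    return (strength * (4 + i) + 8) >> 4;
}

int main(void) {
    printf("%d\n", adjust_strength_ex(8, 0));       /* 0: flat block, no primary filtering */
    printf("%d\n", adjust_strength_ex(8, 100));     /* 2: low variance, strength scaled down */
    printf("%d\n", adjust_strength_ex(8, 1 << 18)); /* 8: high variance, full strength */
    return 0;
}
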
+
+void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *const tc,
+ pixel *const p[3],
+ const Av1Filter *const lflvl,
+ const int by_start, const int by_end,
+ const int sbrow_start, const int sby)
+{
+ Dav1dFrameContext *const f = (Dav1dFrameContext *)tc->f;
+ const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
+ pixel *ptrs[3] = { p[0], p[1], p[2] };
+ const int sbsz = 16;
+ const int sb64w = f->sb128w << 1;
+ const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
+ const enum Dav1dPixelLayout layout = f->cur.p.layout;
+ const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 7, 0, 2, 4, 5, 6, 6, 6 } };
+ const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422];
+ const int have_tt = f->c->n_tc > 1;
+ const int sb128 = f->seq_hdr->sb128;
+ const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const ptrdiff_t y_stride = PXSTRIDE(f->cur.stride[0]);
+ const ptrdiff_t uv_stride = PXSTRIDE(f->cur.stride[1]);
+
+ for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
+ const int tf = tc->top_pre_cdef_toggle;
+ const int by_idx = (by & 30) >> 1;
+ if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
+
+ if ((!have_tt || sbrow_start || by + 2 < by_end) &&
+ edges & CDEF_HAVE_BOTTOM)
+ {
+ // backup pre-filter data for next iteration
+ pixel *const cdef_top_bak[3] = {
+ f->lf.cdef_line[!tf][0] + have_tt * sby * 4 * y_stride,
+ f->lf.cdef_line[!tf][1] + have_tt * sby * 8 * uv_stride,
+ f->lf.cdef_line[!tf][2] + have_tt * sby * 8 * uv_stride
+ };
+ backup2lines(cdef_top_bak, ptrs, f->cur.stride, layout);
+ }
+
+ ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
+ pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
+ edges &= ~CDEF_HAVE_LEFT;
+ edges |= CDEF_HAVE_RIGHT;
+ enum Backup2x8Flags prev_flag = 0;
+ for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
+ const int sb128x = sbx >> 1;
+ const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
+ const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
+ if (cdef_idx == -1 ||
+ (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
+ !f->frame_hdr->cdef.uv_strength[cdef_idx]))
+ {
+ last_skip = 1;
+ goto next_sb;
+ }
+
+ // Create a complete 32-bit mask for the sb row ahead of time.
+ const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx];
+ const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 |
+ noskip_row[0][0];
+
+ const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
+ const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
+ const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
+
+ const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
+ int y_sec_lvl = y_lvl & 3;
+ y_sec_lvl += y_sec_lvl == 3;
+ y_sec_lvl <<= bitdepth_min_8;
+
+ const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
+ int uv_sec_lvl = uv_lvl & 3;
+ uv_sec_lvl += uv_sec_lvl == 3;
+ uv_sec_lvl <<= bitdepth_min_8;
+
+ pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
+ for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
+ bx += 2, edges |= CDEF_HAVE_LEFT)
+ {
+ if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT;
+
+ // check if this 8x8 block had any coded coefficients; if not,
+ // go to the next block
+ const uint32_t bx_mask = 3U << (bx & 30);
+ if (!(noskip_mask & bx_mask)) {
+ last_skip = 1;
+ goto next_b;
+ }
+ const int do_left = last_skip ? flag : (prev_flag ^ flag) & flag;
+ prev_flag = flag;
+ if (do_left && edges & CDEF_HAVE_LEFT) {
+ // we didn't backup the prefilter data because it wasn't
+ // there, so do it here instead
+ backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left);
+ }
+ if (edges & CDEF_HAVE_RIGHT) {
+ // backup pre-filter data for next iteration
+ backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
+ }
+
+ int dir;
+ unsigned variance;
+ if (y_pri_lvl || uv_pri_lvl)
+ dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
+ &variance HIGHBD_CALL_SUFFIX);
+
+ const pixel *top, *bot;
+ ptrdiff_t offset;
+
+ if (!have_tt) goto st_y;
+ if (sbrow_start && by == by_start) {
+ if (resize) {
+ offset = (sby - 1) * 4 * y_stride + bx * 4;
+ top = &f->lf.cdef_lpf_line[0][offset];
+ } else {
+ offset = (sby * (4 << sb128) - 4) * y_stride + bx * 4;
+ top = &f->lf.lr_lpf_line[0][offset];
+ }
+ bot = bptrs[0] + 8 * y_stride;
+ } else if (!sbrow_start && by + 2 >= by_end) {
+ top = &f->lf.cdef_line[tf][0][sby * 4 * y_stride + bx * 4];
+ if (resize) {
+ offset = (sby * 4 + 2) * y_stride + bx * 4;
+ bot = &f->lf.cdef_lpf_line[0][offset];
+ } else {
+ const int line = sby * (4 << sb128) + 4 * sb128 + 2;
+ offset = line * y_stride + bx * 4;
+ bot = &f->lf.lr_lpf_line[0][offset];
+ }
+ } else {
+ st_y:;
+ offset = sby * 4 * y_stride;
+ top = &f->lf.cdef_line[tf][0][have_tt * offset + bx * 4];
+ bot = bptrs[0] + 8 * y_stride;
+ }
+ if (y_pri_lvl) {
+ const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
+ if (adj_y_pri_lvl || y_sec_lvl)
+ dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
+ top, bot, adj_y_pri_lvl, y_sec_lvl,
+ dir, damping, edges HIGHBD_CALL_SUFFIX);
+ } else if (y_sec_lvl)
+ dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
+ top, bot, 0, y_sec_lvl, 0, damping,
+ edges HIGHBD_CALL_SUFFIX);
+
+ if (!uv_lvl) goto skip_uv;
+ assert(layout != DAV1D_PIXEL_LAYOUT_I400);
+
+ const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
+ for (int pl = 1; pl <= 2; pl++) {
+ if (!have_tt) goto st_uv;
+ if (sbrow_start && by == by_start) {
+ if (resize) {
+ offset = (sby - 1) * 4 * uv_stride + (bx * 4 >> ss_hor);
+ top = &f->lf.cdef_lpf_line[pl][offset];
+ } else {
+ const int line = sby * (4 << sb128) - 4;
+ offset = line * uv_stride + (bx * 4 >> ss_hor);
+ top = &f->lf.lr_lpf_line[pl][offset];
+ }
+ bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
+ } else if (!sbrow_start && by + 2 >= by_end) {
+ const ptrdiff_t top_offset = sby * 8 * uv_stride +
+ (bx * 4 >> ss_hor);
+ top = &f->lf.cdef_line[tf][pl][top_offset];
+ if (resize) {
+ offset = (sby * 4 + 2) * uv_stride + (bx * 4 >> ss_hor);
+ bot = &f->lf.cdef_lpf_line[pl][offset];
+ } else {
+ const int line = sby * (4 << sb128) + 4 * sb128 + 2;
+ offset = line * uv_stride + (bx * 4 >> ss_hor);
+ bot = &f->lf.lr_lpf_line[pl][offset];
+ }
+ } else {
+ st_uv:;
+ const ptrdiff_t offset = sby * 8 * uv_stride;
+ top = &f->lf.cdef_line[tf][pl][have_tt * offset + (bx * 4 >> ss_hor)];
+ bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
+ }
+ dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
+ lr_bak[bit][pl], top, bot,
+ uv_pri_lvl, uv_sec_lvl, uvdir,
+ damping - 1, edges HIGHBD_CALL_SUFFIX);
+ }
+
+ skip_uv:
+ bit ^= 1;
+ last_skip = 0;
+
+ next_b:
+ bptrs[0] += 8;
+ bptrs[1] += 8 >> ss_hor;
+ bptrs[2] += 8 >> ss_hor;
+ }
+
+ next_sb:
+ iptrs[0] += sbsz * 4;
+ iptrs[1] += sbsz * 4 >> ss_hor;
+ iptrs[2] += sbsz * 4 >> ss_hor;
+ }
+
+ ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
+ ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+ ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+ tc->top_pre_cdef_toggle ^= 1;
+ }
+}
diff --git a/third_party/dav1d/src/cdef_tmpl.c b/third_party/dav1d/src/cdef_tmpl.c
new file mode 100644
index 0000000000..59439457a1
--- /dev/null
+++ b/third_party/dav1d/src/cdef_tmpl.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/tables.h"
+
+static inline int constrain(const int diff, const int threshold,
+ const int shift)
+{
+ const int adiff = abs(diff);
+ return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff);
+}
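
constrain() is the clamping primitive used by the CDEF taps below: a neighbour's difference from the centre pixel passes through while small, tapers off once |diff| >> shift eats into the threshold, and is discarded entirely beyond that, always keeping the sign of the original difference. A standalone copy with a few sample evaluations (the helper names are local; apply_sign follows dav1d's intops definition):

#include <stdio.h>
#include <stdlib.h>

static int imin_ex(int a, int b) { return a < b ? a : b; }
static int imax_ex(int a, int b) { return a > b ? a : b; }
static int apply_sign_ex(int v, int s) { return s < 0 ? -v : v; }

static int constrain_ex(int diff, int threshold, int shift) {
    const int adiff = abs(diff);
    return apply_sign_ex(imin_ex(adiff, imax_ex(0, threshold - (adiff >> shift))), diff);
}

int main(void) {
    /* threshold = 4, shift = 2: small differences pass through,
     * larger ones are tapered, very large ones are ignored */
    printf("%d\n", constrain_ex(  3, 4, 2)); /*  3 */
    printf("%d\n", constrain_ex( 10, 4, 2)); /*  2 */
    printf("%d\n", constrain_ex(-20, 4, 2)); /*  0 */
    return 0;
}
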
+
+static inline void fill(int16_t *tmp, const ptrdiff_t stride,
+ const int w, const int h)
+{
+ /* Use a value that's a large positive number when interpreted as unsigned,
+ * and a large negative number when interpreted as signed. */
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ tmp[x] = INT16_MIN;
+ tmp += stride;
+ }
+}
+
+static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const pixel (*left)[2],
+ const pixel *top, const pixel *bottom,
+ const int w, const int h, const enum CdefEdgeFlags edges)
+{
+ // fill extended input buffer
+ int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
+ if (!(edges & CDEF_HAVE_TOP)) {
+ fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2);
+ y_start = 0;
+ }
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2);
+ y_end -= 2;
+ }
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start);
+ x_start = 0;
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start);
+ x_end -= 2;
+ }
+
+ for (int y = y_start; y < 0; y++) {
+ for (int x = x_start; x < x_end; x++)
+ tmp[x + y * tmp_stride] = top[x];
+ top += PXSTRIDE(src_stride);
+ }
+ for (int y = 0; y < h; y++)
+ for (int x = x_start; x < 0; x++)
+ tmp[x + y * tmp_stride] = left[y][2 + x];
+ for (int y = 0; y < h; y++) {
+ for (int x = (y < h) ? 0 : x_start; x < x_end; x++)
+ tmp[x] = src[x];
+ src += PXSTRIDE(src_stride);
+ tmp += tmp_stride;
+ }
+ for (int y = h; y < y_end; y++) {
+ for (int x = x_start; x < x_end; x++)
+ tmp[x] = bottom[x];
+ bottom += PXSTRIDE(src_stride);
+ tmp += tmp_stride;
+ }
+
+}
+
+static NOINLINE void
+cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2],
+ const pixel *const top, const pixel *const bottom,
+ const int pri_strength, const int sec_strength,
+ const int dir, const int damping, const int w, int h,
+ const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const ptrdiff_t tmp_stride = 12;
+ assert((w == 4 || w == 8) && (h == 4 || h == 8));
+ int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
+ int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
+
+ padding(tmp, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
+ if (pri_strength) {
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
+ const int pri_shift = imax(0, damping - ulog2(pri_strength));
+ if (sec_strength) {
+ const int sec_shift = damping - ulog2(sec_strength);
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = dst[x];
+ int sum = 0;
+ int max = px, min = px;
+ int pri_tap_k = pri_tap;
+ for (int k = 0; k < 2; k++) {
+ const int off1 = dav1d_cdef_directions[dir + 2][k]; // dir
+ const int p0 = tmp[x + off1];
+ const int p1 = tmp[x - off1];
+ sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+ sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+ // if pri_tap_k == 4 then it becomes 2 else it remains 3
+ pri_tap_k = (pri_tap_k & 3) | 2;
+ min = umin(p0, min);
+ max = imax(p0, max);
+ min = umin(p1, min);
+ max = imax(p1, max);
+ const int off2 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+ const int off3 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+ const int s0 = tmp[x + off2];
+ const int s1 = tmp[x - off2];
+ const int s2 = tmp[x + off3];
+ const int s3 = tmp[x - off3];
+ // sec_tap starts at 2 and becomes 1
+ const int sec_tap = 2 - k;
+ sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+ min = umin(s0, min);
+ max = imax(s0, max);
+ min = umin(s1, min);
+ max = imax(s1, max);
+ min = umin(s2, min);
+ max = imax(s2, max);
+ min = umin(s3, min);
+ max = imax(s3, max);
+ }
+ dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += tmp_stride;
+ } while (--h);
+ } else { // pri_strength only
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = dst[x];
+ int sum = 0;
+ int pri_tap_k = pri_tap;
+ for (int k = 0; k < 2; k++) {
+ const int off = dav1d_cdef_directions[dir + 2][k]; // dir
+ const int p0 = tmp[x + off];
+ const int p1 = tmp[x - off];
+ sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift);
+ sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift);
+ pri_tap_k = (pri_tap_k & 3) | 2;
+ }
+ dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += tmp_stride;
+ } while (--h);
+ }
+ } else { // sec_strength only
+ assert(sec_strength);
+ const int sec_shift = damping - ulog2(sec_strength);
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = dst[x];
+ int sum = 0;
+ for (int k = 0; k < 2; k++) {
+ const int off1 = dav1d_cdef_directions[dir + 4][k]; // dir + 2
+ const int off2 = dav1d_cdef_directions[dir + 0][k]; // dir - 2
+ const int s0 = tmp[x + off1];
+ const int s1 = tmp[x - off1];
+ const int s2 = tmp[x + off2];
+ const int s3 = tmp[x - off2];
+ const int sec_tap = 2 - k;
+ sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift);
+ sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift);
+ }
+ dst[x] = px + ((sum - (sum < 0) + 8) >> 4);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += tmp_stride;
+ } while (--h);
+ }
+}
+
+#define cdef_fn(w, h) \
+static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
+ const ptrdiff_t stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ cdef_filter_block_c(dst, stride, left, top, bottom, \
+ pri_strength, sec_strength, dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
+}
+
+cdef_fn(4, 4);
+cdef_fn(4, 8);
+cdef_fn(8, 8);
+
+static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
+ unsigned *const var HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ int partial_sum_hv[2][8] = { { 0 } };
+ int partial_sum_diag[2][15] = { { 0 } };
+ int partial_sum_alt[4][11] = { { 0 } };
+
+ for (int y = 0; y < 8; y++) {
+ for (int x = 0; x < 8; x++) {
+ const int px = (img[x] >> bitdepth_min_8) - 128;
+
+ partial_sum_diag[0][ y + x ] += px;
+ partial_sum_alt [0][ y + (x >> 1)] += px;
+ partial_sum_hv [0][ y ] += px;
+ partial_sum_alt [1][3 + y - (x >> 1)] += px;
+ partial_sum_diag[1][7 + y - x ] += px;
+ partial_sum_alt [2][3 - (y >> 1) + x ] += px;
+ partial_sum_hv [1][ x ] += px;
+ partial_sum_alt [3][ (y >> 1) + x ] += px;
+ }
+ img += PXSTRIDE(stride);
+ }
+
+ unsigned cost[8] = { 0 };
+ for (int n = 0; n < 8; n++) {
+ cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
+ cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
+ }
+ cost[2] *= 105;
+ cost[6] *= 105;
+
+ static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
+ for (int n = 0; n < 7; n++) {
+ const int d = div_table[n];
+ cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] +
+ partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
+ cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] +
+ partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
+ }
+ cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105;
+ cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105;
+
+ for (int n = 0; n < 4; n++) {
+ unsigned *const cost_ptr = &cost[n * 2 + 1];
+ for (int m = 0; m < 5; m++)
+ *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m];
+ *cost_ptr *= 105;
+ for (int m = 0; m < 3; m++) {
+ const int d = div_table[2 * m + 1];
+ *cost_ptr += (partial_sum_alt[n][m] * partial_sum_alt[n][m] +
+ partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d;
+ }
+ }
+
+ int best_dir = 0;
+ unsigned best_cost = cost[0];
+ for (int n = 1; n < 8; n++) {
+ if (cost[n] > best_cost) {
+ best_cost = cost[n];
+ best_dir = n;
+ }
+ }
+
+ *var = (best_cost - (cost[best_dir ^ 4])) >> 10;
+ return best_dir;
+}
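
The tail of cdef_find_dir_c() picks the direction with the largest projected cost and derives the variance estimate from how much that cost exceeds the cost of the orthogonal direction (best_dir ^ 4), scaled down by 2^10. Here is that selection step in isolation, over an arbitrary made-up cost array:

#include <stdio.h>

int main(void) {
    unsigned cost[8] = { 1000, 52000, 3000, 4000, 9000, 2500, 7000, 1200 };
    int best_dir = 0;
    unsigned best_cost = cost[0];
    for (int n = 1; n < 8; n++) {
        if (cost[n] > best_cost) {
            best_cost = cost[n];
            best_dir = n;
        }
    }
    /* best_dir = 1; the orthogonal direction is best_dir ^ 4 = 5 */
    const unsigned var = (best_cost - cost[best_dir ^ 4]) >> 10;
    printf("dir=%d var=%u\n", best_dir, var); /* dir=1 var=(52000-2500)>>10 = 48 */
    return 0;
}
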
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/cdef.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/cdef.h"
+#elif ARCH_X86
+#include "src/x86/cdef.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
+ c->dir = cdef_find_dir_c;
+ c->fb[0] = cdef_filter_block_8x8_c;
+ c->fb[1] = cdef_filter_block_4x8_c;
+ c->fb[2] = cdef_filter_block_4x4_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ cdef_dsp_init_arm(c);
+#elif ARCH_PPC64LE
+ cdef_dsp_init_ppc(c);
+#elif ARCH_X86
+ cdef_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/cdf.c b/third_party/dav1d/src/cdf.c
new file mode 100644
index 0000000000..e0f2132e00
--- /dev/null
+++ b/third_party/dav1d/src/cdf.c
@@ -0,0 +1,4123 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/frame.h"
+
+#include "src/internal.h"
+#include "src/tables.h"
+
+#define CDF1(x) (32768-(x))
+
+#define CDF2(a,b) \
+ CDF1(a), CDF1(b)
+#define CDF3(a,b,c) \
+ CDF1(a), CDF2(b,c)
+#define CDF4(a,b,c,d) \
+ CDF1(a), CDF3(b,c,d)
+#define CDF5(a,b,c,d,e) \
+ CDF1(a), CDF4(b,c,d,e)
+#define CDF6(a,b,c,d,e,f) \
+ CDF1(a), CDF5(b,c,d,e,f)
+#define CDF7(a,b,c,d,e,f,g) \
+ CDF1(a), CDF6(b,c,d,e,f,g)
+#define CDF8(a,b,c,d,e,f,g,h) \
+ CDF1(a), CDF7(b,c,d,e,f,g,h)
+#define CDF9(a,b,c,d,e,f,g,h,i) \
+ CDF1(a), CDF8(b,c,d,e,f,g,h,i)
+#define CDF10(a,b,c,d,e,f,g,h,i,j) \
+ CDF1(a), CDF9(b,c,d,e,f,g,h,i,j)
+#define CDF11(a,b,c,d,e,f,g,h,i,j,k) \
+ CDF1(a), CDF10(b,c,d,e,f,g,h,i,j,k)
+#define CDF12(a,b,c,d,e,f,g,h,i,j,k,l) \
+ CDF1(a), CDF11(b,c,d,e,f,g,h,i,j,k,l)
+#define CDF13(a,b,c,d,e,f,g,h,i,j,k,l,m) \
+ CDF1(a), CDF12(b,c,d,e,f,g,h,i,j,k,l,m)
+#define CDF14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) \
+ CDF1(a), CDF13(b,c,d,e,f,g,h,i,j,k,l,m,n)
+#define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
+ CDF1(a), CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o)
+
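+// The arguments to the CDFn() macros in the tables below are increasing
+// cumulative probabilities out of 32768; CDF1() stores 32768 minus each
+// value, i.e. the inverted form consumed by dav1d's msac entropy decoder.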
+static const CdfModeContext av1_default_cdf = {
+ .y_mode = {
+ { CDF12(22801, 23489, 24293, 24756, 25601, 26123,
+ 26606, 27418, 27945, 29228, 29685, 30349) },
+ { CDF12(18673, 19845, 22631, 23318, 23950, 24649,
+ 25527, 27364, 28152, 29701, 29984, 30852) },
+ { CDF12(19770, 20979, 23396, 23939, 24241, 24654,
+ 25136, 27073, 27830, 29360, 29730, 30659) },
+ { CDF12(20155, 21301, 22838, 23178, 23261, 23533,
+ 23703, 24804, 25352, 26575, 27016, 28049) },
+ }, .use_filter_intra = {
+ [BS_4x4] = { CDF1( 4621) },
+ [BS_4x8] = { CDF1( 6743) },
+ [BS_8x4] = { CDF1( 5893) },
+ [BS_8x8] = { CDF1( 7866) },
+ [BS_8x16] = { CDF1(12551) },
+ [BS_16x8] = { CDF1( 9394) },
+ [BS_16x16] = { CDF1(12408) },
+ [BS_16x32] = { CDF1(14301) },
+ [BS_32x16] = { CDF1(12756) },
+ [BS_32x32] = { CDF1(22343) },
+ [BS_32x64] = { CDF1(16384) },
+ [BS_64x32] = { CDF1(16384) },
+ [BS_64x64] = { CDF1(16384) },
+ [BS_64x128] = { CDF1(16384) },
+ [BS_128x64] = { CDF1(16384) },
+ [BS_128x128] = { CDF1(16384) },
+ [BS_4x16] = { CDF1(12770) },
+ [BS_16x4] = { CDF1(10368) },
+ [BS_8x32] = { CDF1(20229) },
+ [BS_32x8] = { CDF1(18101) },
+ [BS_16x64] = { CDF1(16384) },
+ [BS_64x16] = { CDF1(16384) },
+ }, .filter_intra = {
+ CDF4(8949, 12776, 17211, 29558),
+ }, .uv_mode = {
+ {
+ { CDF12(22631, 24152, 25378, 25661, 25986, 26520,
+ 27055, 27923, 28244, 30059, 30941, 31961) },
+ { CDF12( 9513, 26881, 26973, 27046, 27118, 27664,
+ 27739, 27824, 28359, 29505, 29800, 31796) },
+ { CDF12( 9845, 9915, 28663, 28704, 28757, 28780,
+ 29198, 29822, 29854, 30764, 31777, 32029) },
+ { CDF12(13639, 13897, 14171, 25331, 25606, 25727,
+ 25953, 27148, 28577, 30612, 31355, 32493) },
+ { CDF12( 9764, 9835, 9930, 9954, 25386, 27053,
+ 27958, 28148, 28243, 31101, 31744, 32363) },
+ { CDF12(11825, 13589, 13677, 13720, 15048, 29213,
+ 29301, 29458, 29711, 31161, 31441, 32550) },
+ { CDF12(14175, 14399, 16608, 16821, 17718, 17775,
+ 28551, 30200, 30245, 31837, 32342, 32667) },
+ { CDF12(12885, 13038, 14978, 15590, 15673, 15748,
+ 16176, 29128, 29267, 30643, 31961, 32461) },
+ { CDF12(12026, 13661, 13874, 15305, 15490, 15726,
+ 15995, 16273, 28443, 30388, 30767, 32416) },
+ { CDF12(19052, 19840, 20579, 20916, 21150, 21467,
+ 21885, 22719, 23174, 28861, 30379, 32175) },
+ { CDF12(18627, 19649, 20974, 21219, 21492, 21816,
+ 22199, 23119, 23527, 27053, 31397, 32148) },
+ { CDF12(17026, 19004, 19997, 20339, 20586, 21103,
+ 21349, 21907, 22482, 25896, 26541, 31819) },
+ { CDF12(12124, 13759, 14959, 14992, 15007, 15051,
+ 15078, 15166, 15255, 15753, 16039, 16606) },
+ }, {
+ { CDF13(10407, 11208, 12900, 13181, 13823, 14175, 14899,
+ 15656, 15986, 20086, 20995, 22455, 24212) },
+ { CDF13( 4532, 19780, 20057, 20215, 20428, 21071, 21199,
+ 21451, 22099, 24228, 24693, 27032, 29472) },
+ { CDF13( 5273, 5379, 20177, 20270, 20385, 20439, 20949,
+ 21695, 21774, 23138, 24256, 24703, 26679) },
+ { CDF13( 6740, 7167, 7662, 14152, 14536, 14785, 15034,
+ 16741, 18371, 21520, 22206, 23389, 24182) },
+ { CDF13( 4987, 5368, 5928, 6068, 19114, 20315, 21857,
+ 22253, 22411, 24911, 25380, 26027, 26376) },
+ { CDF13( 5370, 6889, 7247, 7393, 9498, 21114, 21402,
+ 21753, 21981, 24780, 25386, 26517, 27176) },
+ { CDF13( 4816, 4961, 7204, 7326, 8765, 8930, 20169,
+ 20682, 20803, 23188, 23763, 24455, 24940) },
+ { CDF13( 6608, 6740, 8529, 9049, 9257, 9356, 9735,
+ 18827, 19059, 22336, 23204, 23964, 24793) },
+ { CDF13( 5998, 7419, 7781, 8933, 9255, 9549, 9753,
+ 10417, 18898, 22494, 23139, 24764, 25989) },
+ { CDF13(10660, 11298, 12550, 12957, 13322, 13624, 14040,
+ 15004, 15534, 20714, 21789, 23443, 24861) },
+ { CDF13(10522, 11530, 12552, 12963, 13378, 13779, 14245,
+ 15235, 15902, 20102, 22696, 23774, 25838) },
+ { CDF13(10099, 10691, 12639, 13049, 13386, 13665, 14125,
+ 15163, 15636, 19676, 20474, 23519, 25208) },
+ { CDF13( 3144, 5087, 7382, 7504, 7593, 7690, 7801,
+ 8064, 8232, 9248, 9875, 10521, 29048) },
+ },
+ }, .angle_delta = {
+ { CDF6( 2180, 5032, 7567, 22776, 26989, 30217) },
+ { CDF6( 2301, 5608, 8801, 23487, 26974, 30330) },
+ { CDF6( 3780, 11018, 13699, 19354, 23083, 31286) },
+ { CDF6( 4581, 11226, 15147, 17138, 21834, 28397) },
+ { CDF6( 1737, 10927, 14509, 19588, 22745, 28823) },
+ { CDF6( 2664, 10176, 12485, 17650, 21600, 30495) },
+ { CDF6( 2240, 11096, 15453, 20341, 22561, 28917) },
+ { CDF6( 3605, 10428, 12459, 17676, 21244, 30655) },
+ }, .filter = {
+ {
+ { CDF2(31935, 32720) }, { CDF2( 5568, 32719) },
+ { CDF2( 422, 2938) }, { CDF2(28244, 32608) },
+ { CDF2(31206, 31953) }, { CDF2( 4862, 32121) },
+ { CDF2( 770, 1152) }, { CDF2(20889, 25637) },
+ }, {
+ { CDF2(31910, 32724) }, { CDF2( 4120, 32712) },
+ { CDF2( 305, 2247) }, { CDF2(27403, 32636) },
+ { CDF2(31022, 32009) }, { CDF2( 2963, 32093) },
+ { CDF2( 601, 943) }, { CDF2(14969, 21398) },
+ },
+ }, .newmv_mode = {
+ { CDF1(24035) }, { CDF1(16630) }, { CDF1(15339) },
+ { CDF1( 8386) }, { CDF1(12222) }, { CDF1( 4676) },
+ }, .globalmv_mode = {
+ { CDF1( 2175) }, { CDF1( 1054) },
+ }, .refmv_mode = {
+ { CDF1(23974) }, { CDF1(24188) }, { CDF1(17848) },
+ { CDF1(28622) }, { CDF1(24312) }, { CDF1(19923) },
+ }, .drl_bit = {
+ { CDF1(13104) }, { CDF1(24560) }, { CDF1(18945) },
+ }, .comp_inter_mode = {
+ { CDF7( 7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+ { CDF7(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+ { CDF7(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+ { CDF7(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+ { CDF7(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+ { CDF7(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+ { CDF7(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+ { CDF7(13046, 23214, 24505, 25942, 27435, 28442, 29330) },
+ }, .intra = {
+ { CDF1( 806) }, { CDF1(16662) }, { CDF1(20186) },
+ { CDF1(26538) },
+ }, .comp = {
+ { CDF1(26828) }, { CDF1(24035) }, { CDF1(12031) },
+ { CDF1(10640) }, { CDF1( 2901) },
+ }, .comp_dir = {
+ { CDF1( 1198) }, { CDF1( 2070) }, { CDF1( 9166) },
+ { CDF1( 7499) }, { CDF1(22475) },
+ }, .jnt_comp = {
+ { CDF1(18244) }, { CDF1(12865) }, { CDF1( 7053) },
+ { CDF1(13259) }, { CDF1( 9334) }, { CDF1( 4644) },
+ }, .mask_comp = {
+ { CDF1(26607) }, { CDF1(22891) }, { CDF1(18840) },
+ { CDF1(24594) }, { CDF1(19934) }, { CDF1(22674) },
+ }, .wedge_comp = {
+ { CDF1(23431) }, { CDF1(13171) }, { CDF1(11470) },
+ { CDF1( 9770) }, { CDF1( 9100) }, { CDF1( 8233) },
+ { CDF1( 6172) }, { CDF1(11820) }, { CDF1( 7701) },
+ }, .wedge_idx = {
+ { CDF15( 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094,
+ 20359, 22362, 24127, 25702, 27752, 29450, 31171) },
+ { CDF15( 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588,
+ 16323, 17367, 18452, 19422, 22839, 26127, 29629) },
+ { CDF15( 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357,
+ 17939, 21332, 24520, 27470, 29456, 30529, 31656) },
+ { CDF15( 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144,
+ 19163, 20961, 22884, 24471, 26719, 28714, 30877) },
+ { CDF15( 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624,
+ 15369, 16730, 18114, 19313, 22521, 26012, 29550) },
+ { CDF15( 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124,
+ 17270, 20533, 23434, 25972, 27944, 29570, 31416) },
+ { CDF15( 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944,
+ 20638, 22038, 23963, 25311, 26988, 28766, 31012) },
+ { CDF15( 154, 987, 1925, 2051, 2088, 2111, 2151, 23033,
+ 23703, 24284, 24985, 25684, 27259, 28883, 30911) },
+ { CDF15( 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016,
+ 22935, 25057, 27251, 29173, 30089, 30960, 31933) },
+ }, .interintra = {
+ { CDF1(16384) }, { CDF1(26887) }, { CDF1(27597) },
+ { CDF1(30237) },
+ }, .interintra_mode = {
+ { CDF3(8192, 16384, 24576) },
+ { CDF3(1875, 11082, 27332) },
+ { CDF3(2473, 9996, 26388) },
+ { CDF3(4238, 11537, 25926) },
+ }, .interintra_wedge = {
+ { CDF1(20036) }, { CDF1(24957) }, { CDF1(26704) },
+ { CDF1(27530) }, { CDF1(29564) }, { CDF1(29444) },
+ { CDF1(26872) },
+ }, .ref = {
+ { { CDF1( 4897) }, { CDF1(16973) }, { CDF1(29744) } },
+ { { CDF1( 1555) }, { CDF1(16751) }, { CDF1(30279) } },
+ { { CDF1( 4236) }, { CDF1(19647) }, { CDF1(31194) } },
+ { { CDF1( 8650) }, { CDF1(24773) }, { CDF1(31895) } },
+ { { CDF1( 904) }, { CDF1(11014) }, { CDF1(26875) } },
+ { { CDF1( 1444) }, { CDF1(15087) }, { CDF1(30304) } },
+ }, .comp_fwd_ref = {
+ { { CDF1( 4946) }, { CDF1(19891) }, { CDF1(30731) } },
+ { { CDF1( 9468) }, { CDF1(22441) }, { CDF1(31059) } },
+ { { CDF1( 1503) }, { CDF1(15160) }, { CDF1(27544) } },
+ }, .comp_bwd_ref = {
+ { { CDF1( 2235) }, { CDF1(17182) }, { CDF1(30606) } },
+ { { CDF1( 1423) }, { CDF1(15175) }, { CDF1(30489) } },
+ }, .comp_uni_ref = {
+ { { CDF1( 5284) }, { CDF1(23152) }, { CDF1(31774) } },
+ { { CDF1( 3865) }, { CDF1(14173) }, { CDF1(25120) } },
+ { { CDF1( 3128) }, { CDF1(15270) }, { CDF1(26710) } },
+ }, .txsz = {
+ {
+ { CDF1(19968) }, { CDF1(19968) }, { CDF1(24320) },
+ }, {
+ { CDF2(12272, 30172) }, { CDF2(12272, 30172) },
+ { CDF2(18677, 30848) },
+ }, {
+ { CDF2(12986, 15180) }, { CDF2(12986, 15180) },
+ { CDF2(24302, 25602) },
+ }, {
+ { CDF2( 5782, 11475) }, { CDF2( 5782, 11475) },
+ { CDF2(16803, 22759) },
+ },
+ }, .txpart = {
+ { { CDF1(28581) }, { CDF1(23846) }, { CDF1(20847) } },
+ { { CDF1(24315) }, { CDF1(18196) }, { CDF1(12133) } },
+ { { CDF1(18791) }, { CDF1(10887) }, { CDF1(11005) } },
+ { { CDF1(27179) }, { CDF1(20004) }, { CDF1(11281) } },
+ { { CDF1(26549) }, { CDF1(19308) }, { CDF1(14224) } },
+ { { CDF1(28015) }, { CDF1(21546) }, { CDF1(14400) } },
+ { { CDF1(28165) }, { CDF1(22401) }, { CDF1(16088) } },
+ }, .txtp_inter1 = {
+ { CDF15( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266,
+ 21504, 22848, 23934, 25474, 27727, 28915, 30631) },
+ { CDF15( 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357,
+ 17674, 20408, 22517, 25010, 27116, 28856, 30749) },
+ }, .txtp_inter2 = {
+ CDF11( 770, 2421, 5225, 12907, 15819, 18927,
+ 21561, 24089, 26595, 28526, 30529)
+ }, .txtp_inter3 = {
+ { CDF1(16384) }, { CDF1( 4167) }, { CDF1( 1998) }, { CDF1( 748) },
+ }, .txtp_intra1 = {
+ {
+ { CDF6( 1535, 8035, 9461, 12751, 23467, 27825) },
+ { CDF6( 564, 3335, 9709, 10870, 18143, 28094) },
+ { CDF6( 672, 3247, 3676, 11982, 19415, 23127) },
+ { CDF6( 5279, 13885, 15487, 18044, 23527, 30252) },
+ { CDF6( 4423, 6074, 7985, 10416, 25693, 29298) },
+ { CDF6( 1486, 4241, 9460, 10662, 16456, 27694) },
+ { CDF6( 439, 2838, 3522, 6737, 18058, 23754) },
+ { CDF6( 1190, 4233, 4855, 11670, 20281, 24377) },
+ { CDF6( 1045, 4312, 8647, 10159, 18644, 29335) },
+ { CDF6( 202, 3734, 4747, 7298, 17127, 24016) },
+ { CDF6( 447, 4312, 6819, 8884, 16010, 23858) },
+ { CDF6( 277, 4369, 5255, 8905, 16465, 22271) },
+ { CDF6( 3409, 5436, 10599, 15599, 19687, 24040) },
+ }, {
+ { CDF6( 1870, 13742, 14530, 16498, 23770, 27698) },
+ { CDF6( 326, 8796, 14632, 15079, 19272, 27486) },
+ { CDF6( 484, 7576, 7712, 14443, 19159, 22591) },
+ { CDF6( 1126, 15340, 15895, 17023, 20896, 30279) },
+ { CDF6( 655, 4854, 5249, 5913, 22099, 27138) },
+ { CDF6( 1299, 6458, 8885, 9290, 14851, 25497) },
+ { CDF6( 311, 5295, 5552, 6885, 16107, 22672) },
+ { CDF6( 883, 8059, 8270, 11258, 17289, 21549) },
+ { CDF6( 741, 7580, 9318, 10345, 16688, 29046) },
+ { CDF6( 110, 7406, 7915, 9195, 16041, 23329) },
+ { CDF6( 363, 7974, 9357, 10673, 15629, 24474) },
+ { CDF6( 153, 7647, 8112, 9936, 15307, 19996) },
+ { CDF6( 3511, 6332, 11165, 15335, 19323, 23594) },
+ },
+ }, .txtp_intra2 = {
+ {
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ }, {
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ { CDF4( 6554, 13107, 19661, 26214) },
+ }, {
+ { CDF4( 1127, 12814, 22772, 27483) },
+ { CDF4( 145, 6761, 11980, 26667) },
+ { CDF4( 362, 5887, 11678, 16725) },
+ { CDF4( 385, 15213, 18587, 30693) },
+ { CDF4( 25, 2914, 23134, 27903) },
+ { CDF4( 60, 4470, 11749, 23991) },
+ { CDF4( 37, 3332, 14511, 21448) },
+ { CDF4( 157, 6320, 13036, 17439) },
+ { CDF4( 119, 6719, 12906, 29396) },
+ { CDF4( 47, 5537, 12576, 21499) },
+ { CDF4( 269, 6076, 11258, 23115) },
+ { CDF4( 83, 5615, 12001, 17228) },
+ { CDF4( 1968, 5556, 12023, 18547) },
+ },
+ }, .skip = {
+ { CDF1(31671) }, { CDF1(16515) }, { CDF1( 4576) },
+ }, .skip_mode = {
+ { CDF1(32621) }, { CDF1(20708) }, { CDF1( 8127) },
+ }, .partition = {
+ {
+ // 128x128 -> 64x64
+ { CDF7(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+ { CDF7( 6607, 6990, 8268, 32060, 32219, 32338, 32371) },
+ { CDF7( 5429, 6676, 7122, 32027, 32227, 32531, 32582) },
+ { CDF7( 711, 966, 1172, 32448, 32538, 32617, 32664) },
+ }, {
+ // 64x64 -> 32x32
+ { CDF9(20137, 21547, 23078, 29566, 29837,
+ 30261, 30524, 30892, 31724) },
+ { CDF9( 6732, 7490, 9497, 27944, 28250,
+ 28515, 28969, 29630, 30104) },
+ { CDF9( 5945, 7663, 8348, 28683, 29117,
+ 29749, 30064, 30298, 32238) },
+ { CDF9( 870, 1212, 1487, 31198, 31394,
+ 31574, 31743, 31881, 32332) },
+ }, {
+ // 32x32 -> 16x16
+ { CDF9(18462, 20920, 23124, 27647, 28227,
+ 29049, 29519, 30178, 31544) },
+ { CDF9( 7689, 9060, 12056, 24992, 25660,
+ 26182, 26951, 28041, 29052) },
+ { CDF9( 6015, 9009, 10062, 24544, 25409,
+ 26545, 27071, 27526, 32047) },
+ { CDF9( 1394, 2208, 2796, 28614, 29061,
+ 29466, 29840, 30185, 31899) },
+ }, {
+ // 16x16 -> 8x8
+ { CDF9(15597, 20929, 24571, 26706, 27664,
+ 28821, 29601, 30571, 31902) },
+ { CDF9( 7925, 11043, 16785, 22470, 23971,
+ 25043, 26651, 28701, 29834) },
+ { CDF9( 5414, 13269, 15111, 20488, 22360,
+ 24500, 25537, 26336, 32117) },
+ { CDF9( 2662, 6362, 8614, 20860, 23053,
+ 24778, 26436, 27829, 31171) },
+ }, {
+ // 8x8 -> 4x4 only supports the four legacy partition types
+ { CDF3(19132, 25510, 30392) },
+ { CDF3(13928, 19855, 28540) },
+ { CDF3(12522, 23679, 28629) },
+ { CDF3( 9896, 18783, 25853) },
+ },
+ }, .seg_pred = {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ }, .seg_id = {
+ { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) },
+ { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) },
+ { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) },
+ }, .cfl_sign = {
+ CDF7( 1418, 2123, 13340, 18405, 26972, 28343, 32294)
+ }, .cfl_alpha = {
+ { CDF15( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696,
+ 32700, 32704, 32708, 32712, 32716, 32720, 32724) },
+ { CDF15(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573,
+ 32620, 32647, 32668, 32672, 32676, 32680, 32684) },
+ { CDF15(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649,
+ 32673, 32677, 32681, 32685, 32689, 32693, 32697) },
+ { CDF15(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704,
+ 32708, 32712, 32716, 32720, 32724, 32728, 32732) },
+ { CDF15(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321,
+ 32394, 32464, 32516, 32560, 32576, 32593, 32622) },
+ { CDF15(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843,
+ 32144, 32413, 32520, 32594, 32622, 32656, 32660) },
+ }, .restore_wiener = {
+ CDF1(11570)
+ }, .restore_sgrproj = {
+ CDF1(16855)
+ }, .restore_switchable = {
+ CDF2( 9413, 22581)
+ }, .delta_q = {
+ CDF3(28160, 32120, 32677)
+ }, .delta_lf = {
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ { CDF3(28160, 32120, 32677) },
+ }, .motion_mode = {
+ [BS_8x8] = { CDF2( 7651, 24760) },
+ [BS_8x16] = { CDF2( 4738, 24765) },
+ [BS_8x32] = { CDF2(28799, 31390) },
+ [BS_16x8] = { CDF2( 5391, 25528) },
+ [BS_16x16] = { CDF2(19419, 26810) },
+ [BS_16x32] = { CDF2( 5123, 23606) },
+ [BS_16x64] = { CDF2(28973, 31594) },
+ [BS_32x8] = { CDF2(26431, 30774) },
+ [BS_32x16] = { CDF2(11606, 24308) },
+ [BS_32x32] = { CDF2(26260, 29116) },
+ [BS_32x64] = { CDF2(20360, 28062) },
+ [BS_64x16] = { CDF2(29742, 31203) },
+ [BS_64x32] = { CDF2(21679, 26830) },
+ [BS_64x64] = { CDF2(29516, 30701) },
+ [BS_64x128] = { CDF2(28898, 30397) },
+ [BS_128x64] = { CDF2(30878, 31335) },
+ [BS_128x128] = { CDF2(32507, 32558) },
+ }, .obmc = {
+ [BS_8x8] = { CDF1(10437) },
+ [BS_8x16] = { CDF1( 9371) },
+ [BS_8x32] = { CDF1(23664) },
+ [BS_16x8] = { CDF1( 9301) },
+ [BS_16x16] = { CDF1(17432) },
+ [BS_16x32] = { CDF1(14423) },
+ [BS_16x64] = { CDF1(24008) },
+ [BS_32x8] = { CDF1(20901) },
+ [BS_32x16] = { CDF1(15142) },
+ [BS_32x32] = { CDF1(25817) },
+ [BS_32x64] = { CDF1(22823) },
+ [BS_64x16] = { CDF1(26879) },
+ [BS_64x32] = { CDF1(22083) },
+ [BS_64x64] = { CDF1(30128) },
+ [BS_64x128] = { CDF1(31014) },
+ [BS_128x64] = { CDF1(31560) },
+ [BS_128x128] = { CDF1(32638) },
+ }, .pal_y = {
+ { { CDF1(31676) }, { CDF1( 3419) }, { CDF1( 1261) } },
+ { { CDF1(31912) }, { CDF1( 2859) }, { CDF1( 980) } },
+ { { CDF1(31823) }, { CDF1( 3400) }, { CDF1( 781) } },
+ { { CDF1(32030) }, { CDF1( 3561) }, { CDF1( 904) } },
+ { { CDF1(32309) }, { CDF1( 7337) }, { CDF1( 1462) } },
+ { { CDF1(32265) }, { CDF1( 4015) }, { CDF1( 1521) } },
+ { { CDF1(32450) }, { CDF1( 7946) }, { CDF1( 129) } },
+ }, .pal_sz = {
+ {
+ { CDF6( 7952, 13000, 18149, 21478, 25527, 29241) },
+ { CDF6( 7139, 11421, 16195, 19544, 23666, 28073) },
+ { CDF6( 7788, 12741, 17325, 20500, 24315, 28530) },
+ { CDF6( 8271, 14064, 18246, 21564, 25071, 28533) },
+ { CDF6(12725, 19180, 21863, 24839, 27535, 30120) },
+ { CDF6( 9711, 14888, 16923, 21052, 25661, 27875) },
+ { CDF6(14940, 20797, 21678, 24186, 27033, 28999) },
+ }, {
+ { CDF6( 8713, 19979, 27128, 29609, 31331, 32272) },
+ { CDF6( 5839, 15573, 23581, 26947, 29848, 31700) },
+ { CDF6( 4426, 11260, 17999, 21483, 25863, 29430) },
+ { CDF6( 3228, 9464, 14993, 18089, 22523, 27420) },
+ { CDF6( 3768, 8886, 13091, 17852, 22495, 27207) },
+ { CDF6( 2464, 8451, 12861, 21632, 25525, 28555) },
+ { CDF6( 1269, 5435, 10433, 18963, 21700, 25865) },
+ },
+ }, .pal_uv = {
+ { CDF1(32461) }, { CDF1(21488) },
+ }, .color_map = {
+ { /* y */
+ {
+ { CDF1(28710) }, { CDF1(16384) }, { CDF1(10553) },
+ { CDF1(27036) }, { CDF1(31603) },
+ }, {
+ { CDF2(27877, 30490) }, { CDF2(11532, 25697) },
+ { CDF2( 6544, 30234) }, { CDF2(23018, 28072) },
+ { CDF2(31915, 32385) },
+ }, {
+ { CDF3(25572, 28046, 30045) },
+ { CDF3( 9478, 21590, 27256) },
+ { CDF3( 7248, 26837, 29824) },
+ { CDF3(19167, 24486, 28349) },
+ { CDF3(31400, 31825, 32250) },
+ }, {
+ { CDF4(24779, 26955, 28576, 30282) },
+ { CDF4( 8669, 20364, 24073, 28093) },
+ { CDF4( 4255, 27565, 29377, 31067) },
+ { CDF4(19864, 23674, 26716, 29530) },
+ { CDF4(31646, 31893, 32147, 32426) },
+ }, {
+ { CDF5(23132, 25407, 26970, 28435, 30073) },
+ { CDF5( 7443, 17242, 20717, 24762, 27982) },
+ { CDF5( 6300, 24862, 26944, 28784, 30671) },
+ { CDF5(18916, 22895, 25267, 27435, 29652) },
+ { CDF5(31270, 31550, 31808, 32059, 32353) },
+ }, {
+ { CDF6(23105, 25199, 26464, 27684, 28931, 30318) },
+ { CDF6( 6950, 15447, 18952, 22681, 25567, 28563) },
+ { CDF6( 7560, 23474, 25490, 27203, 28921, 30708) },
+ { CDF6(18544, 22373, 24457, 26195, 28119, 30045) },
+ { CDF6(31198, 31451, 31670, 31882, 32123, 32391) },
+ }, {
+ { CDF7(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+ { CDF7( 6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+ { CDF7( 5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+ { CDF7(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+ { CDF7(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+ },
+ }, { /* uv */
+ {
+ { CDF1(29089) }, { CDF1(16384) }, { CDF1( 8713) },
+ { CDF1(29257) }, { CDF1(31610) },
+ }, {
+ { CDF2(25257, 29145) }, { CDF2(12287, 27293) },
+ { CDF2( 7033, 27960) }, { CDF2(20145, 25405) },
+ { CDF2(30608, 31639) },
+ }, {
+ { CDF3(24210, 27175, 29903) },
+ { CDF3( 9888, 22386, 27214) },
+ { CDF3( 5901, 26053, 29293) },
+ { CDF3(18318, 22152, 28333) },
+ { CDF3(30459, 31136, 31926) },
+ }, {
+ { CDF4(22980, 25479, 27781, 29986) },
+ { CDF4( 8413, 21408, 24859, 28874) },
+ { CDF4( 2257, 29449, 30594, 31598) },
+ { CDF4(19189, 21202, 25915, 28620) },
+ { CDF4(31844, 32044, 32281, 32518) },
+ }, {
+ { CDF5(22217, 24567, 26637, 28683, 30548) },
+ { CDF5( 7307, 16406, 19636, 24632, 28424) },
+ { CDF5( 4441, 25064, 26879, 28942, 30919) },
+ { CDF5(17210, 20528, 23319, 26750, 29582) },
+ { CDF5(30674, 30953, 31396, 31735, 32207) },
+ }, {
+ { CDF6(21239, 23168, 25044, 26962, 28705, 30506) },
+ { CDF6( 6545, 15012, 18004, 21817, 25503, 28701) },
+ { CDF6( 3448, 26295, 27437, 28704, 30126, 31442) },
+ { CDF6(15889, 18323, 21704, 24698, 26976, 29690) },
+ { CDF6(30988, 31204, 31479, 31734, 31983, 32325) },
+ }, {
+ { CDF7(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+ { CDF7( 5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+ { CDF7( 3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+ { CDF7(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+ { CDF7(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+ },
+ },
+ }, .intrabc = {
+ CDF1(30531)
+ },
+};
+
+static const CdfMvComponent default_mv_component_cdf = {
+ .classes = {
+ CDF10(28672, 30976, 31858, 32320, 32551,
+ 32656, 32740, 32757, 32762, 32767)
+ }, .class0 = {
+ CDF1(27648)
+ }, .classN = {
+ { CDF1(17408) }, { CDF1(17920) }, { CDF1(18944) },
+ { CDF1(20480) }, { CDF1(22528) }, { CDF1(24576) },
+ { CDF1(28672) }, { CDF1(29952) }, { CDF1(29952) },
+ { CDF1(30720) },
+ }, .class0_fp = {
+ { CDF3(16384, 24576, 26624) },
+ { CDF3(12288, 21248, 24128) },
+ }, .classN_fp = {
+ CDF3( 8192, 17408, 21248)
+ }, .class0_hp = {
+ CDF1(20480)
+ }, .classN_hp = {
+ CDF1(16384)
+ }, .sign = {
+ CDF1(16384)
+ },
+};
+
+static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = {
+ CDF3( 4096, 11264, 19328)
+};
+
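+// Keyframe y-mode probabilities are conditioned on the intra modes of the
+// above and left neighbours, each mapped to one of 5 contexts, hence the
+// 5x5 outer dimensions.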
+static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = {
+ {
+ { CDF12(15588, 17027, 19338, 20218, 20682, 21110,
+ 21825, 23244, 24189, 28165, 29093, 30466) },
+ { CDF12(12016, 18066, 19516, 20303, 20719, 21444,
+ 21888, 23032, 24434, 28658, 30172, 31409) },
+ { CDF12(10052, 10771, 22296, 22788, 23055, 23239,
+ 24133, 25620, 26160, 29336, 29929, 31567) },
+ { CDF12(14091, 15406, 16442, 18808, 19136, 19546,
+ 19998, 22096, 24746, 29585, 30958, 32462) },
+ { CDF12(12122, 13265, 15603, 16501, 18609, 20033,
+ 22391, 25583, 26437, 30261, 31073, 32475) },
+ }, {
+ { CDF12(10023, 19585, 20848, 21440, 21832, 22760,
+ 23089, 24023, 25381, 29014, 30482, 31436) },
+ { CDF12( 5983, 24099, 24560, 24886, 25066, 25795,
+ 25913, 26423, 27610, 29905, 31276, 31794) },
+ { CDF12( 7444, 12781, 20177, 20728, 21077, 21607,
+ 22170, 23405, 24469, 27915, 29090, 30492) },
+ { CDF12( 8537, 14689, 15432, 17087, 17408, 18172,
+ 18408, 19825, 24649, 29153, 31096, 32210) },
+ { CDF12( 7543, 14231, 15496, 16195, 17905, 20717,
+ 21984, 24516, 26001, 29675, 30981, 31994) },
+ }, {
+ { CDF12(12613, 13591, 21383, 22004, 22312, 22577,
+ 23401, 25055, 25729, 29538, 30305, 32077) },
+ { CDF12( 9687, 13470, 18506, 19230, 19604, 20147,
+ 20695, 22062, 23219, 27743, 29211, 30907) },
+ { CDF12( 6183, 6505, 26024, 26252, 26366, 26434,
+ 27082, 28354, 28555, 30467, 30794, 32086) },
+ { CDF12(10718, 11734, 14954, 17224, 17565, 17924,
+ 18561, 21523, 23878, 28975, 30287, 32252) },
+ { CDF12( 9194, 9858, 16501, 17263, 18424, 19171,
+ 21563, 25961, 26561, 30072, 30737, 32463) },
+ }, {
+ { CDF12(12602, 14399, 15488, 18381, 18778, 19315,
+ 19724, 21419, 25060, 29696, 30917, 32409) },
+ { CDF12( 8203, 13821, 14524, 17105, 17439, 18131,
+ 18404, 19468, 25225, 29485, 31158, 32342) },
+ { CDF12( 8451, 9731, 15004, 17643, 18012, 18425,
+ 19070, 21538, 24605, 29118, 30078, 32018) },
+ { CDF12( 7714, 9048, 9516, 16667, 16817, 16994,
+ 17153, 18767, 26743, 30389, 31536, 32528) },
+ { CDF12( 8843, 10280, 11496, 15317, 16652, 17943,
+ 19108, 22718, 25769, 29953, 30983, 32485) },
+ }, {
+ { CDF12(12578, 13671, 15979, 16834, 19075, 20913,
+ 22989, 25449, 26219, 30214, 31150, 32477) },
+ { CDF12( 9563, 13626, 15080, 15892, 17756, 20863,
+ 22207, 24236, 25380, 29653, 31143, 32277) },
+ { CDF12( 8356, 8901, 17616, 18256, 19350, 20106,
+ 22598, 25947, 26466, 29900, 30523, 32261) },
+ { CDF12(10835, 11815, 13124, 16042, 17018, 18039,
+ 18947, 22753, 24615, 29489, 30883, 32482) },
+ { CDF12( 7618, 8288, 9859, 10509, 15386, 18657,
+ 22903, 28776, 29180, 31355, 31802, 32593) },
+ },
+};
+
+static const CdfCoefContext av1_default_coef_cdf[4] = {
+ [0] = {
+ .skip = {
+ {
+ { CDF1(31849) }, { CDF1( 5892) }, { CDF1(12112) },
+ { CDF1(21935) }, { CDF1(20289) }, { CDF1(27473) },
+ { CDF1(32487) }, { CDF1( 7654) }, { CDF1(19473) },
+ { CDF1(29984) }, { CDF1( 9961) }, { CDF1(30242) },
+ { CDF1(32117) },
+ }, {
+ { CDF1(31548) }, { CDF1( 1549) }, { CDF1(10130) },
+ { CDF1(16656) }, { CDF1(18591) }, { CDF1(26308) },
+ { CDF1(32537) }, { CDF1( 5403) }, { CDF1(18096) },
+ { CDF1(30003) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(29957) }, { CDF1( 5391) }, { CDF1(18039) },
+ { CDF1(23566) }, { CDF1(22431) }, { CDF1(25822) },
+ { CDF1(32197) }, { CDF1( 3778) }, { CDF1(15336) },
+ { CDF1(28981) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(17920) }, { CDF1( 1818) }, { CDF1( 7282) },
+ { CDF1(25273) }, { CDF1(10923) }, { CDF1(31554) },
+ { CDF1(32624) }, { CDF1( 1366) }, { CDF1(15628) },
+ { CDF1(30462) }, { CDF1( 146) }, { CDF1( 5132) },
+ { CDF1(31657) },
+ }, {
+ { CDF1( 6308) }, { CDF1( 117) }, { CDF1( 1638) },
+ { CDF1( 2161) }, { CDF1(16384) }, { CDF1(10923) },
+ { CDF1(30247) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 840, 1039, 1980, 4895) },
+ { CDF4( 370, 671, 1883, 4471) },
+ }, {
+ { CDF4( 3247, 4950, 9688, 14563) },
+ { CDF4( 1904, 3354, 7763, 14647) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 400, 520, 977, 2102, 6542) },
+ { CDF5( 210, 405, 1315, 3326, 7537) },
+ }, {
+ { CDF5( 2636, 4273, 7588, 11794, 20401) },
+ { CDF5( 1786, 3179, 6902, 11357, 19054) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 329, 498, 1101, 1784, 3265, 7758) },
+ { CDF6( 335, 730, 1459, 5494, 8755, 12997) },
+ }, {
+ { CDF6( 3505, 5304, 10086, 13814, 17684, 23370) },
+ { CDF6( 1563, 2700, 4876, 10911, 14706, 22480) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 219, 482, 1140, 2091, 3680, 6028, 12586) },
+ { CDF7( 371, 699, 1254, 4830, 9479, 12562, 17497) },
+ }, {
+ { CDF7( 5245, 7456, 12880, 15852, 20033, 23932, 27608) },
+ { CDF7( 2054, 3472, 5869, 14232, 18242, 20590, 26752) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 310, 584, 1887, 3589,
+ 6168, 8611, 11352, 15652) },
+ { CDF8( 998, 1850, 2998, 5604,
+ 17341, 19888, 22899, 25583) },
+ }, {
+ { CDF8( 2520, 3240, 5952, 8870,
+ 12577, 17558, 19954, 24168) },
+ { CDF8( 2203, 4130, 7435, 10739,
+ 20652, 23681, 25609, 27261) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 641, 983, 3707, 5430, 10234,
+ 14958, 18788, 23412, 26061) },
+ { CDF9( 5095, 6446, 9996, 13354, 16017,
+ 17986, 20919, 26129, 29140) },
+ }, .eob_bin_1024 = {
+ { CDF10( 393, 421, 751, 1623, 3160,
+ 6352, 13345, 18047, 22571, 25830) },
+ { CDF10( 1865, 1988, 2930, 4242, 10533,
+ 16538, 21354, 27255, 28546, 31784) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16961) },
+ { CDF1(17223) }, { CDF1( 7621) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19069) },
+ { CDF1(22525) }, { CDF1(13377) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20401) },
+ { CDF1(17025) }, { CDF1(12845) }, { CDF1(12873) },
+ { CDF1(14094) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20681) },
+ { CDF1(20701) }, { CDF1(15250) }, { CDF1(15017) },
+ { CDF1(14928) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23905) },
+ { CDF1(17194) }, { CDF1(16170) }, { CDF1(17695) },
+ { CDF1(13826) }, { CDF1(15810) }, { CDF1(12036) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23959) },
+ { CDF1(20799) }, { CDF1(19021) }, { CDF1(16203) },
+ { CDF1(17886) }, { CDF1(14144) }, { CDF1(12010) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(27399) },
+ { CDF1(16327) }, { CDF1(18071) }, { CDF1(19584) },
+ { CDF1(20721) }, { CDF1(18432) }, { CDF1(19560) },
+ { CDF1(10150) }, { CDF1( 8805) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(24932) },
+ { CDF1(20833) }, { CDF1(12027) }, { CDF1(16670) },
+ { CDF1(19914) }, { CDF1(15106) }, { CDF1(17662) },
+ { CDF1(13783) }, { CDF1(28756) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23406) },
+ { CDF1(21845) }, { CDF1(18432) }, { CDF1(16384) },
+ { CDF1(17096) }, { CDF1(12561) }, { CDF1(17320) },
+ { CDF1(22395) }, { CDF1(21370) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(17837, 29055) }, { CDF2(29600, 31446) },
+ { CDF2(30844, 31878) }, { CDF2(24926, 28948) },
+ }, {
+ { CDF2(21365, 30026) }, { CDF2(30512, 32423) },
+ { CDF2(31658, 32621) }, { CDF2(29630, 31881) },
+ },
+ }, {
+ {
+ { CDF2( 5717, 26477) }, { CDF2(30491, 31703) },
+ { CDF2(31550, 32158) }, { CDF2(29648, 31491) },
+ }, {
+ { CDF2(12608, 27820) }, { CDF2(30680, 32225) },
+ { CDF2(30809, 32335) }, { CDF2(31299, 32423) },
+ },
+ }, {
+ {
+ { CDF2( 1786, 12612) }, { CDF2(30663, 31625) },
+ { CDF2(32339, 32468) }, { CDF2(31148, 31833) },
+ }, {
+ { CDF2(18857, 23865) }, { CDF2(31428, 32428) },
+ { CDF2(31744, 32373) }, { CDF2(31775, 32526) },
+ },
+ }, {
+ {
+ { CDF2( 1787, 2532) }, { CDF2(30832, 31662) },
+ { CDF2(31824, 32682) }, { CDF2(32133, 32569) },
+ }, {
+ { CDF2(13751, 22235) }, { CDF2(32089, 32409) },
+ { CDF2(27084, 27920) }, { CDF2(29291, 32594) },
+ },
+ }, {
+ {
+ { CDF2( 1725, 3449) }, { CDF2(31102, 31935) },
+ { CDF2(32457, 32613) }, { CDF2(32412, 32649) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 4034, 8930, 12727) },
+ { CDF3(18082, 29741, 31877) },
+ { CDF3(12596, 26124, 30493) },
+ { CDF3( 9446, 21118, 27005) },
+ { CDF3( 6308, 15141, 21279) },
+ { CDF3( 2463, 6357, 9783) },
+ { CDF3(20667, 30546, 31929) },
+ { CDF3(13043, 26123, 30134) },
+ { CDF3( 8151, 18757, 24778) },
+ { CDF3( 5255, 12839, 18632) },
+ { CDF3( 2820, 7206, 11161) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(15736, 27553, 30604) },
+ { CDF3(11210, 23794, 28787) },
+ { CDF3( 5947, 13874, 19701) },
+ { CDF3( 4215, 9323, 13891) },
+ { CDF3( 2833, 6462, 10059) },
+ { CDF3(19605, 30393, 31582) },
+ { CDF3(13523, 26252, 30248) },
+ { CDF3( 8446, 18622, 24512) },
+ { CDF3( 3818, 10343, 15974) },
+ { CDF3( 1481, 4117, 6796) },
+ { CDF3(22649, 31302, 32190) },
+ { CDF3(14829, 27127, 30449) },
+ { CDF3( 8313, 17702, 23304) },
+ { CDF3( 3022, 8301, 12786) },
+ { CDF3( 1536, 4412, 7184) },
+ { CDF3(22354, 29774, 31372) },
+ { CDF3(14723, 25472, 29214) },
+ { CDF3( 6673, 13745, 18662) },
+ { CDF3( 2068, 5766, 9322) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 6302, 16444, 21761) },
+ { CDF3(23040, 31538, 32475) },
+ { CDF3(15196, 28452, 31496) },
+ { CDF3(10020, 22946, 28514) },
+ { CDF3( 6533, 16862, 23501) },
+ { CDF3( 3538, 9816, 15076) },
+ { CDF3(24444, 31875, 32525) },
+ { CDF3(15881, 28924, 31635) },
+ { CDF3( 9922, 22873, 28466) },
+ { CDF3( 6527, 16966, 23691) },
+ { CDF3( 4114, 11303, 17220) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(20201, 30770, 32209) },
+ { CDF3(14754, 28071, 31258) },
+ { CDF3( 8378, 20186, 26517) },
+ { CDF3( 5916, 15299, 21978) },
+ { CDF3( 4268, 11583, 17901) },
+ { CDF3(24361, 32025, 32581) },
+ { CDF3(18673, 30105, 31943) },
+ { CDF3(10196, 22244, 27576) },
+ { CDF3( 5495, 14349, 20417) },
+ { CDF3( 2676, 7415, 11498) },
+ { CDF3(24678, 31958, 32585) },
+ { CDF3(18629, 29906, 31831) },
+ { CDF3( 9364, 20724, 26315) },
+ { CDF3( 4641, 12318, 18094) },
+ { CDF3( 2758, 7387, 11579) },
+ { CDF3(25433, 31842, 32469) },
+ { CDF3(18795, 29289, 31411) },
+ { CDF3( 7644, 17584, 23592) },
+ { CDF3( 3408, 9014, 15047) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 4536, 10072, 14001) },
+ { CDF3(25459, 31416, 32206) },
+ { CDF3(16605, 28048, 30818) },
+ { CDF3(11008, 22857, 27719) },
+ { CDF3( 6915, 16268, 22315) },
+ { CDF3( 2625, 6812, 10537) },
+ { CDF3(24257, 31788, 32499) },
+ { CDF3(16880, 29454, 31879) },
+ { CDF3(11958, 25054, 29778) },
+ { CDF3( 7916, 18718, 25084) },
+ { CDF3( 3383, 8777, 13446) },
+ { CDF3(22720, 31603, 32393) },
+ { CDF3(14960, 28125, 31335) },
+ { CDF3( 9731, 22210, 27928) },
+ { CDF3( 6304, 15832, 22277) },
+ { CDF3( 2910, 7818, 12166) },
+ { CDF3(20375, 30627, 32131) },
+ { CDF3(13904, 27284, 30887) },
+ { CDF3( 9368, 21558, 27144) },
+ { CDF3( 5937, 14966, 21119) },
+ { CDF3( 2667, 7225, 11319) },
+ { CDF3(23970, 31470, 32378) },
+ { CDF3(17173, 29734, 32018) },
+ { CDF3(12795, 25441, 29965) },
+ { CDF3( 8981, 19680, 25893) },
+ { CDF3( 4728, 11372, 16902) },
+ { CDF3(24287, 31797, 32439) },
+ { CDF3(16703, 29145, 31696) },
+ { CDF3(10833, 23554, 28725) },
+ { CDF3( 6468, 16566, 23057) },
+ { CDF3( 2415, 6562, 10278) },
+ { CDF3(26610, 32395, 32659) },
+ { CDF3(18590, 30498, 32117) },
+ { CDF3(12420, 25756, 29950) },
+ { CDF3( 7639, 18746, 24710) },
+ { CDF3( 3001, 8086, 12347) },
+ { CDF3(25076, 32064, 32580) },
+ { CDF3(17946, 30128, 32028) },
+ { CDF3(12024, 24985, 29378) },
+ { CDF3( 7517, 18390, 24304) },
+ { CDF3( 3243, 8781, 13331) },
+ }, {
+ { CDF3( 6037, 16771, 21957) },
+ { CDF3(24774, 31704, 32426) },
+ { CDF3(16830, 28589, 31056) },
+ { CDF3(10602, 22828, 27760) },
+ { CDF3( 6733, 16829, 23071) },
+ { CDF3( 3250, 8914, 13556) },
+ { CDF3(25582, 32220, 32668) },
+ { CDF3(18659, 30342, 32223) },
+ { CDF3(12546, 26149, 30515) },
+ { CDF3( 8420, 20451, 26801) },
+ { CDF3( 4636, 12420, 18344) },
+ { CDF3(27581, 32362, 32639) },
+ { CDF3(18987, 30083, 31978) },
+ { CDF3(11327, 24248, 29084) },
+ { CDF3( 7264, 17719, 24120) },
+ { CDF3( 3995, 10768, 16169) },
+ { CDF3(25893, 31831, 32487) },
+ { CDF3(16577, 28587, 31379) },
+ { CDF3(10189, 22748, 28182) },
+ { CDF3( 6832, 17094, 23556) },
+ { CDF3( 3708, 10110, 15334) },
+ { CDF3(25904, 32282, 32656) },
+ { CDF3(19721, 30792, 32276) },
+ { CDF3(12819, 26243, 30411) },
+ { CDF3( 8572, 20614, 26891) },
+ { CDF3( 5364, 14059, 20467) },
+ { CDF3(26580, 32438, 32677) },
+ { CDF3(20852, 31225, 32340) },
+ { CDF3(12435, 25700, 29967) },
+ { CDF3( 8691, 20825, 26976) },
+ { CDF3( 4446, 12209, 17269) },
+ { CDF3(27350, 32429, 32696) },
+ { CDF3(21372, 30977, 32272) },
+ { CDF3(12673, 25270, 29853) },
+ { CDF3( 9208, 20925, 26640) },
+ { CDF3( 5018, 13351, 18732) },
+ { CDF3(27351, 32479, 32713) },
+ { CDF3(21398, 31209, 32387) },
+ { CDF3(12162, 25047, 29842) },
+ { CDF3( 7896, 18691, 25319) },
+ { CDF3( 4670, 12882, 18881) },
+ },
+ }, {
+ {
+ { CDF3( 5487, 10460, 13708) },
+ { CDF3(21597, 28303, 30674) },
+ { CDF3(11037, 21953, 26476) },
+ { CDF3( 8147, 17962, 22952) },
+ { CDF3( 5242, 13061, 18532) },
+ { CDF3( 1889, 5208, 8182) },
+ { CDF3(26774, 32133, 32590) },
+ { CDF3(17844, 29564, 31767) },
+ { CDF3(11690, 24438, 29171) },
+ { CDF3( 7542, 18215, 24459) },
+ { CDF3( 2993, 8050, 12319) },
+ { CDF3(28023, 32328, 32591) },
+ { CDF3(18651, 30126, 31954) },
+ { CDF3(12164, 25146, 29589) },
+ { CDF3( 7762, 18530, 24771) },
+ { CDF3( 3492, 9183, 13920) },
+ { CDF3(27591, 32008, 32491) },
+ { CDF3(17149, 28853, 31510) },
+ { CDF3(11485, 24003, 28860) },
+ { CDF3( 7697, 18086, 24210) },
+ { CDF3( 3075, 7999, 12218) },
+ { CDF3(28268, 32482, 32654) },
+ { CDF3(19631, 31051, 32404) },
+ { CDF3(13860, 27260, 31020) },
+ { CDF3( 9605, 21613, 27594) },
+ { CDF3( 4876, 12162, 17908) },
+ { CDF3(27248, 32316, 32576) },
+ { CDF3(18955, 30457, 32075) },
+ { CDF3(11824, 23997, 28795) },
+ { CDF3( 7346, 18196, 24647) },
+ { CDF3( 3403, 9247, 14111) },
+ { CDF3(29711, 32655, 32735) },
+ { CDF3(21169, 31394, 32417) },
+ { CDF3(13487, 27198, 30957) },
+ { CDF3( 8828, 21683, 27614) },
+ { CDF3( 4270, 11451, 17038) },
+ { CDF3(28708, 32578, 32731) },
+ { CDF3(20120, 31241, 32482) },
+ { CDF3(13692, 27550, 31321) },
+ { CDF3( 9418, 22514, 28439) },
+ { CDF3( 4999, 13283, 19462) },
+ }, {
+ { CDF3( 5673, 14302, 19711) },
+ { CDF3(26251, 30701, 31834) },
+ { CDF3(12782, 23783, 27803) },
+ { CDF3( 9127, 20657, 25808) },
+ { CDF3( 6368, 16208, 21462) },
+ { CDF3( 2465, 7177, 10822) },
+ { CDF3(29961, 32563, 32719) },
+ { CDF3(18318, 29891, 31949) },
+ { CDF3(11361, 24514, 29357) },
+ { CDF3( 7900, 19603, 25607) },
+ { CDF3( 4002, 10590, 15546) },
+ { CDF3(29637, 32310, 32595) },
+ { CDF3(18296, 29913, 31809) },
+ { CDF3(10144, 21515, 26871) },
+ { CDF3( 5358, 14322, 20394) },
+ { CDF3( 3067, 8362, 13346) },
+ { CDF3(28652, 32470, 32676) },
+ { CDF3(17538, 30771, 32209) },
+ { CDF3(13924, 26882, 30494) },
+ { CDF3(10496, 22837, 27869) },
+ { CDF3( 7236, 16396, 21621) },
+ { CDF3(30743, 32687, 32746) },
+ { CDF3(23006, 31676, 32489) },
+ { CDF3(14494, 27828, 31120) },
+ { CDF3(10174, 22801, 28352) },
+ { CDF3( 6242, 15281, 21043) },
+ { CDF3(25817, 32243, 32720) },
+ { CDF3(18618, 31367, 32325) },
+ { CDF3(13997, 28318, 31878) },
+ { CDF3(12255, 26534, 31383) },
+ { CDF3( 9561, 21588, 28450) },
+ { CDF3(28188, 32635, 32724) },
+ { CDF3(22060, 32365, 32728) },
+ { CDF3(18102, 30690, 32528) },
+ { CDF3(14196, 28864, 31999) },
+ { CDF3(12262, 25792, 30865) },
+ { CDF3(24176, 32109, 32628) },
+ { CDF3(18280, 29681, 31963) },
+ { CDF3(10205, 23703, 29664) },
+ { CDF3( 7889, 20025, 27676) },
+ { CDF3( 6060, 16743, 23970) },
+ },
+ }, {
+ {
+ { CDF3( 5141, 7096, 8260) },
+ { CDF3(27186, 29022, 29789) },
+ { CDF3( 6668, 12568, 15682) },
+ { CDF3( 2172, 6181, 8638) },
+ { CDF3( 1126, 3379, 4531) },
+ { CDF3( 443, 1361, 2254) },
+ { CDF3(26083, 31153, 32436) },
+ { CDF3(13486, 24603, 28483) },
+ { CDF3( 6508, 14840, 19910) },
+ { CDF3( 3386, 8800, 13286) },
+ { CDF3( 1530, 4322, 7054) },
+ { CDF3(29639, 32080, 32548) },
+ { CDF3(15897, 27552, 30290) },
+ { CDF3( 8588, 20047, 25383) },
+ { CDF3( 4889, 13339, 19269) },
+ { CDF3( 2240, 6871, 10498) },
+ { CDF3(28165, 32197, 32517) },
+ { CDF3(20735, 30427, 31568) },
+ { CDF3(14325, 24671, 27692) },
+ { CDF3( 5119, 12554, 17805) },
+ { CDF3( 1810, 5441, 8261) },
+ { CDF3(31212, 32724, 32748) },
+ { CDF3(23352, 31766, 32545) },
+ { CDF3(14669, 27570, 31059) },
+ { CDF3( 8492, 20894, 27272) },
+ { CDF3( 3644, 10194, 15204) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 2461, 7013, 9371) },
+ { CDF3(24749, 29600, 30986) },
+ { CDF3( 9466, 19037, 22417) },
+ { CDF3( 3584, 9280, 14400) },
+ { CDF3( 1505, 3929, 5433) },
+ { CDF3( 677, 1500, 2736) },
+ { CDF3(23987, 30702, 32117) },
+ { CDF3(13554, 24571, 29263) },
+ { CDF3( 6211, 14556, 21155) },
+ { CDF3( 3135, 10972, 15625) },
+ { CDF3( 2435, 7127, 11427) },
+ { CDF3(31300, 32532, 32550) },
+ { CDF3(14757, 30365, 31954) },
+ { CDF3( 4405, 11612, 18553) },
+ { CDF3( 580, 4132, 7322) },
+ { CDF3( 1695, 10169, 14124) },
+ { CDF3(30008, 32282, 32591) },
+ { CDF3(19244, 30108, 31748) },
+ { CDF3(11180, 24158, 29555) },
+ { CDF3( 5650, 14972, 19209) },
+ { CDF3( 2114, 5109, 8456) },
+ { CDF3(31856, 32716, 32748) },
+ { CDF3(23012, 31664, 32572) },
+ { CDF3(13694, 26656, 30636) },
+ { CDF3( 8142, 19508, 26093) },
+ { CDF3( 4253, 10955, 16724) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 601, 983, 1311) },
+ { CDF3(18725, 23406, 28087) },
+ { CDF3( 5461, 8192, 10923) },
+ { CDF3( 3781, 15124, 21425) },
+ { CDF3( 2587, 7761, 12072) },
+ { CDF3( 106, 458, 810) },
+ { CDF3(22282, 29710, 31894) },
+ { CDF3( 8508, 20926, 25984) },
+ { CDF3( 3726, 12713, 18083) },
+ { CDF3( 1620, 7112, 10893) },
+ { CDF3( 729, 2236, 3495) },
+ { CDF3(30163, 32474, 32684) },
+ { CDF3(18304, 30464, 32000) },
+ { CDF3(11443, 26526, 29647) },
+ { CDF3( 6007, 15292, 21299) },
+ { CDF3( 2234, 6703, 8937) },
+ { CDF3(30954, 32177, 32571) },
+ { CDF3(17363, 29562, 31076) },
+ { CDF3( 9686, 22464, 27410) },
+ { CDF3( 8192, 16384, 21390) },
+ { CDF3( 1755, 8046, 11264) },
+ { CDF3(31168, 32734, 32748) },
+ { CDF3(22486, 31441, 32471) },
+ { CDF3(12833, 25627, 29738) },
+ { CDF3( 6980, 17379, 23122) },
+ { CDF3( 3111, 8887, 13479) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(14298, 20718, 24174) },
+ { CDF3(12536, 19601, 23789) },
+ { CDF3( 8712, 15051, 19503) },
+ { CDF3( 6170, 11327, 15434) },
+ { CDF3( 4742, 8926, 12538) },
+ { CDF3( 3803, 7317, 10546) },
+ { CDF3( 1696, 3317, 4871) },
+ { CDF3(14392, 19951, 22756) },
+ { CDF3(15978, 23218, 26818) },
+ { CDF3(12187, 19474, 23889) },
+ { CDF3( 9176, 15640, 20259) },
+ { CDF3( 7068, 12655, 17028) },
+ { CDF3( 5656, 10442, 14472) },
+ { CDF3( 2580, 4992, 7244) },
+ { CDF3(12136, 18049, 21426) },
+ { CDF3(13784, 20721, 24481) },
+ { CDF3(10836, 17621, 21900) },
+ { CDF3( 8372, 14444, 18847) },
+ { CDF3( 6523, 11779, 16000) },
+ { CDF3( 5337, 9898, 13760) },
+ { CDF3( 3034, 5860, 8462) },
+ }, {
+ { CDF3(15967, 22905, 26286) },
+ { CDF3(13534, 20654, 24579) },
+ { CDF3( 9504, 16092, 20535) },
+ { CDF3( 6975, 12568, 16903) },
+ { CDF3( 5364, 10091, 14020) },
+ { CDF3( 4357, 8370, 11857) },
+ { CDF3( 2506, 4934, 7218) },
+ { CDF3(23032, 28815, 30936) },
+ { CDF3(19540, 26704, 29719) },
+ { CDF3(15158, 22969, 27097) },
+ { CDF3(11408, 18865, 23650) },
+ { CDF3( 8885, 15448, 20250) },
+ { CDF3( 7108, 12853, 17416) },
+ { CDF3( 4231, 8041, 11480) },
+ { CDF3(19823, 26490, 29156) },
+ { CDF3(18890, 25929, 28932) },
+ { CDF3(15660, 23491, 27433) },
+ { CDF3(12147, 19776, 24488) },
+ { CDF3( 9728, 16774, 21649) },
+ { CDF3( 7919, 14277, 19066) },
+ { CDF3( 5440, 10170, 14185) },
+ },
+ }, {
+ {
+ { CDF3(14406, 20862, 24414) },
+ { CDF3(11824, 18907, 23109) },
+ { CDF3( 8257, 14393, 18803) },
+ { CDF3( 5860, 10747, 14778) },
+ { CDF3( 4475, 8486, 11984) },
+ { CDF3( 3606, 6954, 10043) },
+ { CDF3( 1736, 3410, 5048) },
+ { CDF3(14430, 20046, 22882) },
+ { CDF3(15593, 22899, 26709) },
+ { CDF3(12102, 19368, 23811) },
+ { CDF3( 9059, 15584, 20262) },
+ { CDF3( 6999, 12603, 17048) },
+ { CDF3( 5684, 10497, 14553) },
+ { CDF3( 2822, 5438, 7862) },
+ { CDF3(15785, 21585, 24359) },
+ { CDF3(18347, 25229, 28266) },
+ { CDF3(14974, 22487, 26389) },
+ { CDF3(11423, 18681, 23271) },
+ { CDF3( 8863, 15350, 20008) },
+ { CDF3( 7153, 12852, 17278) },
+ { CDF3( 3707, 7036, 9982) },
+ }, {
+ { CDF3(15460, 21696, 25469) },
+ { CDF3(12170, 19249, 23191) },
+ { CDF3( 8723, 15027, 19332) },
+ { CDF3( 6428, 11704, 15874) },
+ { CDF3( 4922, 9292, 13052) },
+ { CDF3( 4139, 7695, 11010) },
+ { CDF3( 2291, 4508, 6598) },
+ { CDF3(19856, 26920, 29828) },
+ { CDF3(17923, 25289, 28792) },
+ { CDF3(14278, 21968, 26297) },
+ { CDF3(10910, 18136, 22950) },
+ { CDF3( 8423, 14815, 19627) },
+ { CDF3( 6771, 12283, 16774) },
+ { CDF3( 4074, 7750, 11081) },
+ { CDF3(19852, 26074, 28672) },
+ { CDF3(19371, 26110, 28989) },
+ { CDF3(16265, 23873, 27663) },
+ { CDF3(12758, 20378, 24952) },
+ { CDF3(10095, 17098, 21961) },
+ { CDF3( 8250, 14628, 19451) },
+ { CDF3( 5205, 9745, 13622) },
+ },
+ }, {
+ {
+ { CDF3(10563, 16233, 19763) },
+ { CDF3( 9794, 16022, 19804) },
+ { CDF3( 6750, 11945, 15759) },
+ { CDF3( 4963, 9186, 12752) },
+ { CDF3( 3845, 7435, 10627) },
+ { CDF3( 3051, 6085, 8834) },
+ { CDF3( 1311, 2596, 3830) },
+ { CDF3(11246, 16404, 19689) },
+ { CDF3(12315, 18911, 22731) },
+ { CDF3(10557, 17095, 21289) },
+ { CDF3( 8136, 14006, 18249) },
+ { CDF3( 6348, 11474, 15565) },
+ { CDF3( 5196, 9655, 13400) },
+ { CDF3( 2349, 4526, 6587) },
+ { CDF3(13337, 18730, 21569) },
+ { CDF3(19306, 26071, 28882) },
+ { CDF3(15952, 23540, 27254) },
+ { CDF3(12409, 19934, 24430) },
+ { CDF3( 9760, 16706, 21389) },
+ { CDF3( 8004, 14220, 18818) },
+ { CDF3( 4138, 7794, 10961) },
+ }, {
+ { CDF3(10870, 16684, 20949) },
+ { CDF3( 9664, 15230, 18680) },
+ { CDF3( 6886, 12109, 15408) },
+ { CDF3( 4825, 8900, 12305) },
+ { CDF3( 3630, 7162, 10314) },
+ { CDF3( 3036, 6429, 9387) },
+ { CDF3( 1671, 3296, 4940) },
+ { CDF3(13819, 19159, 23026) },
+ { CDF3(11984, 19108, 23120) },
+ { CDF3(10690, 17210, 21663) },
+ { CDF3( 7984, 14154, 18333) },
+ { CDF3( 6868, 12294, 16124) },
+ { CDF3( 5274, 8994, 12868) },
+ { CDF3( 2988, 5771, 8424) },
+ { CDF3(19736, 26647, 29141) },
+ { CDF3(18933, 26070, 28984) },
+ { CDF3(15779, 23048, 27200) },
+ { CDF3(12638, 20061, 24532) },
+ { CDF3(10692, 17545, 22220) },
+ { CDF3( 9217, 15251, 20054) },
+ { CDF3( 5078, 9284, 12594) },
+ },
+ }, {
+ {
+ { CDF3( 2331, 3662, 5244) },
+ { CDF3( 2891, 4771, 6145) },
+ { CDF3( 4598, 7623, 9729) },
+ { CDF3( 3520, 6845, 9199) },
+ { CDF3( 3417, 6119, 9324) },
+ { CDF3( 2601, 5412, 7385) },
+ { CDF3( 600, 1173, 1744) },
+ { CDF3( 7672, 13286, 17469) },
+ { CDF3( 4232, 7792, 10793) },
+ { CDF3( 2915, 5317, 7397) },
+ { CDF3( 2318, 4356, 6152) },
+ { CDF3( 2127, 4000, 5554) },
+ { CDF3( 1850, 3478, 5275) },
+ { CDF3( 977, 1933, 2843) },
+ { CDF3(18280, 24387, 27989) },
+ { CDF3(15852, 22671, 26185) },
+ { CDF3(13845, 20951, 24789) },
+ { CDF3(11055, 17966, 22129) },
+ { CDF3( 9138, 15422, 19801) },
+ { CDF3( 7454, 13145, 17456) },
+ { CDF3( 3370, 6393, 9013) },
+ }, {
+ { CDF3( 5842, 9229, 10838) },
+ { CDF3( 2313, 3491, 4276) },
+ { CDF3( 2998, 6104, 7496) },
+ { CDF3( 2420, 7447, 9868) },
+ { CDF3( 3034, 8495, 10923) },
+ { CDF3( 4076, 8937, 10975) },
+ { CDF3( 1086, 2370, 3299) },
+ { CDF3( 9714, 17254, 20444) },
+ { CDF3( 8543, 13698, 17123) },
+ { CDF3( 4918, 9007, 11910) },
+ { CDF3( 4129, 7532, 10553) },
+ { CDF3( 2364, 5533, 8058) },
+ { CDF3( 1834, 3546, 5563) },
+ { CDF3( 1473, 2908, 4133) },
+ { CDF3(15405, 21193, 25619) },
+ { CDF3(15691, 21952, 26561) },
+ { CDF3(12962, 19194, 24165) },
+ { CDF3(10272, 17855, 22129) },
+ { CDF3( 8588, 15270, 20718) },
+ { CDF3( 8682, 14669, 19500) },
+ { CDF3( 4870, 9636, 13205) },
+ },
+ },
+ },
+ }, [1] = {
+ .skip = {
+ {
+ { CDF1(30371) }, { CDF1( 7570) }, { CDF1(13155) },
+ { CDF1(20751) }, { CDF1(20969) }, { CDF1(27067) },
+ { CDF1(32013) }, { CDF1( 5495) }, { CDF1(17942) },
+ { CDF1(28280) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31782) }, { CDF1( 1836) }, { CDF1(10689) },
+ { CDF1(17604) }, { CDF1(21622) }, { CDF1(27518) },
+ { CDF1(32399) }, { CDF1( 4419) }, { CDF1(16294) },
+ { CDF1(28345) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31901) }, { CDF1(10311) }, { CDF1(18047) },
+ { CDF1(24806) }, { CDF1(23288) }, { CDF1(27914) },
+ { CDF1(32296) }, { CDF1( 4215) }, { CDF1(15756) },
+ { CDF1(28341) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(26726) }, { CDF1( 1045) }, { CDF1(11703) },
+ { CDF1(20590) }, { CDF1(18554) }, { CDF1(25970) },
+ { CDF1(31938) }, { CDF1( 5583) }, { CDF1(21313) },
+ { CDF1(29390) }, { CDF1( 641) }, { CDF1(22265) },
+ { CDF1(31452) },
+ }, {
+ { CDF1(26584) }, { CDF1( 188) }, { CDF1( 8847) },
+ { CDF1(24519) }, { CDF1(22938) }, { CDF1(30583) },
+ { CDF1(32608) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 2125, 2551, 5165, 8946) },
+ { CDF4( 513, 765, 1859, 6339) },
+ }, {
+ { CDF4( 7637, 9498, 14259, 19108) },
+ { CDF4( 2497, 4096, 8866, 16993) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 989, 1249, 2019, 4151, 10785) },
+ { CDF5( 313, 441, 1099, 2917, 8562) },
+ }, {
+ { CDF5( 8394, 10352, 13932, 18855, 26014) },
+ { CDF5( 2578, 4124, 8181, 13670, 24234) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 1260, 1446, 2253, 3712, 6652, 13369) },
+ { CDF6( 401, 605, 1029, 2563, 5845, 12626) },
+ }, {
+ { CDF6( 8609, 10612, 14624, 18714, 22614, 29024) },
+ { CDF6( 1923, 3127, 5867, 9703, 14277, 27100) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 685, 933, 1488, 2714, 4766, 8562, 19254) },
+ { CDF7( 217, 352, 618, 2303, 5261, 9969, 17472) },
+ }, {
+ { CDF7( 8045, 11200, 15497, 19595, 23948, 27408, 30938) },
+ { CDF7( 2310, 4160, 7471, 14997, 17931, 20768, 30240) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 1448, 2109, 4151, 6263,
+ 9329, 13260, 17944, 23300) },
+ { CDF8( 399, 1019, 1749, 3038,
+ 10444, 15546, 22739, 27294) },
+ }, {
+ { CDF8( 6402, 8148, 12623, 15072,
+ 18728, 22847, 26447, 29377) },
+ { CDF8( 1674, 3252, 5734, 10159,
+ 22397, 23802, 24821, 30940) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 1230, 2278, 5035, 7776, 11871,
+ 15346, 19590, 24584, 28749) },
+ { CDF9( 7265, 9979, 15819, 19250, 21780,
+ 23846, 26478, 28396, 31811) },
+ }, .eob_bin_1024 = {
+ { CDF10( 696, 948, 3145, 5702, 9706,
+ 13217, 17851, 21856, 25692, 28034) },
+ { CDF10( 2672, 3591, 9330, 17084, 22725,
+ 24284, 26527, 28027, 28377, 30876) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(17471) },
+ { CDF1(20223) }, { CDF1(11357) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20335) },
+ { CDF1(21667) }, { CDF1(14818) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20430) },
+ { CDF1(20662) }, { CDF1(15367) }, { CDF1(16970) },
+ { CDF1(14657) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22117) },
+ { CDF1(22028) }, { CDF1(18650) }, { CDF1(16042) },
+ { CDF1(15885) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22409) },
+ { CDF1(21012) }, { CDF1(15650) }, { CDF1(17395) },
+ { CDF1(15469) }, { CDF1(20205) }, { CDF1(19511) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(24220) },
+ { CDF1(22480) }, { CDF1(17737) }, { CDF1(18916) },
+ { CDF1(19268) }, { CDF1(18412) }, { CDF1(18844) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(25991) },
+ { CDF1(20314) }, { CDF1(17731) }, { CDF1(19678) },
+ { CDF1(18649) }, { CDF1(17307) }, { CDF1(21798) },
+ { CDF1(17549) }, { CDF1(15630) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26585) },
+ { CDF1(21469) }, { CDF1(20432) }, { CDF1(17735) },
+ { CDF1(19280) }, { CDF1(15235) }, { CDF1(20297) },
+ { CDF1(22471) }, { CDF1(28997) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26605) },
+ { CDF1(11304) }, { CDF1(16726) }, { CDF1(16560) },
+ { CDF1(20866) }, { CDF1(23524) }, { CDF1(19878) },
+ { CDF1(13469) }, { CDF1(23084) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(17560, 29888) }, { CDF2(29671, 31549) },
+ { CDF2(31007, 32056) }, { CDF2(27286, 30006) },
+ }, {
+ { CDF2(26594, 31212) }, { CDF2(31208, 32582) },
+ { CDF2(31835, 32637) }, { CDF2(30595, 32206) },
+ },
+ }, {
+ {
+ { CDF2(15239, 29932) }, { CDF2(31315, 32095) },
+ { CDF2(32130, 32434) }, { CDF2(30864, 31996) },
+ }, {
+ { CDF2(26279, 30968) }, { CDF2(31142, 32495) },
+ { CDF2(31713, 32540) }, { CDF2(31929, 32594) },
+ },
+ }, {
+ {
+ { CDF2( 2644, 25198) }, { CDF2(32038, 32451) },
+ { CDF2(32639, 32695) }, { CDF2(32166, 32518) },
+ }, {
+ { CDF2(17187, 27668) }, { CDF2(31714, 32550) },
+ { CDF2(32283, 32678) }, { CDF2(31930, 32563) },
+ },
+ }, {
+ {
+ { CDF2( 1044, 2257) }, { CDF2(30755, 31923) },
+ { CDF2(32208, 32693) }, { CDF2(32244, 32615) },
+ }, {
+ { CDF2(21317, 26207) }, { CDF2(29133, 30868) },
+ { CDF2(29311, 31231) }, { CDF2(29657, 31087) },
+ },
+ }, {
+ {
+ { CDF2( 478, 1834) }, { CDF2(31005, 31987) },
+ { CDF2(32317, 32724) }, { CDF2(30865, 32648) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 6041, 11854, 15927) },
+ { CDF3(20326, 30905, 32251) },
+ { CDF3(14164, 26831, 30725) },
+ { CDF3( 9760, 20647, 26585) },
+ { CDF3( 6416, 14953, 21219) },
+ { CDF3( 2966, 7151, 10891) },
+ { CDF3(23567, 31374, 32254) },
+ { CDF3(14978, 27416, 30946) },
+ { CDF3( 9434, 20225, 26254) },
+ { CDF3( 6658, 14558, 20535) },
+ { CDF3( 3916, 8677, 12989) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(18088, 29545, 31587) },
+ { CDF3(13062, 25843, 30073) },
+ { CDF3( 8940, 16827, 22251) },
+ { CDF3( 7654, 13220, 17973) },
+ { CDF3( 5733, 10316, 14456) },
+ { CDF3(22879, 31388, 32114) },
+ { CDF3(15215, 27993, 30955) },
+ { CDF3( 9397, 19445, 24978) },
+ { CDF3( 3442, 9813, 15344) },
+ { CDF3( 1368, 3936, 6532) },
+ { CDF3(25494, 32033, 32406) },
+ { CDF3(16772, 27963, 30718) },
+ { CDF3( 9419, 18165, 23260) },
+ { CDF3( 2677, 7501, 11797) },
+ { CDF3( 1516, 4344, 7170) },
+ { CDF3(26556, 31454, 32101) },
+ { CDF3(17128, 27035, 30108) },
+ { CDF3( 8324, 15344, 20249) },
+ { CDF3( 1903, 5696, 9469) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8455, 19003, 24368) },
+ { CDF3(23563, 32021, 32604) },
+ { CDF3(16237, 29446, 31935) },
+ { CDF3(10724, 23999, 29358) },
+ { CDF3( 6725, 17528, 24416) },
+ { CDF3( 3927, 10927, 16825) },
+ { CDF3(26313, 32288, 32634) },
+ { CDF3(17430, 30095, 32095) },
+ { CDF3(11116, 24606, 29679) },
+ { CDF3( 7195, 18384, 25269) },
+ { CDF3( 4726, 12852, 19315) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(22822, 31648, 32483) },
+ { CDF3(16724, 29633, 31929) },
+ { CDF3(10261, 23033, 28725) },
+ { CDF3( 7029, 17840, 24528) },
+ { CDF3( 4867, 13886, 21502) },
+ { CDF3(25298, 31892, 32491) },
+ { CDF3(17809, 29330, 31512) },
+ { CDF3( 9668, 21329, 26579) },
+ { CDF3( 4774, 12956, 18976) },
+ { CDF3( 2322, 7030, 11540) },
+ { CDF3(25472, 31920, 32543) },
+ { CDF3(17957, 29387, 31632) },
+ { CDF3( 9196, 20593, 26400) },
+ { CDF3( 4680, 12705, 19202) },
+ { CDF3( 2917, 8456, 13436) },
+ { CDF3(26471, 32059, 32574) },
+ { CDF3(18458, 29783, 31909) },
+ { CDF3( 8400, 19464, 25956) },
+ { CDF3( 3812, 10973, 17206) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 6779, 13743, 17678) },
+ { CDF3(24806, 31797, 32457) },
+ { CDF3(17616, 29047, 31372) },
+ { CDF3(11063, 23175, 28003) },
+ { CDF3( 6521, 16110, 22324) },
+ { CDF3( 2764, 7504, 11654) },
+ { CDF3(25266, 32367, 32637) },
+ { CDF3(19054, 30553, 32175) },
+ { CDF3(12139, 25212, 29807) },
+ { CDF3( 7311, 18162, 24704) },
+ { CDF3( 3397, 9164, 14074) },
+ { CDF3(25988, 32208, 32522) },
+ { CDF3(16253, 28912, 31526) },
+ { CDF3( 9151, 21387, 27372) },
+ { CDF3( 5688, 14915, 21496) },
+ { CDF3( 2717, 7627, 12004) },
+ { CDF3(23144, 31855, 32443) },
+ { CDF3(16070, 28491, 31325) },
+ { CDF3( 8702, 20467, 26517) },
+ { CDF3( 5243, 13956, 20367) },
+ { CDF3( 2621, 7335, 11567) },
+ { CDF3(26636, 32340, 32630) },
+ { CDF3(19990, 31050, 32341) },
+ { CDF3(13243, 26105, 30315) },
+ { CDF3( 8588, 19521, 25918) },
+ { CDF3( 4717, 11585, 17304) },
+ { CDF3(25844, 32292, 32582) },
+ { CDF3(19090, 30635, 32097) },
+ { CDF3(11963, 24546, 28939) },
+ { CDF3( 6218, 16087, 22354) },
+ { CDF3( 2340, 6608, 10426) },
+ { CDF3(28046, 32576, 32694) },
+ { CDF3(21178, 31313, 32296) },
+ { CDF3(13486, 26184, 29870) },
+ { CDF3( 7149, 17871, 23723) },
+ { CDF3( 2833, 7958, 12259) },
+ { CDF3(27710, 32528, 32686) },
+ { CDF3(20674, 31076, 32268) },
+ { CDF3(12413, 24955, 29243) },
+ { CDF3( 6676, 16927, 23097) },
+ { CDF3( 2966, 8333, 12919) },
+ }, {
+ { CDF3( 8639, 19339, 24429) },
+ { CDF3(24404, 31837, 32525) },
+ { CDF3(16997, 29425, 31784) },
+ { CDF3(11253, 24234, 29149) },
+ { CDF3( 6751, 17394, 24028) },
+ { CDF3( 3490, 9830, 15191) },
+ { CDF3(26283, 32471, 32714) },
+ { CDF3(19599, 31168, 32442) },
+ { CDF3(13146, 26954, 30893) },
+ { CDF3( 8214, 20588, 26890) },
+ { CDF3( 4699, 13081, 19300) },
+ { CDF3(28212, 32458, 32669) },
+ { CDF3(18594, 30316, 32100) },
+ { CDF3(11219, 24408, 29234) },
+ { CDF3( 6865, 17656, 24149) },
+ { CDF3( 3678, 10362, 16006) },
+ { CDF3(25825, 32136, 32616) },
+ { CDF3(17313, 29853, 32021) },
+ { CDF3(11197, 24471, 29472) },
+ { CDF3( 6947, 17781, 24405) },
+ { CDF3( 3768, 10660, 16261) },
+ { CDF3(27352, 32500, 32706) },
+ { CDF3(20850, 31468, 32469) },
+ { CDF3(14021, 27707, 31133) },
+ { CDF3( 8964, 21748, 27838) },
+ { CDF3( 5437, 14665, 21187) },
+ { CDF3(26304, 32492, 32698) },
+ { CDF3(20409, 31380, 32385) },
+ { CDF3(13682, 27222, 30632) },
+ { CDF3( 8974, 21236, 26685) },
+ { CDF3( 4234, 11665, 16934) },
+ { CDF3(26273, 32357, 32711) },
+ { CDF3(20672, 31242, 32441) },
+ { CDF3(14172, 27254, 30902) },
+ { CDF3( 9870, 21898, 27275) },
+ { CDF3( 5164, 13506, 19270) },
+ { CDF3(26725, 32459, 32728) },
+ { CDF3(20991, 31442, 32527) },
+ { CDF3(13071, 26434, 30811) },
+ { CDF3( 8184, 20090, 26742) },
+ { CDF3( 4803, 13255, 19895) },
+ },
+ }, {
+ {
+ { CDF3( 7555, 14942, 18501) },
+ { CDF3(24410, 31178, 32287) },
+ { CDF3(14394, 26738, 30253) },
+ { CDF3( 8413, 19554, 25195) },
+ { CDF3( 4766, 12924, 18785) },
+ { CDF3( 2029, 5806, 9207) },
+ { CDF3(26776, 32364, 32663) },
+ { CDF3(18732, 29967, 31931) },
+ { CDF3(11005, 23786, 28852) },
+ { CDF3( 6466, 16909, 23510) },
+ { CDF3( 3044, 8638, 13419) },
+ { CDF3(29208, 32582, 32704) },
+ { CDF3(20068, 30857, 32208) },
+ { CDF3(12003, 25085, 29595) },
+ { CDF3( 6947, 17750, 24189) },
+ { CDF3( 3245, 9103, 14007) },
+ { CDF3(27359, 32465, 32669) },
+ { CDF3(19421, 30614, 32174) },
+ { CDF3(11915, 25010, 29579) },
+ { CDF3( 6950, 17676, 24074) },
+ { CDF3( 3007, 8473, 13096) },
+ { CDF3(29002, 32676, 32735) },
+ { CDF3(22102, 31849, 32576) },
+ { CDF3(14408, 28009, 31405) },
+ { CDF3( 9027, 21679, 27931) },
+ { CDF3( 4694, 12678, 18748) },
+ { CDF3(28216, 32528, 32682) },
+ { CDF3(20849, 31264, 32318) },
+ { CDF3(12756, 25815, 29751) },
+ { CDF3( 7565, 18801, 24923) },
+ { CDF3( 3509, 9533, 14477) },
+ { CDF3(30133, 32687, 32739) },
+ { CDF3(23063, 31910, 32515) },
+ { CDF3(14588, 28051, 31132) },
+ { CDF3( 9085, 21649, 27457) },
+ { CDF3( 4261, 11654, 17264) },
+ { CDF3(29518, 32691, 32748) },
+ { CDF3(22451, 31959, 32613) },
+ { CDF3(14864, 28722, 31700) },
+ { CDF3( 9695, 22964, 28716) },
+ { CDF3( 4932, 13358, 19502) },
+ }, {
+ { CDF3( 6465, 16958, 21688) },
+ { CDF3(25199, 31514, 32360) },
+ { CDF3(14774, 27149, 30607) },
+ { CDF3( 9257, 21438, 26972) },
+ { CDF3( 5723, 15183, 21882) },
+ { CDF3( 3150, 8879, 13731) },
+ { CDF3(26989, 32262, 32682) },
+ { CDF3(17396, 29937, 32085) },
+ { CDF3(11387, 24901, 29784) },
+ { CDF3( 7289, 18821, 25548) },
+ { CDF3( 3734, 10577, 16086) },
+ { CDF3(29728, 32501, 32695) },
+ { CDF3(17431, 29701, 31903) },
+ { CDF3( 9921, 22826, 28300) },
+ { CDF3( 5896, 15434, 22068) },
+ { CDF3( 3430, 9646, 14757) },
+ { CDF3(28614, 32511, 32705) },
+ { CDF3(19364, 30638, 32263) },
+ { CDF3(13129, 26254, 30402) },
+ { CDF3( 8754, 20484, 26440) },
+ { CDF3( 4378, 11607, 17110) },
+ { CDF3(30292, 32671, 32744) },
+ { CDF3(21780, 31603, 32501) },
+ { CDF3(14314, 27829, 31291) },
+ { CDF3( 9611, 22327, 28263) },
+ { CDF3( 4890, 13087, 19065) },
+ { CDF3(25862, 32567, 32733) },
+ { CDF3(20794, 32050, 32567) },
+ { CDF3(17243, 30625, 32254) },
+ { CDF3(13283, 27628, 31474) },
+ { CDF3( 9669, 22532, 28918) },
+ { CDF3(27435, 32697, 32748) },
+ { CDF3(24922, 32390, 32714) },
+ { CDF3(21449, 31504, 32536) },
+ { CDF3(16392, 29729, 31832) },
+ { CDF3(11692, 24884, 29076) },
+ { CDF3(24193, 32290, 32735) },
+ { CDF3(18909, 31104, 32563) },
+ { CDF3(12236, 26841, 31403) },
+ { CDF3( 8171, 21840, 29082) },
+ { CDF3( 7224, 17280, 25275) },
+ },
+ }, {
+ {
+ { CDF3( 3078, 6839, 9890) },
+ { CDF3(13837, 20450, 24479) },
+ { CDF3( 5914, 14222, 19328) },
+ { CDF3( 3866, 10267, 14762) },
+ { CDF3( 2612, 7208, 11042) },
+ { CDF3( 1067, 2991, 4776) },
+ { CDF3(25817, 31646, 32529) },
+ { CDF3(13708, 26338, 30385) },
+ { CDF3( 7328, 18585, 24870) },
+ { CDF3( 4691, 13080, 19276) },
+ { CDF3( 1825, 5253, 8352) },
+ { CDF3(29386, 32315, 32624) },
+ { CDF3(17160, 29001, 31360) },
+ { CDF3( 9602, 21862, 27396) },
+ { CDF3( 5915, 15772, 22148) },
+ { CDF3( 2786, 7779, 12047) },
+ { CDF3(29246, 32450, 32663) },
+ { CDF3(18696, 29929, 31818) },
+ { CDF3(10510, 23369, 28560) },
+ { CDF3( 6229, 16499, 23125) },
+ { CDF3( 2608, 7448, 11705) },
+ { CDF3(30753, 32710, 32748) },
+ { CDF3(21638, 31487, 32503) },
+ { CDF3(12937, 26854, 30870) },
+ { CDF3( 8182, 20596, 26970) },
+ { CDF3( 3637, 10269, 15497) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 5244, 12150, 16906) },
+ { CDF3(20486, 26858, 29701) },
+ { CDF3( 7756, 18317, 23735) },
+ { CDF3( 3452, 9256, 13146) },
+ { CDF3( 2020, 5206, 8229) },
+ { CDF3( 1801, 4993, 7903) },
+ { CDF3(27051, 31858, 32531) },
+ { CDF3(15988, 27531, 30619) },
+ { CDF3( 9188, 21484, 26719) },
+ { CDF3( 6273, 17186, 23800) },
+ { CDF3( 3108, 9355, 14764) },
+ { CDF3(31076, 32520, 32680) },
+ { CDF3(18119, 30037, 31850) },
+ { CDF3(10244, 22969, 27472) },
+ { CDF3( 4692, 14077, 19273) },
+ { CDF3( 3694, 11677, 17556) },
+ { CDF3(30060, 32581, 32720) },
+ { CDF3(21011, 30775, 32120) },
+ { CDF3(11931, 24820, 29289) },
+ { CDF3( 7119, 17662, 24356) },
+ { CDF3( 3833, 10706, 16304) },
+ { CDF3(31954, 32731, 32748) },
+ { CDF3(23913, 31724, 32489) },
+ { CDF3(15520, 28060, 31286) },
+ { CDF3(11517, 23008, 28571) },
+ { CDF3( 6193, 14508, 20629) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 1035, 2807, 4156) },
+ { CDF3(13162, 18138, 20939) },
+ { CDF3( 2696, 6633, 8755) },
+ { CDF3( 1373, 4161, 6853) },
+ { CDF3( 1099, 2746, 4716) },
+ { CDF3( 340, 1021, 1599) },
+ { CDF3(22826, 30419, 32135) },
+ { CDF3(10395, 21762, 26942) },
+ { CDF3( 4726, 12407, 17361) },
+ { CDF3( 2447, 7080, 10593) },
+ { CDF3( 1227, 3717, 6011) },
+ { CDF3(28156, 31424, 31934) },
+ { CDF3(16915, 27754, 30373) },
+ { CDF3( 9148, 20990, 26431) },
+ { CDF3( 5950, 15515, 21148) },
+ { CDF3( 2492, 7327, 11526) },
+ { CDF3(30602, 32477, 32670) },
+ { CDF3(20026, 29955, 31568) },
+ { CDF3(11220, 23628, 28105) },
+ { CDF3( 6652, 17019, 22973) },
+ { CDF3( 3064, 8536, 13043) },
+ { CDF3(31769, 32724, 32748) },
+ { CDF3(22230, 30887, 32373) },
+ { CDF3(12234, 25079, 29731) },
+ { CDF3( 7326, 18816, 25353) },
+ { CDF3( 3933, 10907, 16616) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(14995, 21341, 24749) },
+ { CDF3(13158, 20289, 24601) },
+ { CDF3( 8941, 15326, 19876) },
+ { CDF3( 6297, 11541, 15807) },
+ { CDF3( 4817, 9029, 12776) },
+ { CDF3( 3731, 7273, 10627) },
+ { CDF3( 1847, 3617, 5354) },
+ { CDF3(14472, 19659, 22343) },
+ { CDF3(16806, 24162, 27533) },
+ { CDF3(12900, 20404, 24713) },
+ { CDF3( 9411, 16112, 20797) },
+ { CDF3( 7056, 12697, 17148) },
+ { CDF3( 5544, 10339, 14460) },
+ { CDF3( 2954, 5704, 8319) },
+ { CDF3(12464, 18071, 21354) },
+ { CDF3(15482, 22528, 26034) },
+ { CDF3(12070, 19269, 23624) },
+ { CDF3( 8953, 15406, 20106) },
+ { CDF3( 7027, 12730, 17220) },
+ { CDF3( 5887, 10913, 15140) },
+ { CDF3( 3793, 7278, 10447) },
+ }, {
+ { CDF3(15571, 22232, 25749) },
+ { CDF3(14506, 21575, 25374) },
+ { CDF3(10189, 17089, 21569) },
+ { CDF3( 7316, 13301, 17915) },
+ { CDF3( 5783, 10912, 15190) },
+ { CDF3( 4760, 9155, 13088) },
+ { CDF3( 2993, 5966, 8774) },
+ { CDF3(23424, 28903, 30778) },
+ { CDF3(20775, 27666, 30290) },
+ { CDF3(16474, 24410, 28299) },
+ { CDF3(12471, 20180, 24987) },
+ { CDF3( 9410, 16487, 21439) },
+ { CDF3( 7536, 13614, 18529) },
+ { CDF3( 5048, 9586, 13549) },
+ { CDF3(21090, 27290, 29756) },
+ { CDF3(20796, 27402, 30026) },
+ { CDF3(17819, 25485, 28969) },
+ { CDF3(13860, 21909, 26462) },
+ { CDF3(11002, 18494, 23529) },
+ { CDF3( 8953, 15929, 20897) },
+ { CDF3( 6448, 11918, 16454) },
+ },
+ }, {
+ {
+ { CDF3(15999, 22208, 25449) },
+ { CDF3(13050, 19988, 24122) },
+ { CDF3( 8594, 14864, 19378) },
+ { CDF3( 6033, 11079, 15238) },
+ { CDF3( 4554, 8683, 12347) },
+ { CDF3( 3672, 7139, 10337) },
+ { CDF3( 1900, 3771, 5576) },
+ { CDF3(15788, 21340, 23949) },
+ { CDF3(16825, 24235, 27758) },
+ { CDF3(12873, 20402, 24810) },
+ { CDF3( 9590, 16363, 21094) },
+ { CDF3( 7352, 13209, 17733) },
+ { CDF3( 5960, 10989, 15184) },
+ { CDF3( 3232, 6234, 9007) },
+ { CDF3(15761, 20716, 23224) },
+ { CDF3(19318, 25989, 28759) },
+ { CDF3(15529, 23094, 26929) },
+ { CDF3(11662, 18989, 23641) },
+ { CDF3( 8955, 15568, 20366) },
+ { CDF3( 7281, 13106, 17708) },
+ { CDF3( 4248, 8059, 11440) },
+ }, {
+ { CDF3(14899, 21217, 24503) },
+ { CDF3(13519, 20283, 24047) },
+ { CDF3( 9429, 15966, 20365) },
+ { CDF3( 6700, 12355, 16652) },
+ { CDF3( 5088, 9704, 13716) },
+ { CDF3( 4243, 8154, 11731) },
+ { CDF3( 2702, 5364, 7861) },
+ { CDF3(22745, 28388, 30454) },
+ { CDF3(20235, 27146, 29922) },
+ { CDF3(15896, 23715, 27637) },
+ { CDF3(11840, 19350, 24131) },
+ { CDF3( 9122, 15932, 20880) },
+ { CDF3( 7488, 13581, 18362) },
+ { CDF3( 5114, 9568, 13370) },
+ { CDF3(20845, 26553, 28932) },
+ { CDF3(20981, 27372, 29884) },
+ { CDF3(17781, 25335, 28785) },
+ { CDF3(13760, 21708, 26297) },
+ { CDF3(10975, 18415, 23365) },
+ { CDF3( 9045, 15789, 20686) },
+ { CDF3( 6130, 11199, 15423) },
+ },
+ }, {
+ {
+ { CDF3(13549, 19724, 23158) },
+ { CDF3(11844, 18382, 22246) },
+ { CDF3( 7919, 13619, 17773) },
+ { CDF3( 5486, 10143, 13946) },
+ { CDF3( 4166, 7983, 11324) },
+ { CDF3( 3364, 6506, 9427) },
+ { CDF3( 1598, 3160, 4674) },
+ { CDF3(15281, 20979, 23781) },
+ { CDF3(14939, 22119, 25952) },
+ { CDF3(11363, 18407, 22812) },
+ { CDF3( 8609, 14857, 19370) },
+ { CDF3( 6737, 12184, 16480) },
+ { CDF3( 5506, 10263, 14262) },
+ { CDF3( 2990, 5786, 8380) },
+ { CDF3(20249, 25253, 27417) },
+ { CDF3(21070, 27518, 30001) },
+ { CDF3(16854, 24469, 28074) },
+ { CDF3(12864, 20486, 25000) },
+ { CDF3( 9962, 16978, 21778) },
+ { CDF3( 8074, 14338, 19048) },
+ { CDF3( 4494, 8479, 11906) },
+ }, {
+ { CDF3(13960, 19617, 22829) },
+ { CDF3(11150, 17341, 21228) },
+ { CDF3( 7150, 12964, 17190) },
+ { CDF3( 5331, 10002, 13867) },
+ { CDF3( 4167, 7744, 11057) },
+ { CDF3( 3480, 6629, 9646) },
+ { CDF3( 1883, 3784, 5686) },
+ { CDF3(18752, 25660, 28912) },
+ { CDF3(16968, 24586, 28030) },
+ { CDF3(13520, 21055, 25313) },
+ { CDF3(10453, 17626, 22280) },
+ { CDF3( 8386, 14505, 19116) },
+ { CDF3( 6742, 12595, 17008) },
+ { CDF3( 4273, 8140, 11499) },
+ { CDF3(22120, 27827, 30233) },
+ { CDF3(20563, 27358, 29895) },
+ { CDF3(17076, 24644, 28153) },
+ { CDF3(13362, 20942, 25309) },
+ { CDF3(10794, 17965, 22695) },
+ { CDF3( 9014, 15652, 20319) },
+ { CDF3( 5708, 10512, 14497) },
+ },
+ }, {
+ {
+ { CDF3( 5705, 10930, 15725) },
+ { CDF3( 7946, 12765, 16115) },
+ { CDF3( 6801, 12123, 16226) },
+ { CDF3( 5462, 10135, 14200) },
+ { CDF3( 4189, 8011, 11507) },
+ { CDF3( 3191, 6229, 9408) },
+ { CDF3( 1057, 2137, 3212) },
+ { CDF3(10018, 17067, 21491) },
+ { CDF3( 7380, 12582, 16453) },
+ { CDF3( 6068, 10845, 14339) },
+ { CDF3( 5098, 9198, 12555) },
+ { CDF3( 4312, 8010, 11119) },
+ { CDF3( 3700, 6966, 9781) },
+ { CDF3( 1693, 3326, 4887) },
+ { CDF3(18757, 24930, 27774) },
+ { CDF3(17648, 24596, 27817) },
+ { CDF3(14707, 22052, 26026) },
+ { CDF3(11720, 18852, 23292) },
+ { CDF3( 9357, 15952, 20525) },
+ { CDF3( 7810, 13753, 18210) },
+ { CDF3( 3879, 7333, 10328) },
+ }, {
+ { CDF3( 8278, 13242, 15922) },
+ { CDF3(10547, 15867, 18919) },
+ { CDF3( 9106, 15842, 20609) },
+ { CDF3( 6833, 13007, 17218) },
+ { CDF3( 4811, 9712, 13923) },
+ { CDF3( 3985, 7352, 11128) },
+ { CDF3( 1688, 3458, 5262) },
+ { CDF3(12951, 21861, 26510) },
+ { CDF3( 9788, 16044, 20276) },
+ { CDF3( 6309, 11244, 14870) },
+ { CDF3( 5183, 9349, 12566) },
+ { CDF3( 4389, 8229, 11492) },
+ { CDF3( 3633, 6945, 10620) },
+ { CDF3( 3600, 6847, 9907) },
+ { CDF3(21748, 28137, 30255) },
+ { CDF3(19436, 26581, 29560) },
+ { CDF3(16359, 24201, 27953) },
+ { CDF3(13961, 21693, 25871) },
+ { CDF3(11544, 18686, 23322) },
+ { CDF3( 9372, 16462, 20952) },
+ { CDF3( 6138, 11210, 15390) },
+ },
+ },
+ },
+ }, [2] = {
+ .skip = {
+ {
+ { CDF1(29614) }, { CDF1( 9068) }, { CDF1(12924) },
+ { CDF1(19538) }, { CDF1(17737) }, { CDF1(24619) },
+ { CDF1(30642) }, { CDF1( 4119) }, { CDF1(16026) },
+ { CDF1(25657) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31957) }, { CDF1( 3230) }, { CDF1(11153) },
+ { CDF1(18123) }, { CDF1(20143) }, { CDF1(26536) },
+ { CDF1(31986) }, { CDF1( 3050) }, { CDF1(14603) },
+ { CDF1(25155) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(32363) }, { CDF1(10692) }, { CDF1(19090) },
+ { CDF1(24357) }, { CDF1(24442) }, { CDF1(28312) },
+ { CDF1(32169) }, { CDF1( 3648) }, { CDF1(15690) },
+ { CDF1(26815) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(30669) }, { CDF1( 3832) }, { CDF1(11663) },
+ { CDF1(18889) }, { CDF1(19782) }, { CDF1(23313) },
+ { CDF1(31330) }, { CDF1( 5124) }, { CDF1(18719) },
+ { CDF1(28468) }, { CDF1( 3082) }, { CDF1(20982) },
+ { CDF1(29443) },
+ }, {
+ { CDF1(28573) }, { CDF1( 3183) }, { CDF1(17802) },
+ { CDF1(25977) }, { CDF1(26677) }, { CDF1(27832) },
+ { CDF1(32387) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 4016, 4897, 8881, 14968) },
+ { CDF4( 716, 1105, 2646, 10056) },
+ }, {
+ { CDF4(11139, 13270, 18241, 23566) },
+ { CDF4( 3192, 5032, 10297, 19755) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 2515, 3003, 4452, 8162, 16041) },
+ { CDF5( 574, 821, 1836, 5089, 13128) },
+ }, {
+ { CDF5(13468, 16303, 20361, 25105, 29281) },
+ { CDF5( 3542, 5502, 10415, 16760, 25644) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 2374, 2772, 4583, 7276, 12288, 19706) },
+ { CDF6( 497, 810, 1315, 3000, 7004, 15641) },
+ }, {
+ { CDF6(15050, 17126, 21410, 24886, 28156, 30726) },
+ { CDF6( 4034, 6290, 10235, 14982, 21214, 28491) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 1366, 1738, 2527, 5016, 9355, 15797, 24643) },
+ { CDF7( 354, 558, 944, 2760, 7287, 14037, 21779) },
+ }, {
+ { CDF7(13627, 16246, 20173, 24429, 27948, 30415, 31863) },
+ { CDF7( 6275, 9889, 14769, 23164, 27988, 30493, 32272) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 3089, 3920, 6038, 9460,
+ 14266, 19881, 25766, 29176) },
+ { CDF8( 1084, 2358, 3488, 5122,
+ 11483, 18103, 26023, 29799) },
+ }, {
+ { CDF8(11514, 13794, 17480, 20754,
+ 24361, 27378, 29492, 31277) },
+ { CDF8( 6571, 9610, 15516, 21826,
+ 29092, 30829, 31842, 32708) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 2624, 3936, 6480, 9686, 13979,
+ 17726, 23267, 28410, 31078) },
+ { CDF9(12015, 14769, 19588, 22052, 24222,
+ 25812, 27300, 29219, 32114) },
+ }, .eob_bin_1024 = {
+ { CDF10( 2784, 3831, 7041, 10521, 14847,
+ 18844, 23155, 26682, 29229, 31045) },
+ { CDF10( 9577, 12466, 17739, 20750, 22061,
+ 23215, 24601, 25483, 25843, 32056) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(18983) },
+ { CDF1(20512) }, { CDF1(14885) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20090) },
+ { CDF1(19444) }, { CDF1(17286) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19139) },
+ { CDF1(21487) }, { CDF1(18959) }, { CDF1(20910) },
+ { CDF1(19089) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20536) },
+ { CDF1(20664) }, { CDF1(20625) }, { CDF1(19123) },
+ { CDF1(14862) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19833) },
+ { CDF1(21502) }, { CDF1(17485) }, { CDF1(20267) },
+ { CDF1(18353) }, { CDF1(23329) }, { CDF1(21478) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22041) },
+ { CDF1(23434) }, { CDF1(20001) }, { CDF1(20554) },
+ { CDF1(20951) }, { CDF1(20145) }, { CDF1(15562) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(23312) },
+ { CDF1(21607) }, { CDF1(16526) }, { CDF1(18957) },
+ { CDF1(18034) }, { CDF1(18934) }, { CDF1(24247) },
+ { CDF1(16921) }, { CDF1(17080) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26579) },
+ { CDF1(24910) }, { CDF1(18637) }, { CDF1(19800) },
+ { CDF1(20388) }, { CDF1( 9887) }, { CDF1(15642) },
+ { CDF1(30198) }, { CDF1(24721) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26998) },
+ { CDF1(16737) }, { CDF1(17838) }, { CDF1(18922) },
+ { CDF1(19515) }, { CDF1(18636) }, { CDF1(17333) },
+ { CDF1(15776) }, { CDF1(22658) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(20092, 30774) }, { CDF2(30695, 32020) },
+ { CDF2(31131, 32103) }, { CDF2(28666, 30870) },
+ }, {
+ { CDF2(27258, 31095) }, { CDF2(31804, 32623) },
+ { CDF2(31763, 32528) }, { CDF2(31438, 32506) },
+ },
+ }, {
+ {
+ { CDF2(18049, 30489) }, { CDF2(31706, 32286) },
+ { CDF2(32163, 32473) }, { CDF2(31550, 32184) },
+ }, {
+ { CDF2(27116, 30842) }, { CDF2(31971, 32598) },
+ { CDF2(32088, 32576) }, { CDF2(32067, 32664) },
+ },
+ }, {
+ {
+ { CDF2(12854, 29093) }, { CDF2(32272, 32558) },
+ { CDF2(32667, 32729) }, { CDF2(32306, 32585) },
+ }, {
+ { CDF2(25476, 30366) }, { CDF2(32169, 32687) },
+ { CDF2(32479, 32689) }, { CDF2(31673, 32634) },
+ },
+ }, {
+ {
+ { CDF2( 2809, 19301) }, { CDF2(32205, 32622) },
+ { CDF2(32338, 32730) }, { CDF2(31786, 32616) },
+ }, {
+ { CDF2(22737, 29105) }, { CDF2(30810, 32362) },
+ { CDF2(30014, 32627) }, { CDF2(30528, 32574) },
+ },
+ }, {
+ {
+ { CDF2( 935, 3382) }, { CDF2(30789, 31909) },
+ { CDF2(32466, 32756) }, { CDF2(30860, 32513) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 8896, 16227, 20630) },
+ { CDF3(23629, 31782, 32527) },
+ { CDF3(15173, 27755, 31321) },
+ { CDF3(10158, 21233, 27382) },
+ { CDF3( 6420, 14857, 21558) },
+ { CDF3( 3269, 8155, 12646) },
+ { CDF3(24835, 32009, 32496) },
+ { CDF3(16509, 28421, 31579) },
+ { CDF3(10957, 21514, 27418) },
+ { CDF3( 7881, 15930, 22096) },
+ { CDF3( 5388, 10960, 15918) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(20745, 30773, 32093) },
+ { CDF3(15200, 27221, 30861) },
+ { CDF3(13032, 20873, 25667) },
+ { CDF3(12285, 18663, 23494) },
+ { CDF3(11563, 17481, 21489) },
+ { CDF3(26260, 31982, 32320) },
+ { CDF3(15397, 28083, 31100) },
+ { CDF3( 9742, 19217, 24824) },
+ { CDF3( 3261, 9629, 15362) },
+ { CDF3( 1480, 4322, 7499) },
+ { CDF3(27599, 32256, 32460) },
+ { CDF3(16857, 27659, 30774) },
+ { CDF3( 9551, 18290, 23748) },
+ { CDF3( 3052, 8933, 14103) },
+ { CDF3( 2021, 5910, 9787) },
+ { CDF3(29005, 32015, 32392) },
+ { CDF3(17677, 27694, 30863) },
+ { CDF3( 9204, 17356, 23219) },
+ { CDF3( 2403, 7516, 12814) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3(10808, 22056, 26896) },
+ { CDF3(25739, 32313, 32676) },
+ { CDF3(17288, 30203, 32221) },
+ { CDF3(11359, 24878, 29896) },
+ { CDF3( 6949, 17767, 24893) },
+ { CDF3( 4287, 11796, 18071) },
+ { CDF3(27880, 32521, 32705) },
+ { CDF3(19038, 31004, 32414) },
+ { CDF3(12564, 26345, 30768) },
+ { CDF3( 8269, 19947, 26779) },
+ { CDF3( 5674, 14657, 21674) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(25742, 32319, 32671) },
+ { CDF3(19557, 31164, 32454) },
+ { CDF3(13381, 26381, 30755) },
+ { CDF3(10101, 21466, 26722) },
+ { CDF3( 9209, 19650, 26825) },
+ { CDF3(27107, 31917, 32432) },
+ { CDF3(18056, 28893, 31203) },
+ { CDF3(10200, 21434, 26764) },
+ { CDF3( 4660, 12913, 19502) },
+ { CDF3( 2368, 6930, 12504) },
+ { CDF3(26960, 32158, 32613) },
+ { CDF3(18628, 30005, 32031) },
+ { CDF3(10233, 22442, 28232) },
+ { CDF3( 5471, 14630, 21516) },
+ { CDF3( 3235, 10767, 17109) },
+ { CDF3(27696, 32440, 32692) },
+ { CDF3(20032, 31167, 32438) },
+ { CDF3( 8700, 21341, 28442) },
+ { CDF3( 5662, 14831, 21795) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 9704, 17294, 21132) },
+ { CDF3(26762, 32278, 32633) },
+ { CDF3(18382, 29620, 31819) },
+ { CDF3(10891, 23475, 28723) },
+ { CDF3( 6358, 16583, 23309) },
+ { CDF3( 3248, 9118, 14141) },
+ { CDF3(27204, 32573, 32699) },
+ { CDF3(19818, 30824, 32329) },
+ { CDF3(11772, 25120, 30041) },
+ { CDF3( 6995, 18033, 25039) },
+ { CDF3( 3752, 10442, 16098) },
+ { CDF3(27222, 32256, 32559) },
+ { CDF3(15356, 28399, 31475) },
+ { CDF3( 8821, 20635, 27057) },
+ { CDF3( 5511, 14404, 21239) },
+ { CDF3( 2935, 8222, 13051) },
+ { CDF3(24875, 32120, 32529) },
+ { CDF3(15233, 28265, 31445) },
+ { CDF3( 8605, 20570, 26932) },
+ { CDF3( 5431, 14413, 21196) },
+ { CDF3( 2994, 8341, 13223) },
+ { CDF3(28201, 32604, 32700) },
+ { CDF3(21041, 31446, 32456) },
+ { CDF3(13221, 26213, 30475) },
+ { CDF3( 8255, 19385, 26037) },
+ { CDF3( 4930, 12585, 18830) },
+ { CDF3(28768, 32448, 32627) },
+ { CDF3(19705, 30561, 32021) },
+ { CDF3(11572, 23589, 28220) },
+ { CDF3( 5532, 15034, 21446) },
+ { CDF3( 2460, 7150, 11456) },
+ { CDF3(29874, 32619, 32699) },
+ { CDF3(21621, 31071, 32201) },
+ { CDF3(12511, 24747, 28992) },
+ { CDF3( 6281, 16395, 22748) },
+ { CDF3( 3246, 9278, 14497) },
+ { CDF3(29715, 32625, 32712) },
+ { CDF3(20958, 31011, 32283) },
+ { CDF3(11233, 23671, 28806) },
+ { CDF3( 6012, 16128, 22868) },
+ { CDF3( 3427, 9851, 15414) },
+ }, {
+ { CDF3(11016, 22111, 26794) },
+ { CDF3(25946, 32357, 32677) },
+ { CDF3(17890, 30452, 32252) },
+ { CDF3(11678, 25142, 29816) },
+ { CDF3( 6720, 17534, 24584) },
+ { CDF3( 4230, 11665, 17820) },
+ { CDF3(28400, 32623, 32747) },
+ { CDF3(21164, 31668, 32575) },
+ { CDF3(13572, 27388, 31182) },
+ { CDF3( 8234, 20750, 27358) },
+ { CDF3( 5065, 14055, 20897) },
+ { CDF3(28981, 32547, 32705) },
+ { CDF3(18681, 30543, 32239) },
+ { CDF3(10919, 24075, 29286) },
+ { CDF3( 6431, 17199, 24077) },
+ { CDF3( 3819, 10464, 16618) },
+ { CDF3(26870, 32467, 32693) },
+ { CDF3(19041, 30831, 32347) },
+ { CDF3(11794, 25211, 30016) },
+ { CDF3( 6888, 18019, 24970) },
+ { CDF3( 4370, 12363, 18992) },
+ { CDF3(29578, 32670, 32744) },
+ { CDF3(23159, 32007, 32613) },
+ { CDF3(15315, 28669, 31676) },
+ { CDF3( 9298, 22607, 28782) },
+ { CDF3( 6144, 15913, 22968) },
+ { CDF3(28110, 32499, 32669) },
+ { CDF3(21574, 30937, 32015) },
+ { CDF3(12759, 24818, 28727) },
+ { CDF3( 6545, 16761, 23042) },
+ { CDF3( 3649, 10597, 16833) },
+ { CDF3(28163, 32552, 32728) },
+ { CDF3(22101, 31469, 32464) },
+ { CDF3(13160, 25472, 30143) },
+ { CDF3( 7303, 18684, 25468) },
+ { CDF3( 5241, 13975, 20955) },
+ { CDF3(28400, 32631, 32744) },
+ { CDF3(22104, 31793, 32603) },
+ { CDF3(13557, 26571, 30846) },
+ { CDF3( 7749, 19861, 26675) },
+ { CDF3( 4873, 14030, 21234) },
+ },
+ }, {
+ {
+ { CDF3( 9800, 17635, 21073) },
+ { CDF3(26153, 31885, 32527) },
+ { CDF3(15038, 27852, 31006) },
+ { CDF3( 8718, 20564, 26486) },
+ { CDF3( 5128, 14076, 20514) },
+ { CDF3( 2636, 7566, 11925) },
+ { CDF3(27551, 32504, 32701) },
+ { CDF3(18310, 30054, 32100) },
+ { CDF3(10211, 23420, 29082) },
+ { CDF3( 6222, 16876, 23916) },
+ { CDF3( 3462, 9954, 15498) },
+ { CDF3(29991, 32633, 32721) },
+ { CDF3(19883, 30751, 32201) },
+ { CDF3(11141, 24184, 29285) },
+ { CDF3( 6420, 16940, 23774) },
+ { CDF3( 3392, 9753, 15118) },
+ { CDF3(28465, 32616, 32712) },
+ { CDF3(19850, 30702, 32244) },
+ { CDF3(10983, 24024, 29223) },
+ { CDF3( 6294, 16770, 23582) },
+ { CDF3( 3244, 9283, 14509) },
+ { CDF3(30023, 32717, 32748) },
+ { CDF3(22940, 32032, 32626) },
+ { CDF3(14282, 27928, 31473) },
+ { CDF3( 8562, 21327, 27914) },
+ { CDF3( 4846, 13393, 19919) },
+ { CDF3(29981, 32590, 32695) },
+ { CDF3(20465, 30963, 32166) },
+ { CDF3(11479, 23579, 28195) },
+ { CDF3( 5916, 15648, 22073) },
+ { CDF3( 3031, 8605, 13398) },
+ { CDF3(31146, 32691, 32739) },
+ { CDF3(23106, 31724, 32444) },
+ { CDF3(13783, 26738, 30439) },
+ { CDF3( 7852, 19468, 25807) },
+ { CDF3( 3860, 11124, 16853) },
+ { CDF3(31014, 32724, 32748) },
+ { CDF3(23629, 32109, 32628) },
+ { CDF3(14747, 28115, 31403) },
+ { CDF3( 8545, 21242, 27478) },
+ { CDF3( 4574, 12781, 19067) },
+ }, {
+ { CDF3( 9185, 19694, 24688) },
+ { CDF3(26081, 31985, 32621) },
+ { CDF3(16015, 29000, 31787) },
+ { CDF3(10542, 23690, 29206) },
+ { CDF3( 6732, 17945, 24677) },
+ { CDF3( 3916, 11039, 16722) },
+ { CDF3(28224, 32566, 32744) },
+ { CDF3(19100, 31138, 32485) },
+ { CDF3(12528, 26620, 30879) },
+ { CDF3( 7741, 20277, 26885) },
+ { CDF3( 4566, 12845, 18990) },
+ { CDF3(29933, 32593, 32718) },
+ { CDF3(17670, 30333, 32155) },
+ { CDF3(10385, 23600, 28909) },
+ { CDF3( 6243, 16236, 22407) },
+ { CDF3( 3976, 10389, 16017) },
+ { CDF3(28377, 32561, 32738) },
+ { CDF3(19366, 31175, 32482) },
+ { CDF3(13327, 27175, 31094) },
+ { CDF3( 8258, 20769, 27143) },
+ { CDF3( 4703, 13198, 19527) },
+ { CDF3(31086, 32706, 32748) },
+ { CDF3(22853, 31902, 32583) },
+ { CDF3(14759, 28186, 31419) },
+ { CDF3( 9284, 22382, 28348) },
+ { CDF3( 5585, 15192, 21868) },
+ { CDF3(28291, 32652, 32746) },
+ { CDF3(19849, 32107, 32571) },
+ { CDF3(14834, 26818, 29214) },
+ { CDF3(10306, 22594, 28672) },
+ { CDF3( 6615, 17384, 23384) },
+ { CDF3(28947, 32604, 32745) },
+ { CDF3(25625, 32289, 32646) },
+ { CDF3(18758, 28672, 31403) },
+ { CDF3(10017, 23430, 28523) },
+ { CDF3( 6862, 15269, 22131) },
+ { CDF3(23933, 32509, 32739) },
+ { CDF3(19927, 31495, 32631) },
+ { CDF3(11903, 26023, 30621) },
+ { CDF3( 7026, 20094, 27252) },
+ { CDF3( 5998, 18106, 24437) },
+ },
+ }, {
+ {
+ { CDF3( 4456, 11274, 15533) },
+ { CDF3(21219, 29079, 31616) },
+ { CDF3(11173, 23774, 28567) },
+ { CDF3( 7282, 18293, 24263) },
+ { CDF3( 4890, 13286, 19115) },
+ { CDF3( 1890, 5508, 8659) },
+ { CDF3(26651, 32136, 32647) },
+ { CDF3(14630, 28254, 31455) },
+ { CDF3( 8716, 21287, 27395) },
+ { CDF3( 5615, 15331, 22008) },
+ { CDF3( 2675, 7700, 12150) },
+ { CDF3(29954, 32526, 32690) },
+ { CDF3(16126, 28982, 31633) },
+ { CDF3( 9030, 21361, 27352) },
+ { CDF3( 5411, 14793, 21271) },
+ { CDF3( 2943, 8422, 13163) },
+ { CDF3(29539, 32601, 32730) },
+ { CDF3(18125, 30385, 32201) },
+ { CDF3(10422, 24090, 29468) },
+ { CDF3( 6468, 17487, 24438) },
+ { CDF3( 2970, 8653, 13531) },
+ { CDF3(30912, 32715, 32748) },
+ { CDF3(20666, 31373, 32497) },
+ { CDF3(12509, 26640, 30917) },
+ { CDF3( 8058, 20629, 27290) },
+ { CDF3( 4231, 12006, 18052) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3(10202, 20633, 25484) },
+ { CDF3(27336, 31445, 32352) },
+ { CDF3(12420, 24384, 28552) },
+ { CDF3( 7648, 18115, 23856) },
+ { CDF3( 5662, 14341, 19902) },
+ { CDF3( 3611, 10328, 15390) },
+ { CDF3(30945, 32616, 32736) },
+ { CDF3(18682, 30505, 32253) },
+ { CDF3(11513, 25336, 30203) },
+ { CDF3( 7449, 19452, 26148) },
+ { CDF3( 4482, 13051, 18886) },
+ { CDF3(32022, 32690, 32747) },
+ { CDF3(18578, 30501, 32146) },
+ { CDF3(11249, 23368, 28631) },
+ { CDF3( 5645, 16958, 22158) },
+ { CDF3( 5009, 11444, 16637) },
+ { CDF3(31357, 32710, 32748) },
+ { CDF3(21552, 31494, 32504) },
+ { CDF3(13891, 27677, 31340) },
+ { CDF3( 9051, 22098, 28172) },
+ { CDF3( 5190, 13377, 19486) },
+ { CDF3(32364, 32740, 32748) },
+ { CDF3(24839, 31907, 32551) },
+ { CDF3(17160, 28779, 31696) },
+ { CDF3(12452, 24137, 29602) },
+ { CDF3( 6165, 15389, 22477) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 2575, 7281, 11077) },
+ { CDF3(14002, 20866, 25402) },
+ { CDF3( 6343, 15056, 19658) },
+ { CDF3( 4474, 11858, 17041) },
+ { CDF3( 2865, 8299, 12534) },
+ { CDF3( 1344, 3949, 6391) },
+ { CDF3(24720, 31239, 32459) },
+ { CDF3(12585, 25356, 29968) },
+ { CDF3( 7181, 18246, 24444) },
+ { CDF3( 5025, 13667, 19885) },
+ { CDF3( 2521, 7304, 11605) },
+ { CDF3(29908, 32252, 32584) },
+ { CDF3(17421, 29156, 31575) },
+ { CDF3( 9889, 22188, 27782) },
+ { CDF3( 5878, 15647, 22123) },
+ { CDF3( 2814, 8665, 13323) },
+ { CDF3(30183, 32568, 32713) },
+ { CDF3(18528, 30195, 32049) },
+ { CDF3(10982, 24606, 29657) },
+ { CDF3( 6957, 18165, 25231) },
+ { CDF3( 3508, 10118, 15468) },
+ { CDF3(31761, 32736, 32748) },
+ { CDF3(21041, 31328, 32546) },
+ { CDF3(12568, 26732, 31166) },
+ { CDF3( 8052, 20720, 27733) },
+ { CDF3( 4336, 12192, 18396) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(16138, 22223, 25509) },
+ { CDF3(15347, 22430, 26332) },
+ { CDF3( 9614, 16736, 21332) },
+ { CDF3( 6600, 12275, 16907) },
+ { CDF3( 4811, 9424, 13547) },
+ { CDF3( 3748, 7809, 11420) },
+ { CDF3( 2254, 4587, 6890) },
+ { CDF3(15196, 20284, 23177) },
+ { CDF3(18317, 25469, 28451) },
+ { CDF3(13918, 21651, 25842) },
+ { CDF3(10052, 17150, 21995) },
+ { CDF3( 7499, 13630, 18587) },
+ { CDF3( 6158, 11417, 16003) },
+ { CDF3( 4014, 7785, 11252) },
+ { CDF3(15048, 21067, 24384) },
+ { CDF3(18202, 25346, 28553) },
+ { CDF3(14302, 22019, 26356) },
+ { CDF3(10839, 18139, 23166) },
+ { CDF3( 8715, 15744, 20806) },
+ { CDF3( 7536, 13576, 18544) },
+ { CDF3( 5413, 10335, 14498) },
+ }, {
+ { CDF3(17394, 24501, 27895) },
+ { CDF3(15889, 23420, 27185) },
+ { CDF3(11561, 19133, 23870) },
+ { CDF3( 8285, 14812, 19844) },
+ { CDF3( 6496, 12043, 16550) },
+ { CDF3( 4771, 9574, 13677) },
+ { CDF3( 3603, 6830, 10144) },
+ { CDF3(21656, 27704, 30200) },
+ { CDF3(21324, 27915, 30511) },
+ { CDF3(17327, 25336, 28997) },
+ { CDF3(13417, 21381, 26033) },
+ { CDF3(10132, 17425, 22338) },
+ { CDF3( 8580, 15016, 19633) },
+ { CDF3( 5694, 11477, 16411) },
+ { CDF3(24116, 29780, 31450) },
+ { CDF3(23853, 29695, 31591) },
+ { CDF3(20085, 27614, 30428) },
+ { CDF3(15326, 24335, 28575) },
+ { CDF3(11814, 19472, 24810) },
+ { CDF3(10221, 18611, 24767) },
+ { CDF3( 7689, 14558, 20321) },
+ },
+ }, {
+ {
+ { CDF3(16214, 22380, 25770) },
+ { CDF3(14213, 21304, 25295) },
+ { CDF3( 9213, 15823, 20455) },
+ { CDF3( 6395, 11758, 16139) },
+ { CDF3( 4779, 9187, 13066) },
+ { CDF3( 3821, 7501, 10953) },
+ { CDF3( 2293, 4567, 6795) },
+ { CDF3(15859, 21283, 23820) },
+ { CDF3(18404, 25602, 28726) },
+ { CDF3(14325, 21980, 26206) },
+ { CDF3(10669, 17937, 22720) },
+ { CDF3( 8297, 14642, 19447) },
+ { CDF3( 6746, 12389, 16893) },
+ { CDF3( 4324, 8251, 11770) },
+ { CDF3(16532, 21631, 24475) },
+ { CDF3(20667, 27150, 29668) },
+ { CDF3(16728, 24510, 28175) },
+ { CDF3(12861, 20645, 25332) },
+ { CDF3(10076, 17361, 22417) },
+ { CDF3( 8395, 14940, 19963) },
+ { CDF3( 5731, 10683, 14912) },
+ }, {
+ { CDF3(14433, 21155, 24938) },
+ { CDF3(14658, 21716, 25545) },
+ { CDF3( 9923, 16824, 21557) },
+ { CDF3( 6982, 13052, 17721) },
+ { CDF3( 5419, 10503, 15050) },
+ { CDF3( 4852, 9162, 13014) },
+ { CDF3( 3271, 6395, 9630) },
+ { CDF3(22210, 27833, 30109) },
+ { CDF3(20750, 27368, 29821) },
+ { CDF3(16894, 24828, 28573) },
+ { CDF3(13247, 21276, 25757) },
+ { CDF3(10038, 17265, 22563) },
+ { CDF3( 8587, 14947, 20327) },
+ { CDF3( 5645, 11371, 15252) },
+ { CDF3(22027, 27526, 29714) },
+ { CDF3(23098, 29146, 31221) },
+ { CDF3(19886, 27341, 30272) },
+ { CDF3(15609, 23747, 28046) },
+ { CDF3(11993, 20065, 24939) },
+ { CDF3( 9637, 18267, 23671) },
+ { CDF3( 7625, 13801, 19144) },
+ },
+ }, {
+ {
+ { CDF3(14438, 20798, 24089) },
+ { CDF3(12621, 19203, 23097) },
+ { CDF3( 8177, 14125, 18402) },
+ { CDF3( 5674, 10501, 14456) },
+ { CDF3( 4236, 8239, 11733) },
+ { CDF3( 3447, 6750, 9806) },
+ { CDF3( 1986, 3950, 5864) },
+ { CDF3(16208, 22099, 24930) },
+ { CDF3(16537, 24025, 27585) },
+ { CDF3(12780, 20381, 24867) },
+ { CDF3( 9767, 16612, 21416) },
+ { CDF3( 7686, 13738, 18398) },
+ { CDF3( 6333, 11614, 15964) },
+ { CDF3( 3941, 7571, 10836) },
+ { CDF3(22819, 27422, 29202) },
+ { CDF3(22224, 28514, 30721) },
+ { CDF3(17660, 25433, 28913) },
+ { CDF3(13574, 21482, 26002) },
+ { CDF3(10629, 17977, 22938) },
+ { CDF3( 8612, 15298, 20265) },
+ { CDF3( 5607, 10491, 14596) },
+ }, {
+ { CDF3(13569, 19800, 23206) },
+ { CDF3(13128, 19924, 23869) },
+ { CDF3( 8329, 14841, 19403) },
+ { CDF3( 6130, 10976, 15057) },
+ { CDF3( 4682, 8839, 12518) },
+ { CDF3( 3656, 7409, 10588) },
+ { CDF3( 2577, 5099, 7412) },
+ { CDF3(22427, 28684, 30585) },
+ { CDF3(20913, 27750, 30139) },
+ { CDF3(15840, 24109, 27834) },
+ { CDF3(12308, 20029, 24569) },
+ { CDF3(10216, 16785, 21458) },
+ { CDF3( 8309, 14203, 19113) },
+ { CDF3( 6043, 11168, 15307) },
+ { CDF3(23166, 28901, 30998) },
+ { CDF3(21899, 28405, 30751) },
+ { CDF3(18413, 26091, 29443) },
+ { CDF3(15233, 23114, 27352) },
+ { CDF3(12683, 20472, 25288) },
+ { CDF3(10702, 18259, 23409) },
+ { CDF3( 8125, 14464, 19226) },
+ },
+ }, {
+ {
+ { CDF3( 9040, 14786, 18360) },
+ { CDF3( 9979, 15718, 19415) },
+ { CDF3( 7913, 13918, 18311) },
+ { CDF3( 5859, 10889, 15184) },
+ { CDF3( 4593, 8677, 12510) },
+ { CDF3( 3820, 7396, 10791) },
+ { CDF3( 1730, 3471, 5192) },
+ { CDF3(11803, 18365, 22709) },
+ { CDF3(11419, 18058, 22225) },
+ { CDF3( 9418, 15774, 20243) },
+ { CDF3( 7539, 13325, 17657) },
+ { CDF3( 6233, 11317, 15384) },
+ { CDF3( 5137, 9656, 13545) },
+ { CDF3( 2977, 5774, 8349) },
+ { CDF3(21207, 27246, 29640) },
+ { CDF3(19547, 26578, 29497) },
+ { CDF3(16169, 23871, 27690) },
+ { CDF3(12820, 20458, 25018) },
+ { CDF3(10224, 17332, 22214) },
+ { CDF3( 8526, 15048, 19884) },
+ { CDF3( 5037, 9410, 13118) },
+ }, {
+ { CDF3(12339, 17329, 20140) },
+ { CDF3(13505, 19895, 23225) },
+ { CDF3( 9847, 16944, 21564) },
+ { CDF3( 7280, 13256, 18348) },
+ { CDF3( 4712, 10009, 14454) },
+ { CDF3( 4361, 7914, 12477) },
+ { CDF3( 2870, 5628, 7995) },
+ { CDF3(20061, 25504, 28526) },
+ { CDF3(15235, 22878, 26145) },
+ { CDF3(12985, 19958, 24155) },
+ { CDF3( 9782, 16641, 21403) },
+ { CDF3( 9456, 16360, 20760) },
+ { CDF3( 6855, 12940, 18557) },
+ { CDF3( 5661, 10564, 15002) },
+ { CDF3(25656, 30602, 31894) },
+ { CDF3(22570, 29107, 31092) },
+ { CDF3(18917, 26423, 29541) },
+ { CDF3(15940, 23649, 27754) },
+ { CDF3(12803, 20581, 25219) },
+ { CDF3(11082, 18695, 23376) },
+ { CDF3( 7939, 14373, 19005) },
+ },
+ },
+ },
+ }, [3] = {
+ .skip = {
+ {
+ { CDF1(26887) }, { CDF1( 6729) }, { CDF1(10361) },
+ { CDF1(17442) }, { CDF1(15045) }, { CDF1(22478) },
+ { CDF1(29072) }, { CDF1( 2713) }, { CDF1(11861) },
+ { CDF1(20773) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31903) }, { CDF1( 2044) }, { CDF1( 7528) },
+ { CDF1(14618) }, { CDF1(16182) }, { CDF1(24168) },
+ { CDF1(31037) }, { CDF1( 2786) }, { CDF1(11194) },
+ { CDF1(20155) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(32510) }, { CDF1( 8430) }, { CDF1(17318) },
+ { CDF1(24154) }, { CDF1(23674) }, { CDF1(28789) },
+ { CDF1(32139) }, { CDF1( 3440) }, { CDF1(13117) },
+ { CDF1(22702) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ }, {
+ { CDF1(31671) }, { CDF1( 2056) }, { CDF1(11746) },
+ { CDF1(16852) }, { CDF1(18635) }, { CDF1(24715) },
+ { CDF1(31484) }, { CDF1( 4656) }, { CDF1(16074) },
+ { CDF1(24704) }, { CDF1( 1806) }, { CDF1(14645) },
+ { CDF1(25336) },
+ }, {
+ { CDF1(31539) }, { CDF1( 8433) }, { CDF1(20576) },
+ { CDF1(27904) }, { CDF1(27852) }, { CDF1(30026) },
+ { CDF1(32441) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) },
+ },
+ }, .eob_bin_16 = {
+ {
+ { CDF4( 6708, 8958, 14746, 22133) },
+ { CDF4( 1222, 2074, 4783, 15410) },
+ }, {
+ { CDF4(19575, 21766, 26044, 29709) },
+ { CDF4( 7297, 10767, 19273, 28194) },
+ },
+ }, .eob_bin_32 = {
+ {
+ { CDF5( 4617, 5709, 8446, 13584, 23135) },
+ { CDF5( 1156, 1702, 3675, 9274, 20539) },
+ }, {
+ { CDF5(22086, 24282, 27010, 29770, 31743) },
+ { CDF5( 7699, 10897, 20891, 26926, 31628) },
+ },
+ }, .eob_bin_64 = {
+ {
+ { CDF6( 6307, 7541, 12060, 16358, 22553, 27865) },
+ { CDF6( 1289, 2320, 3971, 7926, 14153, 24291) },
+ }, {
+ { CDF6(24212, 25708, 28268, 30035, 31307, 32049) },
+ { CDF6( 8726, 12378, 19409, 26450, 30038, 32462) },
+ },
+ }, .eob_bin_128 = {
+ {
+ { CDF7( 3472, 4885, 7489, 12481, 18517, 24536, 29635) },
+ { CDF7( 886, 1731, 3271, 8469, 15569, 22126, 28383) },
+ }, {
+ { CDF7(24313, 26062, 28385, 30107, 31217, 31898, 32345) },
+ { CDF7( 9165, 13282, 21150, 30286, 31894, 32571, 32712) },
+ },
+ }, .eob_bin_256 = {
+ {
+ { CDF8( 5348, 7113, 11820, 15924,
+ 22106, 26777, 30334, 31757) },
+ { CDF8( 2453, 4474, 6307, 8777,
+ 16474, 22975, 29000, 31547) },
+ }, {
+ { CDF8(23110, 24597, 27140, 28894,
+ 30167, 30927, 31392, 32094) },
+ { CDF8( 9998, 17661, 25178, 28097,
+ 31308, 32038, 32403, 32695) },
+ },
+ }, .eob_bin_512 = {
+ { CDF9( 5927, 7809, 10923, 14597, 19439,
+ 24135, 28456, 31142, 32060) },
+ { CDF9(21093, 23043, 25742, 27658, 29097,
+ 29716, 30073, 30820, 31956) },
+ }, .eob_bin_1024 = {
+ { CDF10( 6698, 8334, 11961, 15762, 20186,
+ 23862, 27434, 29326, 31082, 32050) },
+ { CDF10(20569, 22426, 25569, 26859, 28053,
+ 28913, 29486, 29724, 29807, 32570) },
+ }, .eob_hi_bit = {
+ {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20177) },
+ { CDF1(20789) }, { CDF1(20262) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(21416) },
+ { CDF1(20855) }, { CDF1(23410) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20238) },
+ { CDF1(21057) }, { CDF1(19159) }, { CDF1(22337) },
+ { CDF1(20159) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(20125) },
+ { CDF1(20559) }, { CDF1(21707) }, { CDF1(22296) },
+ { CDF1(17333) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(19941) },
+ { CDF1(20527) }, { CDF1(21470) }, { CDF1(22487) },
+ { CDF1(19558) }, { CDF1(22354) }, { CDF1(20331) },
+ { CDF1(16384) }, { CDF1(16384) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(22752) },
+ { CDF1(25006) }, { CDF1(22075) }, { CDF1(21576) },
+ { CDF1(17740) }, { CDF1(21690) }, { CDF1(19211) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(21442) },
+ { CDF1(22358) }, { CDF1(18503) }, { CDF1(20291) },
+ { CDF1(19945) }, { CDF1(21294) }, { CDF1(21178) },
+ { CDF1(19400) }, { CDF1(10556) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(24648) },
+ { CDF1(24949) }, { CDF1(20708) }, { CDF1(23905) },
+ { CDF1(20501) }, { CDF1( 9558) }, { CDF1( 9423) },
+ { CDF1(30365) }, { CDF1(19253) },
+ },
+ }, {
+ {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(26064) },
+ { CDF1(22098) }, { CDF1(19613) }, { CDF1(20525) },
+ { CDF1(17595) }, { CDF1(16618) }, { CDF1(20497) },
+ { CDF1(18989) }, { CDF1(15513) },
+ }, {
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) },
+ { CDF1(16384) }, { CDF1(16384) },
+ },
+ },
+ }, .eob_base_tok = {
+ {
+ {
+ { CDF2(22497, 31198) }, { CDF2(31715, 32495) },
+ { CDF2(31606, 32337) }, { CDF2(30388, 31990) },
+ }, {
+ { CDF2(27877, 31584) }, { CDF2(32170, 32728) },
+ { CDF2(32155, 32688) }, { CDF2(32219, 32702) },
+ },
+ }, {
+ {
+ { CDF2(21457, 31043) }, { CDF2(31951, 32483) },
+ { CDF2(32153, 32562) }, { CDF2(31473, 32215) },
+ }, {
+ { CDF2(27558, 31151) }, { CDF2(32020, 32640) },
+ { CDF2(32097, 32575) }, { CDF2(32242, 32719) },
+ },
+ }, {
+ {
+ { CDF2(19980, 30591) }, { CDF2(32219, 32597) },
+ { CDF2(32581, 32706) }, { CDF2(31803, 32287) },
+ }, {
+ { CDF2(26473, 30507) }, { CDF2(32431, 32723) },
+ { CDF2(32196, 32611) }, { CDF2(31588, 32528) },
+ },
+ }, {
+ {
+ { CDF2(24647, 30463) }, { CDF2(32412, 32695) },
+ { CDF2(32468, 32720) }, { CDF2(31269, 32523) },
+ }, {
+ { CDF2(28482, 31505) }, { CDF2(32152, 32701) },
+ { CDF2(31732, 32598) }, { CDF2(31767, 32712) },
+ },
+ }, {
+ {
+ { CDF2(12358, 24977) }, { CDF2(31331, 32385) },
+ { CDF2(32634, 32756) }, { CDF2(30411, 32548) },
+ }, {
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ { CDF2(10923, 21845) }, { CDF2(10923, 21845) },
+ },
+ },
+ }, .base_tok = {
+ {
+ {
+ { CDF3( 7062, 16472, 22319) },
+ { CDF3(24538, 32261, 32674) },
+ { CDF3(13675, 28041, 31779) },
+ { CDF3( 8590, 20674, 27631) },
+ { CDF3( 5685, 14675, 22013) },
+ { CDF3( 3655, 9898, 15731) },
+ { CDF3(26493, 32418, 32658) },
+ { CDF3(16376, 29342, 32090) },
+ { CDF3(10594, 22649, 28970) },
+ { CDF3( 8176, 17170, 24303) },
+ { CDF3( 5605, 12694, 19139) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(23888, 31902, 32542) },
+ { CDF3(18612, 29687, 31987) },
+ { CDF3(16245, 24852, 29249) },
+ { CDF3(15765, 22608, 27559) },
+ { CDF3(19895, 24699, 27510) },
+ { CDF3(28401, 32212, 32457) },
+ { CDF3(15274, 27825, 30980) },
+ { CDF3( 9364, 18128, 24332) },
+ { CDF3( 2283, 8193, 15082) },
+ { CDF3( 1228, 3972, 7881) },
+ { CDF3(29455, 32469, 32620) },
+ { CDF3(17981, 28245, 31388) },
+ { CDF3(10921, 20098, 26240) },
+ { CDF3( 3743, 11829, 18657) },
+ { CDF3( 2374, 9593, 15715) },
+ { CDF3(31068, 32466, 32635) },
+ { CDF3(20321, 29572, 31971) },
+ { CDF3(10771, 20255, 27119) },
+ { CDF3( 2795, 10410, 17361) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 9320, 22102, 27840) },
+ { CDF3(27057, 32464, 32724) },
+ { CDF3(16331, 30268, 32309) },
+ { CDF3(10319, 23935, 29720) },
+ { CDF3( 6189, 16448, 24106) },
+ { CDF3( 3589, 10884, 18808) },
+ { CDF3(29026, 32624, 32748) },
+ { CDF3(19226, 31507, 32587) },
+ { CDF3(12692, 26921, 31203) },
+ { CDF3( 7049, 19532, 27635) },
+ { CDF3( 7727, 15669, 23252) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3(28056, 32625, 32748) },
+ { CDF3(22383, 32075, 32669) },
+ { CDF3(15417, 27098, 31749) },
+ { CDF3(18127, 26493, 27190) },
+ { CDF3( 5461, 16384, 21845) },
+ { CDF3(27982, 32091, 32584) },
+ { CDF3(19045, 29868, 31972) },
+ { CDF3(10397, 22266, 27932) },
+ { CDF3( 5990, 13697, 21500) },
+ { CDF3( 1792, 6912, 15104) },
+ { CDF3(28198, 32501, 32718) },
+ { CDF3(21534, 31521, 32569) },
+ { CDF3(11109, 25217, 30017) },
+ { CDF3( 5671, 15124, 26151) },
+ { CDF3( 4681, 14043, 18725) },
+ { CDF3(28688, 32580, 32741) },
+ { CDF3(22576, 32079, 32661) },
+ { CDF3(10627, 22141, 28340) },
+ { CDF3( 9362, 14043, 28087) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 7754, 16948, 22142) },
+ { CDF3(25670, 32330, 32691) },
+ { CDF3(15663, 29225, 31994) },
+ { CDF3( 9878, 23288, 29158) },
+ { CDF3( 6419, 17088, 24336) },
+ { CDF3( 3859, 11003, 17039) },
+ { CDF3(27562, 32595, 32725) },
+ { CDF3(17575, 30588, 32399) },
+ { CDF3(10819, 24838, 30309) },
+ { CDF3( 7124, 18686, 25916) },
+ { CDF3( 4479, 12688, 19340) },
+ { CDF3(28385, 32476, 32673) },
+ { CDF3(15306, 29005, 31938) },
+ { CDF3( 8937, 21615, 28322) },
+ { CDF3( 5982, 15603, 22786) },
+ { CDF3( 3620, 10267, 16136) },
+ { CDF3(27280, 32464, 32667) },
+ { CDF3(15607, 29160, 32004) },
+ { CDF3( 9091, 22135, 28740) },
+ { CDF3( 6232, 16632, 24020) },
+ { CDF3( 4047, 11377, 17672) },
+ { CDF3(29220, 32630, 32718) },
+ { CDF3(19650, 31220, 32462) },
+ { CDF3(13050, 26312, 30827) },
+ { CDF3( 9228, 20870, 27468) },
+ { CDF3( 6146, 15149, 21971) },
+ { CDF3(30169, 32481, 32623) },
+ { CDF3(17212, 29311, 31554) },
+ { CDF3( 9911, 21311, 26882) },
+ { CDF3( 4487, 13314, 20372) },
+ { CDF3( 2570, 7772, 12889) },
+ { CDF3(30924, 32613, 32708) },
+ { CDF3(19490, 30206, 32107) },
+ { CDF3(11232, 23998, 29276) },
+ { CDF3( 6769, 17955, 25035) },
+ { CDF3( 4398, 12623, 19214) },
+ { CDF3(30609, 32627, 32722) },
+ { CDF3(19370, 30582, 32287) },
+ { CDF3(10457, 23619, 29409) },
+ { CDF3( 6443, 17637, 24834) },
+ { CDF3( 4645, 13236, 20106) },
+ }, {
+ { CDF3( 8626, 20271, 26216) },
+ { CDF3(26707, 32406, 32711) },
+ { CDF3(16999, 30329, 32286) },
+ { CDF3(11445, 25123, 30286) },
+ { CDF3( 6411, 18828, 25601) },
+ { CDF3( 6801, 12458, 20248) },
+ { CDF3(29918, 32682, 32748) },
+ { CDF3(20649, 31739, 32618) },
+ { CDF3(12879, 27773, 31581) },
+ { CDF3( 7896, 21751, 28244) },
+ { CDF3( 5260, 14870, 23698) },
+ { CDF3(29252, 32593, 32731) },
+ { CDF3(17072, 30460, 32294) },
+ { CDF3(10653, 24143, 29365) },
+ { CDF3( 6536, 17490, 23983) },
+ { CDF3( 4929, 13170, 20085) },
+ { CDF3(28137, 32518, 32715) },
+ { CDF3(18171, 30784, 32407) },
+ { CDF3(11437, 25436, 30459) },
+ { CDF3( 7252, 18534, 26176) },
+ { CDF3( 4126, 13353, 20978) },
+ { CDF3(31162, 32726, 32748) },
+ { CDF3(23017, 32222, 32701) },
+ { CDF3(15629, 29233, 32046) },
+ { CDF3( 9387, 22621, 29480) },
+ { CDF3( 6922, 17616, 25010) },
+ { CDF3(28838, 32265, 32614) },
+ { CDF3(19701, 30206, 31920) },
+ { CDF3(11214, 22410, 27933) },
+ { CDF3( 5320, 14177, 23034) },
+ { CDF3( 5049, 12881, 17827) },
+ { CDF3(27484, 32471, 32734) },
+ { CDF3(21076, 31526, 32561) },
+ { CDF3(12707, 26303, 31211) },
+ { CDF3( 8169, 21722, 28219) },
+ { CDF3( 6045, 19406, 27042) },
+ { CDF3(27753, 32572, 32745) },
+ { CDF3(20832, 31878, 32653) },
+ { CDF3(13250, 27356, 31674) },
+ { CDF3( 7718, 21508, 29858) },
+ { CDF3( 7209, 18350, 25559) },
+ },
+ }, {
+ {
+ { CDF3( 7876, 16901, 21741) },
+ { CDF3(24001, 31898, 32625) },
+ { CDF3(14529, 27959, 31451) },
+ { CDF3( 8273, 20818, 27258) },
+ { CDF3( 5278, 14673, 21510) },
+ { CDF3( 2983, 8843, 14039) },
+ { CDF3(28016, 32574, 32732) },
+ { CDF3(17471, 30306, 32301) },
+ { CDF3(10224, 24063, 29728) },
+ { CDF3( 6602, 17954, 25052) },
+ { CDF3( 4002, 11585, 17759) },
+ { CDF3(30190, 32634, 32739) },
+ { CDF3(17497, 30282, 32270) },
+ { CDF3(10229, 23729, 29538) },
+ { CDF3( 6344, 17211, 24440) },
+ { CDF3( 3849, 11189, 17108) },
+ { CDF3(28570, 32583, 32726) },
+ { CDF3(17521, 30161, 32238) },
+ { CDF3(10153, 23565, 29378) },
+ { CDF3( 6455, 17341, 24443) },
+ { CDF3( 3907, 11042, 17024) },
+ { CDF3(30689, 32715, 32748) },
+ { CDF3(21546, 31840, 32610) },
+ { CDF3(13547, 27581, 31459) },
+ { CDF3( 8912, 21757, 28309) },
+ { CDF3( 5548, 15080, 22046) },
+ { CDF3(30783, 32540, 32685) },
+ { CDF3(17540, 29528, 31668) },
+ { CDF3(10160, 21468, 26783) },
+ { CDF3( 4724, 13393, 20054) },
+ { CDF3( 2702, 8174, 13102) },
+ { CDF3(31648, 32686, 32742) },
+ { CDF3(20954, 31094, 32337) },
+ { CDF3(12420, 25698, 30179) },
+ { CDF3( 7304, 19320, 26248) },
+ { CDF3( 4366, 12261, 18864) },
+ { CDF3(31581, 32723, 32748) },
+ { CDF3(21373, 31586, 32525) },
+ { CDF3(12744, 26625, 30885) },
+ { CDF3( 7431, 20322, 26950) },
+ { CDF3( 4692, 13323, 20111) },
+ }, {
+ { CDF3( 7833, 18369, 24095) },
+ { CDF3(26650, 32273, 32702) },
+ { CDF3(16371, 29961, 32191) },
+ { CDF3(11055, 24082, 29629) },
+ { CDF3( 6892, 18644, 25400) },
+ { CDF3( 5006, 13057, 19240) },
+ { CDF3(29834, 32666, 32748) },
+ { CDF3(19577, 31335, 32570) },
+ { CDF3(12253, 26509, 31122) },
+ { CDF3( 7991, 20772, 27711) },
+ { CDF3( 5677, 15910, 23059) },
+ { CDF3(30109, 32532, 32720) },
+ { CDF3(16747, 30166, 32252) },
+ { CDF3(10134, 23542, 29184) },
+ { CDF3( 5791, 16176, 23556) },
+ { CDF3( 4362, 10414, 17284) },
+ { CDF3(29492, 32626, 32748) },
+ { CDF3(19894, 31402, 32525) },
+ { CDF3(12942, 27071, 30869) },
+ { CDF3( 8346, 21216, 27405) },
+ { CDF3( 6572, 17087, 23859) },
+ { CDF3(32035, 32735, 32748) },
+ { CDF3(22957, 31838, 32618) },
+ { CDF3(14724, 28572, 31772) },
+ { CDF3(10364, 23999, 29553) },
+ { CDF3( 7004, 18433, 25655) },
+ { CDF3(27528, 32277, 32681) },
+ { CDF3(16959, 31171, 32096) },
+ { CDF3(10486, 23593, 27962) },
+ { CDF3( 8192, 16384, 23211) },
+ { CDF3( 8937, 17873, 20852) },
+ { CDF3(27715, 32002, 32615) },
+ { CDF3(15073, 29491, 31676) },
+ { CDF3(11264, 24576, 28672) },
+ { CDF3( 2341, 18725, 23406) },
+ { CDF3( 7282, 18204, 25486) },
+ { CDF3(28547, 32213, 32657) },
+ { CDF3(20788, 29773, 32239) },
+ { CDF3( 6780, 21469, 30508) },
+ { CDF3( 5958, 14895, 23831) },
+ { CDF3(16384, 21845, 27307) },
+ },
+ }, {
+ {
+ { CDF3( 5992, 14304, 19765) },
+ { CDF3(22612, 31238, 32456) },
+ { CDF3(13456, 27162, 31087) },
+ { CDF3( 8001, 20062, 26504) },
+ { CDF3( 5168, 14105, 20764) },
+ { CDF3( 2632, 7771, 12385) },
+ { CDF3(27034, 32344, 32709) },
+ { CDF3(15850, 29415, 31997) },
+ { CDF3( 9494, 22776, 28841) },
+ { CDF3( 6151, 16830, 23969) },
+ { CDF3( 3461, 10039, 15722) },
+ { CDF3(30134, 32569, 32731) },
+ { CDF3(15638, 29422, 31945) },
+ { CDF3( 9150, 21865, 28218) },
+ { CDF3( 5647, 15719, 22676) },
+ { CDF3( 3402, 9772, 15477) },
+ { CDF3(28530, 32586, 32735) },
+ { CDF3(17139, 30298, 32292) },
+ { CDF3(10200, 24039, 29685) },
+ { CDF3( 6419, 17674, 24786) },
+ { CDF3( 3544, 10225, 15824) },
+ { CDF3(31333, 32726, 32748) },
+ { CDF3(20618, 31487, 32544) },
+ { CDF3(12901, 27217, 31232) },
+ { CDF3( 8624, 21734, 28171) },
+ { CDF3( 5104, 14191, 20748) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3(11206, 21090, 26561) },
+ { CDF3(28759, 32279, 32671) },
+ { CDF3(14171, 27952, 31569) },
+ { CDF3( 9743, 22907, 29141) },
+ { CDF3( 6871, 17886, 24868) },
+ { CDF3( 4960, 13152, 19315) },
+ { CDF3(31077, 32661, 32748) },
+ { CDF3(19400, 31195, 32515) },
+ { CDF3(12752, 26858, 31040) },
+ { CDF3( 8370, 22098, 28591) },
+ { CDF3( 5457, 15373, 22298) },
+ { CDF3(31697, 32706, 32748) },
+ { CDF3(17860, 30657, 32333) },
+ { CDF3(12510, 24812, 29261) },
+ { CDF3( 6180, 19124, 24722) },
+ { CDF3( 5041, 13548, 17959) },
+ { CDF3(31552, 32716, 32748) },
+ { CDF3(21908, 31769, 32623) },
+ { CDF3(14470, 28201, 31565) },
+ { CDF3( 9493, 22982, 28608) },
+ { CDF3( 6858, 17240, 24137) },
+ { CDF3(32543, 32752, 32756) },
+ { CDF3(24286, 32097, 32666) },
+ { CDF3(15958, 29217, 32024) },
+ { CDF3(10207, 24234, 29958) },
+ { CDF3( 6929, 18305, 25652) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ }, {
+ {
+ { CDF3( 4137, 10847, 15682) },
+ { CDF3(17824, 27001, 30058) },
+ { CDF3(10204, 22796, 28291) },
+ { CDF3( 6076, 15935, 22125) },
+ { CDF3( 3852, 10937, 16816) },
+ { CDF3( 2252, 6324, 10131) },
+ { CDF3(25840, 32016, 32662) },
+ { CDF3(15109, 28268, 31531) },
+ { CDF3( 9385, 22231, 28340) },
+ { CDF3( 6082, 16672, 23479) },
+ { CDF3( 3318, 9427, 14681) },
+ { CDF3(30594, 32574, 32718) },
+ { CDF3(16836, 29552, 31859) },
+ { CDF3( 9556, 22542, 28356) },
+ { CDF3( 6305, 16725, 23540) },
+ { CDF3( 3376, 9895, 15184) },
+ { CDF3(29383, 32617, 32745) },
+ { CDF3(18891, 30809, 32401) },
+ { CDF3(11688, 25942, 30687) },
+ { CDF3( 7468, 19469, 26651) },
+ { CDF3( 3909, 11358, 17012) },
+ { CDF3(31564, 32736, 32748) },
+ { CDF3(20906, 31611, 32600) },
+ { CDF3(13191, 27621, 31537) },
+ { CDF3( 8768, 22029, 28676) },
+ { CDF3( 5079, 14109, 20906) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ }, {
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ { CDF3( 8192, 16384, 24576) },
+ },
+ },
+ }, .dc_sign = {
+ { { CDF1(16000) }, { CDF1(13056) }, { CDF1(18816) } },
+ { { CDF1(15232) }, { CDF1(12928) }, { CDF1(17280) } },
+ }, .br_tok = {
+ {
+ {
+ { CDF3(18315, 24289, 27551) },
+ { CDF3(16854, 24068, 27835) },
+ { CDF3(10140, 17927, 23173) },
+ { CDF3( 6722, 12982, 18267) },
+ { CDF3( 4661, 9826, 14706) },
+ { CDF3( 3832, 8165, 12294) },
+ { CDF3( 2795, 6098, 9245) },
+ { CDF3(17145, 23326, 26672) },
+ { CDF3(20733, 27680, 30308) },
+ { CDF3(16032, 24461, 28546) },
+ { CDF3(11653, 20093, 25081) },
+ { CDF3( 9290, 16429, 22086) },
+ { CDF3( 7796, 14598, 19982) },
+ { CDF3( 6502, 12378, 17441) },
+ { CDF3(21681, 27732, 30320) },
+ { CDF3(22389, 29044, 31261) },
+ { CDF3(19027, 26731, 30087) },
+ { CDF3(14739, 23755, 28624) },
+ { CDF3(11358, 20778, 25511) },
+ { CDF3(10995, 18073, 24190) },
+ { CDF3( 9162, 14990, 20617) },
+ }, {
+ { CDF3(21425, 27952, 30388) },
+ { CDF3(18062, 25838, 29034) },
+ { CDF3(11956, 19881, 24808) },
+ { CDF3( 7718, 15000, 20980) },
+ { CDF3( 5702, 11254, 16143) },
+ { CDF3( 4898, 9088, 16864) },
+ { CDF3( 3679, 6776, 11907) },
+ { CDF3(23294, 30160, 31663) },
+ { CDF3(24397, 29896, 31836) },
+ { CDF3(19245, 27128, 30593) },
+ { CDF3(13202, 19825, 26404) },
+ { CDF3(11578, 19297, 23957) },
+ { CDF3( 8073, 13297, 21370) },
+ { CDF3( 5461, 10923, 19745) },
+ { CDF3(27367, 30521, 31934) },
+ { CDF3(24904, 30671, 31940) },
+ { CDF3(23075, 28460, 31299) },
+ { CDF3(14400, 23658, 30417) },
+ { CDF3(13885, 23882, 28325) },
+ { CDF3(14746, 22938, 27853) },
+ { CDF3( 5461, 16384, 27307) },
+ },
+ }, {
+ {
+ { CDF3(18274, 24813, 27890) },
+ { CDF3(15537, 23149, 27003) },
+ { CDF3( 9449, 16740, 21827) },
+ { CDF3( 6700, 12498, 17261) },
+ { CDF3( 4988, 9866, 14198) },
+ { CDF3( 4236, 8147, 11902) },
+ { CDF3( 2867, 5860, 8654) },
+ { CDF3(17124, 23171, 26101) },
+ { CDF3(20396, 27477, 30148) },
+ { CDF3(16573, 24629, 28492) },
+ { CDF3(12749, 20846, 25674) },
+ { CDF3(10233, 17878, 22818) },
+ { CDF3( 8525, 15332, 20363) },
+ { CDF3( 6283, 11632, 16255) },
+ { CDF3(20466, 26511, 29286) },
+ { CDF3(23059, 29174, 31191) },
+ { CDF3(19481, 27263, 30241) },
+ { CDF3(15458, 23631, 28137) },
+ { CDF3(12416, 20608, 25693) },
+ { CDF3(10261, 18011, 23261) },
+ { CDF3( 8016, 14655, 19666) },
+ }, {
+ { CDF3(17616, 24586, 28112) },
+ { CDF3(15809, 23299, 27155) },
+ { CDF3(10767, 18890, 23793) },
+ { CDF3( 7727, 14255, 18865) },
+ { CDF3( 6129, 11926, 16882) },
+ { CDF3( 4482, 9704, 14861) },
+ { CDF3( 3277, 7452, 11522) },
+ { CDF3(22956, 28551, 30730) },
+ { CDF3(22724, 28937, 30961) },
+ { CDF3(18467, 26324, 29580) },
+ { CDF3(13234, 20713, 25649) },
+ { CDF3(11181, 17592, 22481) },
+ { CDF3( 8291, 18358, 24576) },
+ { CDF3( 7568, 11881, 14984) },
+ { CDF3(24948, 29001, 31147) },
+ { CDF3(25674, 30619, 32151) },
+ { CDF3(20841, 26793, 29603) },
+ { CDF3(14669, 24356, 28666) },
+ { CDF3(11334, 23593, 28219) },
+ { CDF3( 8922, 14762, 22873) },
+ { CDF3( 8301, 13544, 20535) },
+ },
+ }, {
+ {
+ { CDF3(17113, 23733, 27081) },
+ { CDF3(14139, 21406, 25452) },
+ { CDF3( 8552, 15002, 19776) },
+ { CDF3( 5871, 11120, 15378) },
+ { CDF3( 4455, 8616, 12253) },
+ { CDF3( 3469, 6910, 10386) },
+ { CDF3( 2255, 4553, 6782) },
+ { CDF3(18224, 24376, 27053) },
+ { CDF3(19290, 26710, 29614) },
+ { CDF3(14936, 22991, 27184) },
+ { CDF3(11238, 18951, 23762) },
+ { CDF3( 8786, 15617, 20588) },
+ { CDF3( 7317, 13228, 18003) },
+ { CDF3( 5101, 9512, 13493) },
+ { CDF3(22639, 28222, 30210) },
+ { CDF3(23216, 29331, 31307) },
+ { CDF3(19075, 26762, 29895) },
+ { CDF3(15014, 23113, 27457) },
+ { CDF3(11938, 19857, 24752) },
+ { CDF3( 9942, 17280, 22282) },
+ { CDF3( 7167, 13144, 17752) },
+ }, {
+ { CDF3(15820, 22738, 26488) },
+ { CDF3(13530, 20885, 25216) },
+ { CDF3( 8395, 15530, 20452) },
+ { CDF3( 6574, 12321, 16380) },
+ { CDF3( 5353, 10419, 14568) },
+ { CDF3( 4613, 8446, 12381) },
+ { CDF3( 3440, 7158, 9903) },
+ { CDF3(24247, 29051, 31224) },
+ { CDF3(22118, 28058, 30369) },
+ { CDF3(16498, 24768, 28389) },
+ { CDF3(12920, 21175, 26137) },
+ { CDF3(10730, 18619, 25352) },
+ { CDF3(10187, 16279, 22791) },
+ { CDF3( 9310, 14631, 22127) },
+ { CDF3(24970, 30558, 32057) },
+ { CDF3(24801, 29942, 31698) },
+ { CDF3(22432, 28453, 30855) },
+ { CDF3(19054, 25680, 29580) },
+ { CDF3(14392, 23036, 28109) },
+ { CDF3(12495, 20947, 26650) },
+ { CDF3(12442, 20326, 26214) },
+ },
+ }, {
+ {
+ { CDF3(12162, 18785, 22648) },
+ { CDF3(12749, 19697, 23806) },
+ { CDF3( 8580, 15297, 20346) },
+ { CDF3( 6169, 11749, 16543) },
+ { CDF3( 4836, 9391, 13448) },
+ { CDF3( 3821, 7711, 11613) },
+ { CDF3( 2228, 4601, 7070) },
+ { CDF3(16319, 24725, 28280) },
+ { CDF3(15698, 23277, 27168) },
+ { CDF3(12726, 20368, 25047) },
+ { CDF3( 9912, 17015, 21976) },
+ { CDF3( 7888, 14220, 19179) },
+ { CDF3( 6777, 12284, 17018) },
+ { CDF3( 4492, 8590, 12252) },
+ { CDF3(23249, 28904, 30947) },
+ { CDF3(21050, 27908, 30512) },
+ { CDF3(17440, 25340, 28949) },
+ { CDF3(14059, 22018, 26541) },
+ { CDF3(11288, 18903, 23898) },
+ { CDF3( 9411, 16342, 21428) },
+ { CDF3( 6278, 11588, 15944) },
+ }, {
+ { CDF3(13981, 20067, 23226) },
+ { CDF3(16922, 23580, 26783) },
+ { CDF3(11005, 19039, 24487) },
+ { CDF3( 7389, 14218, 19798) },
+ { CDF3( 5598, 11505, 17206) },
+ { CDF3( 6090, 11213, 15659) },
+ { CDF3( 3820, 7371, 10119) },
+ { CDF3(21082, 26925, 29675) },
+ { CDF3(21262, 28627, 31128) },
+ { CDF3(18392, 26454, 30437) },
+ { CDF3(14870, 22910, 27096) },
+ { CDF3(12620, 19484, 24908) },
+ { CDF3( 9290, 16553, 22802) },
+ { CDF3( 6668, 14288, 20004) },
+ { CDF3(27704, 31055, 31949) },
+ { CDF3(24709, 29978, 31788) },
+ { CDF3(21668, 29264, 31657) },
+ { CDF3(18295, 26968, 30074) },
+ { CDF3(16399, 24422, 29313) },
+ { CDF3(14347, 23026, 28104) },
+ { CDF3(12370, 19806, 24477) },
+ },
+ },
+ },
+ }
+};
+
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr,
+ CdfContext *const dst,
+ const CdfContext *const src)
+{
+#define update_cdf_1d(n1d, name) \
+ do { \
+ memcpy(dst->name, src->name, sizeof(dst->name)); \
+ dst->name[n1d] = 0; \
+ } while (0)
+
+#define update_cdf_2d(n1d, n2d, name) \
+ for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j])
+#define update_cdf_3d(n1d, n2d, n3d, name) \
+ for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k])
+#define update_cdf_4d(n1d, n2d, n3d, n4d, name) \
+ for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l])
+
+#define update_bit_0d(name) \
+ do { \
+ dst->name[0] = src->name[0]; \
+ dst->name[1] = 0; \
+ } while (0)
+
+#define update_bit_1d(n1d, name) \
+ for (int i = 0; i < (n1d); i++) update_bit_0d(name[i])
+#define update_bit_2d(n1d, n2d, name) \
+ for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j])
+#define update_bit_3d(n1d, n2d, n3d, name) \
+ for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k])
+
+ update_bit_1d(N_BS_SIZES, m.use_filter_intra);
+ update_cdf_1d(4, m.filter_intra);
+ update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode);
+ update_cdf_2d(8, 6, m.angle_delta);
+ update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz);
+ update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1);
+ update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2);
+ update_bit_1d(3, m.skip);
+ update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition);
+ update_bit_2d(N_TX_SIZES, 13, coef.skip);
+ update_cdf_3d(2, 2, 4, coef.eob_bin_16);
+ update_cdf_3d(2, 2, 5, coef.eob_bin_32);
+ update_cdf_3d(2, 2, 6, coef.eob_bin_64);
+ update_cdf_3d(2, 2, 7, coef.eob_bin_128);
+ update_cdf_3d(2, 2, 8, coef.eob_bin_256);
+ update_cdf_2d(2, 9, coef.eob_bin_512);
+ update_cdf_2d(2, 10, coef.eob_bin_1024);
+ update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit);
+ update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok);
+ update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok);
+ update_bit_2d(2, 3, coef.dc_sign);
+ update_cdf_4d(4, 2, 21, 3, coef.br_tok);
+ update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id);
+ update_cdf_1d(7, m.cfl_sign);
+ update_cdf_2d(6, 15, m.cfl_alpha);
+ update_bit_0d(m.restore_wiener);
+ update_bit_0d(m.restore_sgrproj);
+ update_cdf_1d(2, m.restore_switchable);
+ update_cdf_1d(3, m.delta_q);
+ update_cdf_2d(5, 3, m.delta_lf);
+ update_bit_2d(7, 3, m.pal_y);
+ update_bit_1d(2, m.pal_uv);
+ update_cdf_3d(2, 7, 6, m.pal_sz);
+ update_cdf_4d(2, 7, 5, k + 1, m.color_map);
+ update_bit_2d(7, 3, m.txpart);
+ update_cdf_2d(2, 15, m.txtp_inter1);
+ update_cdf_1d(11, m.txtp_inter2);
+ update_bit_1d(4, m.txtp_inter3);
+
+ if (IS_KEY_OR_INTRA(hdr)) {
+ update_bit_0d(m.intrabc);
+
+ update_cdf_1d(N_MV_JOINTS - 1, dmv.joint);
+ for (int k = 0; k < 2; k++) {
+ update_cdf_1d(10, dmv.comp[k].classes);
+ update_bit_0d(dmv.comp[k].class0);
+ update_bit_1d(10, dmv.comp[k].classN);
+ update_bit_0d(dmv.comp[k].sign);
+ }
+ return;
+ }
+
+ update_bit_1d(3, m.skip_mode);
+ update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode);
+ update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter);
+ update_bit_1d(6, m.newmv_mode);
+ update_bit_1d(2, m.globalmv_mode);
+ update_bit_1d(6, m.refmv_mode);
+ update_bit_1d(3, m.drl_bit);
+ update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode);
+ update_bit_1d(4, m.intra);
+ update_bit_1d(5, m.comp);
+ update_bit_1d(5, m.comp_dir);
+ update_bit_1d(6, m.jnt_comp);
+ update_bit_1d(6, m.mask_comp);
+ update_bit_1d(9, m.wedge_comp);
+ update_cdf_2d(9, 15, m.wedge_idx);
+ update_bit_2d(6, 3, m.ref);
+ update_bit_2d(3, 3, m.comp_fwd_ref);
+ update_bit_2d(2, 3, m.comp_bwd_ref);
+ update_bit_2d(3, 3, m.comp_uni_ref);
+ update_bit_1d(3, m.seg_pred);
+ update_bit_1d(4, m.interintra);
+ update_bit_1d(7, m.interintra_wedge);
+ update_cdf_2d(4, 3, m.interintra_mode);
+ update_cdf_2d(N_BS_SIZES, 2, m.motion_mode);
+ update_bit_1d(N_BS_SIZES, m.obmc);
+
+ update_cdf_1d(N_MV_JOINTS - 1, mv.joint);
+ for (int k = 0; k < 2; k++) {
+ update_cdf_1d(10, mv.comp[k].classes);
+ update_bit_0d(mv.comp[k].class0);
+ update_bit_1d(10, mv.comp[k].classN);
+ update_cdf_2d(2, 3, mv.comp[k].class0_fp);
+ update_cdf_1d(3, mv.comp[k].classN_fp);
+ update_bit_0d(mv.comp[k].class0_hp);
+ update_bit_0d(mv.comp[k].classN_hp);
+ update_bit_0d(mv.comp[k].sign);
+ }
+}
+
+/*
+ * CDF threading wrappers.
+ */
+static inline int get_qcat_idx(const int q) {
+ if (q <= 20) return 0;
+ if (q <= 60) return 1;
+ if (q <= 120) return 2;
+ return 3;
+}
+
+void dav1d_cdf_thread_init_static(CdfThreadContext *const cdf, const int qidx) {
+ cdf->ref = NULL;
+ cdf->data.qcat = get_qcat_idx(qidx);
+}
+
+void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) {
+ if (src->ref) {
+ memcpy(dst, src->data.cdf, sizeof(*dst));
+ } else {
+ dst->m = av1_default_cdf;
+ memcpy(dst->kfym, default_kf_y_mode_cdf, sizeof(default_kf_y_mode_cdf));
+ dst->coef = av1_default_coef_cdf[src->data.qcat];
+ memcpy(dst->mv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+ memcpy(dst->dmv.joint, default_mv_joint_cdf, sizeof(default_mv_joint_cdf));
+ dst->mv.comp[0] = dst->mv.comp[1] = dst->dmv.comp[0] = dst->dmv.comp[1] =
+ default_mv_component_cdf;
+ }
+}
+
+int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf,
+ const int have_frame_mt)
+{
+ cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool,
+ sizeof(CdfContext) + sizeof(atomic_uint));
+ if (!cdf->ref) return DAV1D_ERR(ENOMEM);
+ cdf->data.cdf = cdf->ref->data;
+ if (have_frame_mt) {
+ cdf->progress = (atomic_uint *) &cdf->data.cdf[1];
+ atomic_init(cdf->progress, 0);
+ }
+ return 0;
+}
+
+void dav1d_cdf_thread_ref(CdfThreadContext *const dst,
+ CdfThreadContext *const src)
+{
+ *dst = *src;
+ if (src->ref)
+ dav1d_ref_inc(src->ref);
+}
+
+void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
+ memset(&cdf->data, 0, sizeof(*cdf) - offsetof(CdfThreadContext, data));
+ dav1d_ref_dec(&cdf->ref);
+}
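
Taken together, the update_cdf_* macros in dav1d_cdf_thread_update() above reduce to one per-array operation: copy the adapted probability entries from the source context and clear the trailing element, which the adaptive msac decoder uses as an adaptation counter, so the refreshed context restarts its adaptation from a fresh count. The update_bit_* variants do the same for single-boolean CDFs (probability at [0], counter at [1]). Below is a minimal standalone sketch of that per-array step; the helper name is invented for illustration, and the real macros simply memcpy() the whole padded array before zeroing the counter slot.

#include <stdint.h>
#include <string.h>

/* Sketch of what one update_cdf_1d(n, name) expansion amounts to:
 * `src`/`dst` hold n stored probability entries followed by an
 * adaptation counter at index n. */
static void cdf_copy_and_reset_count(uint16_t *const dst,
                                     const uint16_t *const src,
                                     const int n)
{
    memcpy(dst, src, (n + 1) * sizeof(*dst)); /* n probs + counter slot */
    dst[n] = 0; /* reset so adaptation restarts from a fresh count */
}
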
diff --git a/third_party/dav1d/src/cdf.h b/third_party/dav1d/src/cdf.h
new file mode 100644
index 0000000000..4b30474baa
--- /dev/null
+++ b/third_party/dav1d/src/cdf.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CDF_H
+#define DAV1D_SRC_CDF_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+#include "src/ref.h"
+#include "src/thread_data.h"
+
+/* Buffers padded to [8] or [16] for SIMD where needed. */
+
+typedef struct CdfModeContext {
+ ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
+ ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
+ ALIGN(uint16_t wedge_idx[9][16], 32);
+ ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
+ ALIGN(uint16_t cfl_alpha[6][16], 32);
+ ALIGN(uint16_t txtp_inter1[2][16], 32);
+ ALIGN(uint16_t txtp_inter2[12 + 4], 32);
+ ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16);
+ ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16);
+ ALIGN(uint16_t cfl_sign[8], 16);
+ ALIGN(uint16_t angle_delta[8][8], 16);
+ ALIGN(uint16_t filter_intra[5 + 3], 16);
+ ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
+ ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
+ ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
+ ALIGN(uint16_t color_map[2][7][5][8], 16);
+ ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
+ ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
+ ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
+ ALIGN(uint16_t delta_q[4], 8);
+ ALIGN(uint16_t delta_lf[5][4], 8);
+ ALIGN(uint16_t interintra_mode[4][4], 8);
+ ALIGN(uint16_t restore_switchable[3 + 1], 8);
+ ALIGN(uint16_t restore_wiener[2], 4);
+ ALIGN(uint16_t restore_sgrproj[2], 4);
+ ALIGN(uint16_t interintra[7][2], 4);
+ ALIGN(uint16_t interintra_wedge[7][2], 4);
+ ALIGN(uint16_t txtp_inter3[4][2], 4);
+ ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
+ ALIGN(uint16_t newmv_mode[6][2], 4);
+ ALIGN(uint16_t globalmv_mode[2][2], 4);
+ ALIGN(uint16_t refmv_mode[6][2], 4);
+ ALIGN(uint16_t drl_bit[3][2], 4);
+ ALIGN(uint16_t intra[4][2], 4);
+ ALIGN(uint16_t comp[5][2], 4);
+ ALIGN(uint16_t comp_dir[5][2], 4);
+ ALIGN(uint16_t jnt_comp[6][2], 4);
+ ALIGN(uint16_t mask_comp[6][2], 4);
+ ALIGN(uint16_t wedge_comp[9][2], 4);
+ ALIGN(uint16_t ref[6][3][2], 4);
+ ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
+ ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
+ ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
+ ALIGN(uint16_t txpart[7][3][2], 4);
+ ALIGN(uint16_t skip[3][2], 4);
+ ALIGN(uint16_t skip_mode[3][2], 4);
+ ALIGN(uint16_t seg_pred[3][2], 4);
+ ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
+ ALIGN(uint16_t pal_y[7][3][2], 4);
+ ALIGN(uint16_t pal_uv[2][2], 4);
+ ALIGN(uint16_t intrabc[2], 4);
+} CdfModeContext;
+
+typedef struct CdfCoefContext {
+ ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16);
+ ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16);
+ ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16);
+ ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16);
+ ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32);
+ ALIGN(uint16_t eob_bin_512[2][10 + 6], 32);
+ ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32);
+ ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8);
+ ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8);
+ ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8);
+ ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4);
+ ALIGN(uint16_t skip[N_TX_SIZES][13][2], 4);
+ ALIGN(uint16_t dc_sign[2][3][2], 4);
+} CdfCoefContext;
+
+typedef struct CdfMvComponent {
+ ALIGN(uint16_t classes[11 + 5], 32);
+ ALIGN(uint16_t class0_fp[2][4], 8);
+ ALIGN(uint16_t classN_fp[4], 8);
+ ALIGN(uint16_t class0_hp[2], 4);
+ ALIGN(uint16_t classN_hp[2], 4);
+ ALIGN(uint16_t class0[2], 4);
+ ALIGN(uint16_t classN[10][2], 4);
+ ALIGN(uint16_t sign[2], 4);
+} CdfMvComponent;
+
+typedef struct CdfMvContext {
+ CdfMvComponent comp[2];
+ ALIGN(uint16_t joint[N_MV_JOINTS], 8);
+} CdfMvContext;
+
+typedef struct CdfContext {
+ CdfModeContext m;
+ ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
+ CdfCoefContext coef;
+ CdfMvContext mv, dmv;
+} CdfContext;
+
+typedef struct CdfThreadContext {
+ Dav1dRef *ref; ///< allocation origin
+ union {
+ CdfContext *cdf; // if ref != NULL
+ unsigned qcat; // if ref == NULL, from static CDF tables
+ } data;
+ atomic_uint *progress;
+} CdfThreadContext;
+
+void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
+int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
+ const int have_frame_mt);
+void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
+void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
+void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
+void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst,
+ const CdfContext *src);
+
+#endif /* DAV1D_SRC_CDF_H */
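
For orientation, here is a hedged sketch of how the entry points declared above fit together over a frame's lifetime. It is illustrative rather than the decoder's actual frame-setup code, and the function and variable names (frame_cdf_sketch, use_defaults, and so on) are invented for the example; only the dav1d_cdf_thread_* calls and their signatures come from this header.

#include "src/cdf.h"

/* Seed a frame's working CDF state either from the static default tables
 * or from a previously refreshed context, then publish the adapted state
 * for later frames. Error handling and threading details are omitted. */
static int frame_cdf_sketch(Dav1dContext *const c,
                            const Dav1dFrameHeader *const hdr,
                            CdfThreadContext *const in_cdf,
                            CdfThreadContext *const out_cdf,
                            CdfContext *const frame_cdf,
                            const int base_qidx, const int use_defaults)
{
    if (use_defaults) /* e.g. a key frame with no usable forward context */
        dav1d_cdf_thread_init_static(in_cdf, base_qidx);

    /* Working copy that the tile decoders adapt via the msac layer. */
    dav1d_cdf_thread_copy(frame_cdf, in_cdf);

    /* Shareable, refcounted home for the end-of-frame probabilities. */
    const int ret = dav1d_cdf_thread_alloc(c, out_cdf, 0 /* no frame-mt */);
    if (ret < 0) return ret;

    /* ... decode tiles, updating *frame_cdf ... */

    /* Copy the adapted CDFs and reset their adaptation counters. */
    dav1d_cdf_thread_update(hdr, out_cdf->data.cdf, frame_cdf);
    return 0;
}

A consumer that took a reference with dav1d_cdf_thread_ref() would later drop it with dav1d_cdf_thread_unref(), which also releases the underlying Dav1dRef.
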
diff --git a/third_party/dav1d/src/cpu.c b/third_party/dav1d/src/cpu.c
new file mode 100644
index 0000000000..d24148c352
--- /dev/null
+++ b/third_party/dav1d/src/cpu.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "config.h"
+
+#include <stdint.h>
+
+#include "src/cpu.h"
+#include "src/log.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <pthread.h>
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#if defined(__FreeBSD__)
+#define cpu_set_t cpuset_t
+#endif
+
+unsigned dav1d_cpu_flags = 0U;
+unsigned dav1d_cpu_flags_mask = ~0U;
+
+COLD void dav1d_init_cpu(void) {
+#if HAVE_ASM && !__has_feature(memory_sanitizer)
+// memory sanitizer is inherently incompatible with asm
+#if ARCH_AARCH64 || ARCH_ARM
+ dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
+#elif ARCH_PPC64LE
+ dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
+#elif ARCH_X86
+ dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
+#endif
+#endif
+}
+
+COLD void dav1d_set_cpu_flags_mask(const unsigned mask) {
+ dav1d_cpu_flags_mask = mask;
+}
+
+COLD int dav1d_num_logical_processors(Dav1dContext *const c) {
+#ifdef _WIN32
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+ GROUP_AFFINITY affinity;
+ if (GetThreadGroupAffinity(GetCurrentThread(), &affinity)) {
+ int num_processors = 1;
+ while (affinity.Mask &= affinity.Mask - 1)
+ num_processors++;
+ return num_processors;
+ }
+#else
+ SYSTEM_INFO system_info;
+ GetNativeSystemInfo(&system_info);
+ return system_info.dwNumberOfProcessors;
+#endif
+#elif defined(HAVE_PTHREAD_GETAFFINITY_NP) && defined(CPU_COUNT)
+ cpu_set_t affinity;
+ if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity))
+ return CPU_COUNT(&affinity);
+#elif defined(__APPLE__)
+ int num_processors;
+ size_t length = sizeof(num_processors);
+ if (!sysctlbyname("hw.logicalcpu", &num_processors, &length, NULL, 0))
+ return num_processors;
+#elif defined(_SC_NPROCESSORS_ONLN)
+ return (int)sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+ if (c)
+ dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n");
+ return 1;
+}
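
A small usage sketch for the functions defined here (dav1d_get_cpu_flags() comes from cpu.h, which follows); the wrapper name and the force_c_code switch are illustrative only, not part of the library.

#include "src/cpu.h"

/* Detect CPU features once at startup, optionally mask them out
 * (useful for exercising the C fallbacks), and derive a worker count
 * from the affinity / logical-CPU queries above. */
static int pick_worker_threads(Dav1dContext *const c, const int force_c_code)
{
    dav1d_init_cpu();                 /* fills dav1d_cpu_flags */
    if (force_c_code)
        dav1d_set_cpu_flags_mask(0);  /* hide every SIMD flag */

    const unsigned flags = dav1d_get_cpu_flags();
    (void) flags;                     /* normally used to gate DSP init */

    /* Returns 1 (and logs through c, if non-NULL) when detection fails. */
    return dav1d_num_logical_processors(c);
}
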
diff --git a/third_party/dav1d/src/cpu.h b/third_party/dav1d/src/cpu.h
new file mode 100644
index 0000000000..8f70fefe54
--- /dev/null
+++ b/third_party/dav1d/src/cpu.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CPU_H
+#define DAV1D_SRC_CPU_H
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "dav1d/common.h"
+#include "dav1d/dav1d.h"
+
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/cpu.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/cpu.h"
+#elif ARCH_X86
+#include "src/x86/cpu.h"
+#endif
+
+EXTERN unsigned dav1d_cpu_flags;
+EXTERN unsigned dav1d_cpu_flags_mask;
+
+void dav1d_init_cpu(void);
+DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
+int dav1d_num_logical_processors(Dav1dContext *c);
+
+static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
+ unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
+
+#if TRIM_DSP_FUNCTIONS
+/* Since this function is inlined, unconditionally setting a flag here will
+ * enable dead code elimination in the calling function. */
+#if ARCH_AARCH64 || ARCH_ARM
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+ flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#endif
+#elif ARCH_PPC64LE
+#if defined(__VSX__)
+ flags |= DAV1D_PPC_CPU_FLAG_VSX;
+#endif
+#elif ARCH_X86
+#if defined(__AVX512F__) && defined(__AVX512CD__) && \
+ defined(__AVX512BW__) && defined(__AVX512DQ__) && \
+ defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
+ defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
+ defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
+ defined(__AVX512BITALG__) && defined(__GFNI__) && \
+ defined(__VAES__) && defined(__VPCLMULQDQ__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
+ DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__AVX2__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSE4_1__) || defined(__AVX__)
+ flags |= DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSSE3__)
+ flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif ARCH_X86_64 || defined(__SSE2__) || \
+ (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+ flags |= DAV1D_X86_CPU_FLAG_SSE2;
+#endif
+#endif
+#endif
+
+ return flags;
+}
+
+#endif /* DAV1D_SRC_CPU_H */
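
The inline query above is meant to be consumed as an early-return ladder in per-module DSP init functions, so that when TRIM_DSP_FUNCTIONS pins the flags at compile time the unreachable SIMD hookups fall away as dead code. dav1d's own *_dsp_init functions follow roughly this shape; the module below (ExampleDSPContext, add_blocks_*) is a hypothetical stand-in for illustration, not part of the library.

#include <stdint.h>

#include "src/cpu.h"

typedef struct ExampleDSPContext {
    void (*add_blocks)(int32_t *dst, const int32_t *src, int n);
} ExampleDSPContext;

static void add_blocks_c(int32_t *dst, const int32_t *src, int n) {
    while (n--) *dst++ += *src++;
}

static void example_dsp_init(ExampleDSPContext *const dsp) {
    dsp->add_blocks = add_blocks_c;  /* portable fallback first */

#if ARCH_X86
    const unsigned flags = dav1d_get_cpu_flags();
    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
    /* dsp->add_blocks = add_blocks_sse2;   (hypothetical SIMD hookup) */
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
    /* dsp->add_blocks = add_blocks_avx2; */
#endif
}
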
diff --git a/third_party/dav1d/src/ctx.h b/third_party/dav1d/src/ctx.h
new file mode 100644
index 0000000000..d0e1f310ae
--- /dev/null
+++ b/third_party/dav1d/src/ctx.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_CTX_H
+#define DAV1D_SRC_CTX_H
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
+union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
+union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
+union alias8 { uint8_t u8; } ATTR_ALIAS;
+
+#define set_ctx_rep4(type, var, off, val) do { \
+ const uint64_t const_val = val; \
+ ((union alias64 *) &var[off + 0])->u64 = const_val; \
+ ((union alias64 *) &var[off + 8])->u64 = const_val; \
+ ((union alias64 *) &var[off + 16])->u64 = const_val; \
+ ((union alias64 *) &var[off + 24])->u64 = const_val; \
+ } while (0)
+#define set_ctx_rep2(type, var, off, val) do { \
+ const uint64_t const_val = val; \
+ ((union alias64 *) &var[off + 0])->u64 = const_val; \
+ ((union alias64 *) &var[off + 8])->u64 = const_val; \
+ } while (0)
+#define set_ctx_rep1(typesz, var, off, val) \
+ ((union alias##typesz *) &var[off])->u##typesz = val
+#define case_set(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+ }
+#define case_set_upto16(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ }
+#define case_set_upto32_with_default(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ case 32: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+ default: default_memset(dir, diridx, off, var); break; \
+ }
+#define case_set_upto16_with_default(var, dir, diridx, off) \
+ switch (var) { \
+ case 1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+ case 2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+ case 4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+ case 8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+ case 16: set_ctx( , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+ default: default_memset(dir, diridx, off, var); break; \
+ }
+
+#endif /* DAV1D_SRC_CTX_H */
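
These switches are the write side of the decoder's above/left block-context bookkeeping: a caller defines a local set_ctx() body saying which context row to fill and with what value, dispatches on the run length (in 4x4 units) through one of the case_set* variants, and the widest aliased store that covers the run is emitted. The read_tx_tree() function in the decode.c hunk below uses the pattern exactly this way for t->a->tx and t->l.tx. Here is a stripped-down sketch over a plain byte row; splat_ctx is a made-up name, and the row is assumed to be aligned for the widest store used, as dav1d's context rows are.

#include <stdint.h>

#include "src/ctx.h"

/* Splat `val` over `len` bytes of a context row starting at `off`.
 * `len` must be one of 1, 2, 4, 8 or 16 (4x4-unit block dimensions);
 * other values fall through the switch and write nothing. */
static void splat_ctx(uint8_t *const row, const int off, const int len,
                      const uint8_t val)
{
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
    rep_macro(type, dir, off, mul * val)
    case_set_upto16(len, row, 0, off);
#undef set_ctx
}
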
diff --git a/third_party/dav1d/src/data.c b/third_party/dav1d/src/data.c
new file mode 100644
index 0000000000..8a1386ad95
--- /dev/null
+++ b/third_party/dav1d/src/data.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dav1d/data.h"
+
+#include "common/attributes.h"
+#include "common/validate.h"
+
+#include "src/data.h"
+#include "src/ref.h"
+
+uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
+ validate_input_or_ret(buf != NULL, NULL);
+
+ if (sz > SIZE_MAX / 2) return NULL;
+ buf->ref = dav1d_ref_create(sz);
+ if (!buf->ref) return NULL;
+ buf->data = buf->ref->const_data;
+ buf->sz = sz;
+ dav1d_data_props_set_defaults(&buf->m);
+ buf->m.size = sz;
+
+ return buf->ref->data;
+}
+
+int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
+ const size_t sz,
+ void (*const free_callback)(const uint8_t *data,
+ void *cookie),
+ void *const cookie)
+{
+ validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
+
+ buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
+ if (!buf->ref) return DAV1D_ERR(ENOMEM);
+ buf->data = ptr;
+ buf->sz = sz;
+ dav1d_data_props_set_defaults(&buf->m);
+ buf->m.size = sz;
+
+ return 0;
+}
+
+int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
+ const uint8_t *const user_data,
+ void (*const free_callback)(const uint8_t *user_data,
+ void *cookie),
+ void *const cookie)
+{
+ validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
+
+ buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie);
+ if (!buf->m.user_data.ref) return DAV1D_ERR(ENOMEM);
+ buf->m.user_data.data = user_data;
+
+ return 0;
+}
+
+
+void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
+ validate_input(dst != NULL);
+ validate_input(dst->data == NULL);
+ validate_input(src != NULL);
+
+ if (src->ref) {
+ validate_input(src->data != NULL);
+ dav1d_ref_inc(src->ref);
+ }
+ if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+ *dst = *src;
+}
+
+void dav1d_data_props_copy(Dav1dDataProps *const dst,
+ const Dav1dDataProps *const src)
+{
+ assert(dst != NULL);
+ assert(src != NULL);
+
+ dav1d_ref_dec(&dst->user_data.ref);
+ *dst = *src;
+ if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
+}
+
+void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
+ assert(props != NULL);
+
+ memset(props, 0, sizeof(*props));
+ props->timestamp = INT64_MIN;
+ props->offset = -1;
+}
+
+void dav1d_data_props_unref_internal(Dav1dDataProps *const props) {
+ validate_input(props != NULL);
+
+ struct Dav1dRef *user_data_ref = props->user_data.ref;
+ dav1d_data_props_set_defaults(props);
+ dav1d_ref_dec(&user_data_ref);
+}
+
+void dav1d_data_unref_internal(Dav1dData *const buf) {
+ validate_input(buf != NULL);
+
+ struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
+ if (buf->ref) {
+ validate_input(buf->data != NULL);
+ dav1d_ref_dec(&buf->ref);
+ }
+ memset(buf, 0, sizeof(*buf));
+ dav1d_data_props_set_defaults(&buf->m);
+ dav1d_ref_dec(&user_data_ref);
+}
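
As a usage sketch for the wrap path above, which (as the _internal suffix suggests) backs the public dav1d_data_wrap() entry point: hand an externally allocated buffer to the library without copying it and let the free callback fire once the last reference is dropped. The helper names below are illustrative only.

#include <stdint.h>
#include <stdlib.h>

#include "src/data.h"

/* Invoked when the library drops its last reference to the buffer. */
static void free_buf_callback(const uint8_t *const data, void *const cookie) {
    (void) cookie;
    free((void *) (uintptr_t) data);
}

/* Wrap a malloc()ed bitstream chunk without copying. On success the
 * Dav1dData owns a reference; dav1d_data_unref_internal() (or the public
 * dav1d_data_unref()) releases it and triggers free_buf_callback(). */
static int wrap_owned_buffer(Dav1dData *const out, uint8_t *const buf,
                             const size_t sz)
{
    const int ret = dav1d_data_wrap_internal(out, buf, sz,
                                             free_buf_callback, NULL);
    if (ret < 0) free(buf); /* wrapping failed, so we still own the buffer */
    return ret;
}
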
diff --git a/third_party/dav1d/src/data.h b/third_party/dav1d/src/data.h
new file mode 100644
index 0000000000..b34c1db702
--- /dev/null
+++ b/third_party/dav1d/src/data.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DATA_H
+#define DAV1D_SRC_DATA_H
+
+#include "dav1d/data.h"
+
+void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
+
+/**
+ * Copy the source properties to the destination and increase the
+ * user_data's reference count (if it's not NULL).
+ */
+void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src);
+
+void dav1d_data_props_set_defaults(Dav1dDataProps *props);
+
+uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
+int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
+ void (*free_callback)(const uint8_t *data,
+ void *user_data),
+ void *user_data);
+int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
+ const uint8_t *user_data,
+ void (*free_callback)(const uint8_t *user_data,
+ void *cookie),
+ void *cookie);
+void dav1d_data_unref_internal(Dav1dData *buf);
+void dav1d_data_props_unref_internal(Dav1dDataProps *props);
+
+#endif /* DAV1D_SRC_DATA_H */
diff --git a/third_party/dav1d/src/dav1d.rc.in b/third_party/dav1d/src/dav1d.rc.in
new file mode 100644
index 0000000000..ad6aab481d
--- /dev/null
+++ b/third_party/dav1d/src/dav1d.rc.in
@@ -0,0 +1,32 @@
+#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
+
+#include <windows.h>
+
+1 VERSIONINFO
+FILETYPE VFT_DLL
+FILEOS VOS_NT_WINDOWS32
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
+BEGIN
+ BLOCK "StringFileInfo"
+ BEGIN
+ BLOCK "040904E4"
+ BEGIN
+ VALUE "CompanyName", "VideoLAN"
+ VALUE "ProductName", "dav1d"
+ VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+ VALUE "FileVersion", API_VERSION_NUMBER_STR
+ VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
+ VALUE "InternalName", "dav1d"
+ VALUE "OriginalFilename", "libdav1d.dll"
+ VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
+ END
+ END
+ BLOCK "VarFileInfo"
+ BEGIN
+ VALUE "Translation", 0x409, 1252
+ END
+END
diff --git a/third_party/dav1d/src/decode.c b/third_party/dav1d/src/decode.c
new file mode 100644
index 0000000000..b4853088e2
--- /dev/null
+++ b/third_party/dav1d/src/decode.c
@@ -0,0 +1,3910 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "dav1d/data.h"
+
+#include "common/frame.h"
+#include "common/intops.h"
+
+#include "src/ctx.h"
+#include "src/decode.h"
+#include "src/dequant_tables.h"
+#include "src/env.h"
+#include "src/filmgrain.h"
+#include "src/log.h"
+#include "src/qm.h"
+#include "src/recon.h"
+#include "src/ref.h"
+#include "src/tables.h"
+#include "src/thread_task.h"
+#include "src/warpmv.h"
+
+static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
+ const Dav1dFrameHeader *const frame_hdr,
+ const int qidx, uint16_t (*dq)[3][2])
+{
+ for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) {
+ const int yac = frame_hdr->segmentation.enabled ?
+ iclip_u8(qidx + frame_hdr->segmentation.seg_data.d[i].delta_q) : qidx;
+ const int ydc = iclip_u8(yac + frame_hdr->quant.ydc_delta);
+ const int uac = iclip_u8(yac + frame_hdr->quant.uac_delta);
+ const int udc = iclip_u8(yac + frame_hdr->quant.udc_delta);
+ const int vac = iclip_u8(yac + frame_hdr->quant.vac_delta);
+ const int vdc = iclip_u8(yac + frame_hdr->quant.vdc_delta);
+
+ dq[i][0][0] = dav1d_dq_tbl[seq_hdr->hbd][ydc][0];
+ dq[i][0][1] = dav1d_dq_tbl[seq_hdr->hbd][yac][1];
+ dq[i][1][0] = dav1d_dq_tbl[seq_hdr->hbd][udc][0];
+ dq[i][1][1] = dav1d_dq_tbl[seq_hdr->hbd][uac][1];
+ dq[i][2][0] = dav1d_dq_tbl[seq_hdr->hbd][vdc][0];
+ dq[i][2][1] = dav1d_dq_tbl[seq_hdr->hbd][vac][1];
+ }
+}
+
+static int read_mv_component_diff(Dav1dTaskContext *const t,
+ CdfMvComponent *const mv_comp,
+ const int have_fp)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const int have_hp = f->frame_hdr->hp;
+ const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
+ const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ mv_comp->classes, 10);
+ int up, fp, hp;
+
+ if (!cl) {
+ up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
+ if (have_fp) {
+ fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ mv_comp->class0_fp[up], 3);
+ hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+ mv_comp->class0_hp) : 1;
+ } else {
+ fp = 3;
+ hp = 1;
+ }
+ } else {
+ up = 1 << cl;
+ for (int n = 0; n < cl; n++)
+ up |= dav1d_msac_decode_bool_adapt(&ts->msac,
+ mv_comp->classN[n]) << n;
+ if (have_fp) {
+ fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ mv_comp->classN_fp, 3);
+ hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+ mv_comp->classN_hp) : 1;
+ } else {
+ fp = 3;
+ hp = 1;
+ }
+ }
+
+ const int diff = ((up << 3) | (fp << 1) | hp) + 1;
+
+ return sign ? -diff : diff;
+}
+
+static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv,
+ CdfMvContext *const mv_cdf, const int have_fp)
+{
+ switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
+ N_MV_JOINTS - 1))
+ {
+ case MV_JOINT_HV:
+ ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
+ ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
+ break;
+ case MV_JOINT_H:
+ ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
+ break;
+ case MV_JOINT_V:
+ ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
+ break;
+ default:
+ break;
+ }
+}
+
+static void read_tx_tree(Dav1dTaskContext *const t,
+ const enum RectTxfmSize from,
+ const int depth, uint16_t *const masks,
+ const int x_off, const int y_off)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
+ const int txw = t_dim->lw, txh = t_dim->lh;
+ int is_split;
+
+ if (depth < 2 && from > (int) TX_4X4) {
+ const int cat = 2 * (TX_64X64 - t_dim->max) - depth;
+ const int a = t->a->tx[bx4] < txw;
+ const int l = t->l.tx[by4] < txh;
+
+ is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac,
+ t->ts->cdf.m.txpart[cat][a + l]);
+ if (is_split)
+ masks[depth] |= 1 << (y_off * 4 + x_off);
+ } else {
+ is_split = 0;
+ }
+
+ if (is_split && t_dim->max > TX_8X8) {
+ const enum RectTxfmSize sub = t_dim->sub;
+ const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+ const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+ read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 0);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 1, y_off * 2 + 0);
+ t->bx -= txsw;
+ t->by += txsh;
+ if (txh >= txw && t->by < f->bh) {
+ read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 1);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_tx_tree(t, sub, depth + 1, masks,
+ x_off * 2 + 1, y_off * 2 + 1);
+ t->bx -= txsw;
+ }
+ t->by -= txsh;
+ } else {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
+ case_set_upto16(t_dim->h, l., 1, by4);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
+ case_set_upto16(t_dim->w, a->, 0, bx4);
+#undef set_ctx
+ }
+}
+
+static int neg_deinterleave(int diff, int ref, int max) {
+ if (!ref) return diff;
+ if (ref >= (max - 1)) return max - diff - 1;
+ if (2 * ref < max) {
+ if (diff <= 2 * ref) {
+ if (diff & 1)
+ return ref + ((diff + 1) >> 1);
+ else
+ return ref - (diff >> 1);
+ }
+ return diff;
+ } else {
+ if (diff <= 2 * (max - ref - 1)) {
+ if (diff & 1)
+ return ref + ((diff + 1) >> 1);
+ else
+ return ref - (diff >> 1);
+ }
+ return max - (diff + 1);
+ }
+}
+
+static void find_matching_ref(const Dav1dTaskContext *const t,
+ const enum EdgeFlags intra_edge_flags,
+ const int bw4, const int bh4,
+ const int w4, const int h4,
+ const int have_left, const int have_top,
+ const int ref, uint64_t masks[2])
+{
+ /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
+ int count = 0;
+ int have_topleft = have_top && have_left;
+ int have_topright = imax(bw4, bh4) < 32 &&
+ have_top && t->bx + bw4 < t->ts->tiling.col_end &&
+ (intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+#define bs(rp) dav1d_block_dimensions[(rp)->bs]
+#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)
+
+ if (have_top) {
+ const refmvs_block *r2 = &r[-1][t->bx];
+ if (matches(r2)) {
+ masks[0] |= 1;
+ count = 1;
+ }
+ int aw4 = bs(r2)[0];
+ if (aw4 >= bw4) {
+ const int off = t->bx & (aw4 - 1);
+ if (off) have_topleft = 0;
+ if (aw4 - off > bw4) have_topright = 0;
+ } else {
+ unsigned mask = 1 << aw4;
+ for (int x = aw4; x < w4; x += aw4) {
+ r2 += aw4;
+ if (matches(r2)) {
+ masks[0] |= mask;
+ if (++count >= 8) return;
+ }
+ aw4 = bs(r2)[0];
+ mask <<= aw4;
+ }
+ }
+ }
+ if (have_left) {
+ /*const*/ refmvs_block *const *r2 = r;
+ if (matches(&r2[0][t->bx - 1])) {
+ masks[1] |= 1;
+ if (++count >= 8) return;
+ }
+ int lh4 = bs(&r2[0][t->bx - 1])[1];
+ if (lh4 >= bh4) {
+ if (t->by & (lh4 - 1)) have_topleft = 0;
+ } else {
+ unsigned mask = 1 << lh4;
+ for (int y = lh4; y < h4; y += lh4) {
+ r2 += lh4;
+ if (matches(&r2[0][t->bx - 1])) {
+ masks[1] |= mask;
+ if (++count >= 8) return;
+ }
+ lh4 = bs(&r2[0][t->bx - 1])[1];
+ mask <<= lh4;
+ }
+ }
+ }
+ if (have_topleft && matches(&r[-1][t->bx - 1])) {
+ masks[1] |= 1ULL << 32;
+ if (++count >= 8) return;
+ }
+ if (have_topright && matches(&r[-1][t->bx + bw4])) {
+ masks[0] |= 1ULL << 32;
+ }
+#undef matches
+}
+
+static void derive_warpmv(const Dav1dTaskContext *const t,
+ const int bw4, const int bh4,
+ const uint64_t masks[2], const union mv mv,
+ Dav1dWarpedMotionParams *const wmp)
+{
+ int pts[8][2 /* in, out */][2 /* x, y */], np = 0;
+ /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
+
+#define add_sample(dx, dy, sx, sy, rp) do { \
+ pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
+ pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
+ pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
+ pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
+ np++; \
+} while (0)
+
+ // use masks[] to find the projectable motion vectors in the edges
+ if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
+ const int off = t->bx & (bs(&r[-1][t->bx])[0] - 1);
+ add_sample(-off, 0, 1, -1, &r[-1][t->bx]);
+ } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
+ const int tz = ctz(xmask);
+ off += tz;
+ xmask >>= tz;
+ add_sample(off, 0, 1, -1, &r[-1][t->bx + off]);
+ xmask &= ~1;
+ }
+ if (np < 8 && masks[1] == 1) {
+ const int off = t->by & (bs(&r[0][t->bx - 1])[1] - 1);
+ add_sample(0, -off, -1, 1, &r[-off][t->bx - 1]);
+ } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
+ const int tz = ctz(ymask);
+ off += tz;
+ ymask >>= tz;
+ add_sample(0, off, -1, 1, &r[off][t->bx - 1]);
+ ymask &= ~1;
+ }
+ if (np < 8 && masks[1] >> 32) // top/left
+ add_sample(0, 0, -1, -1, &r[-1][t->bx - 1]);
+ if (np < 8 && masks[0] >> 32) // top/right
+ add_sample(bw4, 0, 1, -1, &r[-1][t->bx + bw4]);
+ assert(np > 0 && np <= 8);
+#undef bs
+
+ // select according to motion vector difference against a threshold
+ int mvd[8], ret = 0;
+ const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28);
+ for (int i = 0; i < np; i++) {
+ mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) +
+ abs(pts[i][1][1] - pts[i][0][1] - mv.y);
+ if (mvd[i] > thresh)
+ mvd[i] = -1;
+ else
+ ret++;
+ }
+ if (!ret) {
+ ret = 1;
+ } else for (int i = 0, j = np - 1, k = 0; k < np - ret; k++, i++, j--) {
+ while (mvd[i] != -1) i++;
+ while (mvd[j] == -1) j--;
+ assert(i != j);
+ if (i > j) break;
+ // replace the discarded samples
+ mvd[i] = mvd[j];
+ memcpy(pts[i], pts[j], sizeof(*pts));
+ }
+
+ if (!dav1d_find_affine_int(pts, ret, bw4, bh4, mv, wmp, t->bx, t->by) &&
+ !dav1d_get_shear_params(wmp))
+ {
+ wmp->type = DAV1D_WM_TYPE_AFFINE;
+ } else
+ wmp->type = DAV1D_WM_TYPE_IDENTITY;
+}
+
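+// Returns 1 if any entry at an odd offset from the block edge is zero (buf
+// is sampled with a stride of 2); used below to detect at least one
+// inter-coded (non-intra) neighbour before OBMC/warp can be signalled.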
+static inline int findoddzero(const uint8_t *buf, int len) {
+ for (int n = 0; n < len; n++)
+ if (!buf[n * 2]) return 1;
+ return 0;
+}
+
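+// Decodes the palette of one plane: the palette size (2-8 entries), then
+// per-entry reuse flags against a cache built by merge-sorting the above and
+// left neighbours' palettes (the above palette is only reused within the
+// current SB64 row), then the remaining entries as a raw value followed by
+// ascending deltas. Cached and newly parsed entries are merged at the end so
+// the palette stays sorted.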
+static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b,
+ const int pl, const int sz_ctx,
+ const int bx4, const int by4)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
+ uint16_t cache[16], used_cache[8];
+ int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
+ int n_cache = 0;
+ // don't reuse above palette outside SB64 boundaries
+ int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
+ const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl];
+
+ // fill/sort cache
+ while (l_cache && a_cache) {
+ if (*l < *a) {
+ if (!n_cache || cache[n_cache - 1] != *l)
+ cache[n_cache++] = *l;
+ l++;
+ l_cache--;
+ } else {
+ if (*a == *l) {
+ l++;
+ l_cache--;
+ }
+ if (!n_cache || cache[n_cache - 1] != *a)
+ cache[n_cache++] = *a;
+ a++;
+ a_cache--;
+ }
+ }
+ if (l_cache) {
+ do {
+ if (!n_cache || cache[n_cache - 1] != *l)
+ cache[n_cache++] = *l;
+ l++;
+ } while (--l_cache > 0);
+ } else if (a_cache) {
+ do {
+ if (!n_cache || cache[n_cache - 1] != *a)
+ cache[n_cache++] = *a;
+ a++;
+ } while (--a_cache > 0);
+ }
+
+ // find reused cache entries
+ int i = 0;
+ for (int n = 0; n < n_cache && i < pal_sz; n++)
+ if (dav1d_msac_decode_bool_equi(&ts->msac))
+ used_cache[i++] = cache[n];
+ const int n_used_cache = i;
+
+ // parse new entries
+ uint16_t *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl];
+ if (i < pal_sz) {
+ int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+
+ if (i < pal_sz) {
+ int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
+ const int max = (1 << f->cur.p.bpc) - 1;
+
+ do {
+ const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+ prev = pal[i++] = imin(prev + delta + !pl, max);
+ if (prev + !pl >= max) {
+ for (; i < pal_sz; i++)
+ pal[i] = max;
+ break;
+ }
+ bits = imin(bits, 1 + ulog2(max - prev - !pl));
+ } while (i < pal_sz);
+ }
+
+ // merge cache+new entries
+ int n = 0, m = n_used_cache;
+ for (i = 0; i < pal_sz; i++) {
+ if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
+ pal[i] = used_cache[n++];
+ } else {
+ assert(m < pal_sz);
+ pal[i] = pal[m++];
+ }
+ }
+ } else {
+ memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
+ }
+
+ if (DEBUG_BLOCK_INFO) {
+ printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
+ pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
+ for (int n = 0; n < n_cache; n++)
+ printf("%c%02x", n ? ' ' : '[', cache[n]);
+ printf("%s, pal=", n_cache ? "]" : "[]");
+ for (int n = 0; n < pal_sz; n++)
+ printf("%c%02x", n ? ' ' : '[', pal[n]);
+ printf("]\n");
+ }
+}
+
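+// The U palette is coded like the Y one; the V palette is either raw
+// bpc-bit values or, if the first flag is set, a start value plus signed
+// deltas with wrap-around (the "& max" below).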
+static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b,
+ const int sz_ctx, const int bx4, const int by4)
+{
+ read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
+
+ // V pal coding
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ uint16_t *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2];
+ if (dav1d_msac_decode_bool_equi(&ts->msac)) {
+ const int bits = f->cur.p.bpc - 4 +
+ dav1d_msac_decode_bools(&ts->msac, 2);
+ int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+ const int max = (1 << f->cur.p.bpc) - 1;
+ for (int i = 1; i < b->pal_sz[1]; i++) {
+ int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+ if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
+ prev = pal[i] = (prev + delta) & max;
+ }
+ } else {
+ for (int i = 0; i < b->pal_sz[1]; i++)
+ pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
+ }
+ if (DEBUG_BLOCK_INFO) {
+ printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
+ for (int n = 0; n < b->pal_sz[1]; n++)
+ printf("%c%02x", n ? ' ' : '[', pal[n]);
+ printf("]\n");
+ }
+}
+
+// meant to be SIMD'able, so that theoretical complexity of this function
+// times block size goes from w4*h4 to w4+h4-1
+// processes one top/left-to-bottom/right diagonal ("wave-front") of the
+// palette index map: i is the diagonal index, and first/last bound the
+// columns it covers. For each position, the already-decoded top, left and
+// top-left neighbour indices determine both the symbol-to-palette-index
+// ordering and the entropy context.
+// output is order[] and ctx for each member of this diagonal.
+static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
+ const int i, const int first, const int last,
+ uint8_t (*const order)[8], uint8_t *const ctx)
+{
+ int have_top = i > first;
+
+ assert(pal_idx);
+ pal_idx += first + (i - first) * stride;
+ for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
+ const int have_left = j > 0;
+
+ assert(have_left || have_top);
+
+#define add(v_in) do { \
+ const int v = v_in; \
+ assert((unsigned)v < 8U); \
+ order[n][o_idx++] = v; \
+ mask |= 1 << v; \
+ } while (0)
+
+ unsigned mask = 0;
+ int o_idx = 0;
+ if (!have_left) {
+ ctx[n] = 0;
+ add(pal_idx[-stride]);
+ } else if (!have_top) {
+ ctx[n] = 0;
+ add(pal_idx[-1]);
+ } else {
+ const int l = pal_idx[-1], t = pal_idx[-stride], tl = pal_idx[-(stride + 1)];
+ const int same_t_l = t == l;
+ const int same_t_tl = t == tl;
+ const int same_l_tl = l == tl;
+ const int same_all = same_t_l & same_t_tl & same_l_tl;
+
+ if (same_all) {
+ ctx[n] = 4;
+ add(t);
+ } else if (same_t_l) {
+ ctx[n] = 3;
+ add(t);
+ add(tl);
+ } else if (same_t_tl | same_l_tl) {
+ ctx[n] = 2;
+ add(tl);
+ add(same_t_tl ? l : t);
+ } else {
+ ctx[n] = 1;
+ add(imin(t, l));
+ add(imax(t, l));
+ add(tl);
+ }
+ }
+ for (unsigned m = 1, bit = 0; m < 0x100; m <<= 1, bit++)
+ if (!(mask & m))
+ order[n][o_idx++] = bit;
+ assert(o_idx == 8);
+#undef add
+ }
+}
+
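+// Decodes the palette index map: the top-left index is coded uniformly, the
+// rest diagonal by diagonal so that each symbol's context depends only on
+// already-decoded top/left/top-left neighbours (see order_palette above);
+// parts of the block outside the visible frame are padded by replication.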
+static void read_pal_indices(Dav1dTaskContext *const t,
+ uint8_t *const pal_idx,
+ const Av1Block *const b, const int pl,
+ const int w4, const int h4,
+ const int bw4, const int bh4)
+{
+ Dav1dTileState *const ts = t->ts;
+ const ptrdiff_t stride = bw4 * 4;
+ assert(pal_idx);
+ pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
+ uint16_t (*const color_map_cdf)[8] =
+ ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
+ uint8_t (*const order)[8] = t->scratch.pal_order;
+ uint8_t *const ctx = t->scratch.pal_ctx;
+ for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
+ // top/left-to-bottom/right diagonals ("wave-front")
+ const int first = imin(i, w4 * 4 - 1);
+ const int last = imax(0, i - h4 * 4 + 1);
+ order_palette(pal_idx, stride, i, first, last, order, ctx);
+ for (int j = first, m = 0; j >= last; j--, m++) {
+ const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
+ pal_idx[(i - j) * stride + j] = order[m][color_idx];
+ }
+ }
+ // fill invisible edges
+ if (bw4 > w4)
+ for (int y = 0; y < 4 * h4; y++)
+ memset(&pal_idx[y * stride + 4 * w4],
+ pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
+ if (h4 < bh4) {
+ const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
+ for (int y = h4 * 4; y < bh4 * 4; y++)
+ memcpy(&pal_idx[y * stride], src, bw4 * 4);
+ }
+}
+
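+// Reads the per-block transform sizes: lossless blocks force TX_4X4, skipped
+// blocks (or a non-switchable txfm mode) use the largest transform that fits
+// the block, and otherwise read_tx_tree() recursively parses split flags for
+// each maximum-size transform unit, packing the pattern into tx_split[0..1].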
+static void read_vartx_tree(Dav1dTaskContext *const t,
+ Av1Block *const b, const enum BlockSize bs,
+ const int bx4, const int by4)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+
+ // var-tx tree coding
+ uint16_t tx_split[2] = { 0 };
+ b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
+ if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
+ b->max_ytx == TX_4X4))
+ {
+ b->max_ytx = b->uvtx = TX_4X4;
+ if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, TX_4X4)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ }
+ } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
+ if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ }
+ b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+ } else {
+ assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
+ int y, x, y_off, x_off;
+ const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+ for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
+ for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
+ read_tx_tree(t, b->max_ytx, 0, tx_split, x_off, y_off);
+ // contexts are updated inside read_tx_tree()
+ t->bx += ytx->w;
+ }
+ t->bx -= x;
+ t->by += ytx->h;
+ }
+ t->by -= y;
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-vartxtree[%x/%x]: r=%d\n",
+ tx_split[0], tx_split[1], t->ts->msac.rng);
+ b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+ }
+ assert(!(tx_split[0] & ~0x33));
+ b->tx_split0 = (uint8_t)tx_split[0];
+ b->tx_split1 = tx_split[1];
+}
+
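+// Temporal segment-id prediction: returns the smallest segment id covering
+// the block's area in the previous frame's segmentation map; 8 acts as an
+// "invalid" sentinel so callers can reject a corrupt map.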
+static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
+ const int by, const int bx,
+ const int w4, int h4,
+ const uint8_t *ref_seg_map,
+ const ptrdiff_t stride)
+{
+ assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
+
+ unsigned seg_id = 8;
+ ref_seg_map += by * stride + bx;
+ do {
+ for (int x = 0; x < w4; x++)
+ seg_id = imin(seg_id, ref_seg_map[x]);
+ ref_seg_map += stride;
+ } while (--h4 > 0 && seg_id);
+ assert(seg_id < 8);
+
+ return seg_id;
+}
+
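+// The splat_*() helpers fill the refmvs rows covering this block with one
+// replicated refmvs_block template so that later blocks (and the following
+// frame) can use it for MV prediction. In .mf, bit 0 flags global-MV modes
+// and bit 1 flags modes containing a NEWMV component.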
+static inline void splat_oneref_mv(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const Av1Block *const b,
+ const int bw4, const int bh4)
+{
+ const enum InterPredMode mode = b->inter_mode;
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { b->ref[0] + 1, b->interintra_type ? 0 : -1 },
+ .mv.mv[0] = b->mv[0],
+ .bs = bs,
+ .mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2),
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
+static inline void splat_intrabc_mv(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const Av1Block *const b,
+ const int bw4, const int bh4)
+{
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { 0, -1 },
+ .mv.mv[0] = b->mv[0],
+ .bs = bs,
+ .mf = 0,
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
+static inline void splat_tworef_mv(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const Av1Block *const b,
+ const int bw4, const int bh4)
+{
+ assert(bw4 >= 2 && bh4 >= 2);
+ const enum CompInterPredMode mode = b->inter_mode;
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { b->ref[0] + 1, b->ref[1] + 1 },
+ .mv.mv = { b->mv[0], b->mv[1] },
+ .bs = bs,
+ .mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2,
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
+static inline void splat_intraref(const Dav1dContext *const c,
+ Dav1dTaskContext *const t,
+ const enum BlockSize bs,
+ const int bw4, const int bh4)
+{
+ const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
+ .ref.ref = { 0, -1 },
+ .mv.mv[0].n = INVALID_MV,
+ .bs = bs,
+ .mf = 0,
+ };
+ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
+}
+
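+// The *_lowest_px() helpers track, per reference and plane, the lowest
+// reference row this block's inter prediction can touch (including the
+// 4-line sub-pel filter margin, and the scale/step arithmetic when the
+// reference is scaled); the threaded decoder uses this bound to know how far
+// each reference must have been reconstructed before rendering the block.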
+static void mc_lowest_px(int *const dst, const int by4, const int bh4,
+ const int mvy, const int ss_ver,
+ const struct ScalableMotionParams *const smp)
+{
+ const int v_mul = 4 >> ss_ver;
+ if (!smp->scale) {
+ const int my = mvy >> (3 + ss_ver), dy = mvy & (15 >> !ss_ver);
+ *dst = imax(*dst, (by4 + bh4) * v_mul + my + 4 * !!dy);
+ } else {
+ int y = (by4 * v_mul << 4) + mvy * (1 << !ss_ver);
+ const int64_t tmp = (int64_t)(y) * smp->scale + (smp->scale - 0x4000) * 8;
+ y = apply_sign64((int)((llabs(tmp) + 128) >> 8), tmp) + 32;
+ const int bottom = ((y + (bh4 * v_mul - 1) * smp->step) >> 10) + 1 + 4;
+ *dst = imax(*dst, bottom);
+ }
+}
+
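+// Same, but for warped motion: the affine model is evaluated at the centres
+// of the bottom-left and bottom-right 8x8 sub-blocks; since the projected y
+// is linear in x, these two extremes bound the lowest reference row needed.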
+static ALWAYS_INLINE void affine_lowest_px(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp,
+ const int ss_ver, const int ss_hor)
+{
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+ const int32_t *const mat = wmp->matrix;
+ const int y = b_dim[1] * v_mul - 8; // lowest y
+
+ const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+ const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
+ // check left- and right-most blocks
+ for (int x = 0; x < b_dim[0] * h_mul; x += imax(8, b_dim[0] * h_mul - 8)) {
+ // calculate transformation relative to center of 8x8 block in
+ // luma pixel units
+ const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+ const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
+ const int dy = (int) (mvy >> 16) - 4;
+ *dst = imax(*dst, dy + 4 + 8);
+ }
+}
+
+static NOINLINE void affine_lowest_px_luma(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ affine_lowest_px(t, dst, b_dim, wmp, 0, 0);
+}
+
+static NOINLINE void affine_lowest_px_chroma(Dav1dTaskContext *const t, int *const dst,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ const Dav1dFrameContext *const f = t->f;
+ assert(f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400);
+ if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I444)
+ affine_lowest_px_luma(t, dst, b_dim, wmp);
+ else
+ affine_lowest_px(t, dst, b_dim, wmp, f->cur.p.layout & DAV1D_PIXEL_LAYOUT_I420, 1);
+}
+
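+// OBMC additionally blends in predictions from up to 4 top and 4 left
+// inter-coded neighbours, so the lowest rows of their references have to be
+// tracked as well.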
+static void obmc_lowest_px(Dav1dTaskContext *const t,
+ int (*const dst)[2], const int is_chroma,
+ const uint8_t *const b_dim,
+ const int bx4, const int by4, const int w4, const int h4)
+{
+ assert(!(t->bx & 1) && !(t->by & 1));
+ const Dav1dFrameContext *const f = t->f;
+ /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
+ const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+
+ if (t->by > t->ts->tiling.row_start &&
+ (!is_chroma || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+ {
+ for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+ const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+
+ if (a_r->ref.ref[0] > 0) {
+ const int oh4 = imin(b_dim[1], 16) >> 1;
+ mc_lowest_px(&dst[a_r->ref.ref[0] - 1][is_chroma], t->by,
+ (oh4 * 3 + 3) >> 2, a_r->mv.mv[0].y, ss_ver,
+ &f->svc[a_r->ref.ref[0] - 1][1]);
+ i++;
+ }
+ x += imax(a_b_dim[0], 2);
+ }
+ }
+
+ if (t->bx > t->ts->tiling.col_start)
+ for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+ const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+
+ if (l_r->ref.ref[0] > 0) {
+ const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
+ mc_lowest_px(&dst[l_r->ref.ref[0] - 1][is_chroma],
+ t->by + y, oh4, l_r->mv.mv[0].y, ss_ver,
+ &f->svc[l_r->ref.ref[0] - 1][1]);
+ i++;
+ }
+ y += imax(l_b_dim[1], 2);
+ }
+}
+
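+// Decodes all mode information for one block (segment id, skip flags, cdef
+// index, delta-q/lf, intra or inter modes, palette, transform sizes) and
+// either reconstructs it in place or defers work between the frame-threading
+// passes: pass 1 only parses symbols and coefficients into per-frame
+// buffers, pass 2 replays the stored Av1Block and reconstructs without
+// touching the bitstream.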
+static int decode_b(Dav1dTaskContext *const t,
+ const enum BlockLevel bl,
+ const enum BlockSize bs,
+ const enum BlockPartition bp,
+ const enum EdgeFlags intra_edge_flags)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ Av1Block b_mem, *const b = t->frame_thread.pass ?
+ &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+ const int have_left = t->bx > ts->tiling.col_start;
+ const int have_top = t->by > ts->tiling.row_start;
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+
+ if (t->frame_thread.pass == 2) {
+ if (b->intra) {
+ f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
+
+ const enum IntraPredMode y_mode_nofilt =
+ b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+ rep_macro(type, t->dir intra, off, mul)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+ for (int x = 0; x < bw4; x++) {
+ r[x].ref.ref[0] = 0;
+ r[x].bs = bs;
+ }
+ refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+ for (int y = 0; y < bh4 - 1; y++) {
+ rr[y][t->bx + bw4 - 1].ref.ref[0] = 0;
+ rr[y][t->bx + bw4 - 1].bs = bs;
+ }
+ }
+
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ } else {
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
+ b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
+ {
+ if (b->matrix[0] == SHRT_MIN) {
+ t->warpmv.type = DAV1D_WM_TYPE_IDENTITY;
+ } else {
+ t->warpmv.type = DAV1D_WM_TYPE_AFFINE;
+ t->warpmv.matrix[2] = b->matrix[0] + 0x10000;
+ t->warpmv.matrix[3] = b->matrix[1];
+ t->warpmv.matrix[4] = b->matrix[2];
+ t->warpmv.matrix[5] = b->matrix[3] + 0x10000;
+ dav1d_set_affine_mv2d(bw4, bh4, b->mv2d, &t->warpmv,
+ t->bx, t->by);
+ dav1d_get_shear_params(&t->warpmv);
+#define signabs(v) v < 0 ? '-' : ' ', abs(v)
+ if (DEBUG_BLOCK_INFO)
+ printf("[ %c%x %c%x %c%x\n %c%x %c%x %c%x ]\n"
+ "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, mv=y:%d,x:%d\n",
+ signabs(t->warpmv.matrix[0]),
+ signabs(t->warpmv.matrix[1]),
+ signabs(t->warpmv.matrix[2]),
+ signabs(t->warpmv.matrix[3]),
+ signabs(t->warpmv.matrix[4]),
+ signabs(t->warpmv.matrix[5]),
+ signabs(t->warpmv.u.p.alpha),
+ signabs(t->warpmv.u.p.beta),
+ signabs(t->warpmv.u.p.gamma),
+ signabs(t->warpmv.u.p.delta),
+ b->mv2d.y, b->mv2d.x);
+#undef signabs
+ }
+ }
+ if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+
+ const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+ rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+ rep_macro(type, t->dir intra, off, 0)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
+ for (int x = 0; x < bw4; x++) {
+ r[x].ref.ref[0] = b->ref[0] + 1;
+ r[x].mv.mv[0] = b->mv[0];
+ r[x].bs = bs;
+ }
+ refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
+ for (int y = 0; y < bh4 - 1; y++) {
+ rr[y][t->bx + bw4 - 1].ref.ref[0] = b->ref[0] + 1;
+ rr[y][t->bx + bw4 - 1].mv.mv[0] = b->mv[0];
+ rr[y][t->bx + bw4 - 1].bs = bs;
+ }
+ }
+
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ }
+ return 0;
+ }
+
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+ b->bl = bl;
+ b->bp = bp;
+ b->bs = bs;
+
+ const Dav1dSegmentationData *seg = NULL;
+
+ // segment_id (if seg_feature for skip/ref/gmv is enabled)
+ int seg_pred = 0;
+ if (f->frame_hdr->segmentation.enabled) {
+ if (!f->frame_hdr->segmentation.update_map) {
+ if (f->prev_segmap) {
+ unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+ f->prev_segmap,
+ f->b4_stride);
+ if (seg_id >= 8) return -1;
+ b->seg_id = seg_id;
+ } else {
+ b->seg_id = 0;
+ }
+ seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+ } else if (f->frame_hdr->segmentation.seg_data.preskip) {
+ if (f->frame_hdr->segmentation.temporal &&
+ (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+ t->l.seg_pred[by4]])))
+ {
+ // temporally predicted seg_id
+ if (f->prev_segmap) {
+ unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
+ w4, h4,
+ f->prev_segmap,
+ f->b4_stride);
+ if (seg_id >= 8) return -1;
+ b->seg_id = seg_id;
+ } else {
+ b->seg_id = 0;
+ }
+ } else {
+ int seg_ctx;
+ const unsigned pred_seg_id =
+ get_cur_frame_segid(t->by, t->bx, have_top, have_left,
+ &seg_ctx, f->cur_segmap, f->b4_stride);
+ const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.seg_id[seg_ctx],
+ DAV1D_MAX_SEGMENTS - 1);
+ const unsigned last_active_seg_id =
+ f->frame_hdr->segmentation.seg_data.last_active_segid;
+ b->seg_id = neg_deinterleave(diff, pred_seg_id,
+ last_active_seg_id + 1);
+ if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
+ if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
+ }
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-segid[preskip;%d]: r=%d\n",
+ b->seg_id, ts->msac.rng);
+
+ seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+ }
+ } else {
+ b->seg_id = 0;
+ }
+
+ // skip_mode
+ if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
+ f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1)
+ {
+ const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
+ b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.skip_mode[smctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
+ } else {
+ b->skip_mode = 0;
+ }
+
+ // skip
+ if (b->skip_mode || (seg && seg->skip)) {
+ b->skip = 1;
+ } else {
+ const int sctx = t->a->skip[bx4] + t->l.skip[by4];
+ b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
+ }
+
+ // segment_id
+ if (f->frame_hdr->segmentation.enabled &&
+ f->frame_hdr->segmentation.update_map &&
+ !f->frame_hdr->segmentation.seg_data.preskip)
+ {
+ if (!b->skip && f->frame_hdr->segmentation.temporal &&
+ (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+ t->l.seg_pred[by4]])))
+ {
+ // temporally predicted seg_id
+ if (f->prev_segmap) {
+ unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+ f->prev_segmap,
+ f->b4_stride);
+ if (seg_id >= 8) return -1;
+ b->seg_id = seg_id;
+ } else {
+ b->seg_id = 0;
+ }
+ } else {
+ int seg_ctx;
+ const unsigned pred_seg_id =
+ get_cur_frame_segid(t->by, t->bx, have_top, have_left,
+ &seg_ctx, f->cur_segmap, f->b4_stride);
+ if (b->skip) {
+ b->seg_id = pred_seg_id;
+ } else {
+ const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.seg_id[seg_ctx],
+ DAV1D_MAX_SEGMENTS - 1);
+ const unsigned last_active_seg_id =
+ f->frame_hdr->segmentation.seg_data.last_active_segid;
+ b->seg_id = neg_deinterleave(diff, pred_seg_id,
+ last_active_seg_id + 1);
+ if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
+ }
+ if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
+ }
+
+ seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-segid[postskip;%d]: r=%d\n",
+ b->seg_id, ts->msac.rng);
+ }
+
+ // cdef index
+ if (!b->skip) {
+ const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) +
+ ((t->by & 16) >> 3) : 0;
+ if (t->cur_sb_cdef_idx_ptr[idx] == -1) {
+ const int v = dav1d_msac_decode_bools(&ts->msac,
+ f->frame_hdr->cdef.n_bits);
+ t->cur_sb_cdef_idx_ptr[idx] = v;
+ if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v;
+ if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v;
+ if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v;
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-cdef_idx[%d]: r=%d\n",
+ *t->cur_sb_cdef_idx_ptr, ts->msac.rng);
+ }
+ }
+
+ // delta-q/lf
+ if (!(t->bx & (31 >> !f->seq_hdr->sb128)) &&
+ !(t->by & (31 >> !f->seq_hdr->sb128)))
+ {
+ const int prev_qidx = ts->last_qidx;
+ const int have_delta_q = f->frame_hdr->delta.q.present &&
+ (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
+
+ int8_t prev_delta_lf[4];
+ memcpy(prev_delta_lf, ts->last_delta_lf, 4);
+
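+ // delta_q and delta_lf share the same escape coding: symbols 0-2 give the
+ // magnitude directly, symbol 3 escapes to a 3-bit exponent (n_bits = 1..8)
+ // followed by n_bits mantissa bits; non-zero values get a sign flag and are
+ // scaled by the respective res_log2.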
+ if (have_delta_q) {
+ int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.delta_q, 3);
+ if (delta_q == 3) {
+ const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+ delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
+ 1 + (1 << n_bits);
+ }
+ if (delta_q) {
+ if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q;
+ delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
+ }
+ ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
+ if (have_delta_q && DEBUG_BLOCK_INFO)
+ printf("Post-delta_q[%d->%d]: r=%d\n",
+ delta_q, ts->last_qidx, ts->msac.rng);
+
+ if (f->frame_hdr->delta.lf.present) {
+ const int n_lfs = f->frame_hdr->delta.lf.multi ?
+ f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
+
+ for (int i = 0; i < n_lfs; i++) {
+ int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
+ if (delta_lf == 3) {
+ const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+ delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
+ 1 + (1 << n_bits);
+ }
+ if (delta_lf) {
+ if (dav1d_msac_decode_bool_equi(&ts->msac))
+ delta_lf = -delta_lf;
+ delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
+ }
+ ts->last_delta_lf[i] =
+ iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
+ if (have_delta_q && DEBUG_BLOCK_INFO)
+ printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
+ ts->msac.rng);
+ }
+ }
+ }
+ if (ts->last_qidx == f->frame_hdr->quant.yac) {
+ // assign frame-wide q values to this sb
+ ts->dq = f->dq;
+ } else if (ts->last_qidx != prev_qidx) {
+ // find sb-specific quant parameters
+ init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
+ ts->dq = ts->dqmem;
+ }
+ if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) {
+ // assign frame-wide lf values to this sb
+ ts->lflvl = f->lf.lvl;
+ } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) {
+ // find sb-specific lf lvl parameters
+ dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf);
+ ts->lflvl = ts->lflvlmem;
+ }
+ }
+
+ if (b->skip_mode) {
+ b->intra = 0;
+ } else if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ if (seg && (seg->ref >= 0 || seg->globalmv)) {
+ b->intra = !seg->ref;
+ } else {
+ const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.intra[ictx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
+ }
+ } else if (f->frame_hdr->allow_intrabc) {
+ b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
+ } else {
+ b->intra = 1;
+ }
+
+ // intra/inter-specific stuff
+ if (b->intra) {
+ uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ?
+ ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
+ ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
+ [dav1d_intra_mode_context[t->l.mode[by4]]];
+ b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
+ N_INTRA_PRED_MODES - 1);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
+
+ // angle delta
+ if (b_dim[2] + b_dim[3] >= 2 && b->y_mode >= VERT_PRED &&
+ b->y_mode <= VERT_LEFT_PRED)
+ {
+ uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
+ const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
+ b->y_angle = angle - 3;
+ } else {
+ b->y_angle = 0;
+ }
+
+ if (has_chroma) {
+ const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
+ cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
+ uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
+ b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
+ N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
+
+ b->uv_angle = 0;
+ if (b->uv_mode == CFL_PRED) {
+#define SIGN(a) (!!(a) + ((a) > 0))
+ const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.cfl_sign, 7) + 1;
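+ // the joint cfl_sign symbol enumerates all (sign_u, sign_v) pairs except
+ // (0, 0) in base 3 (0 = zero, 1 = negative, 2 = positive);
+ // sign * 0x56 >> 8 is a cheap sign / 3, as the assert below documents.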
+ const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
+ assert(sign_u == sign / 3);
+ if (sign_u) {
+ const int ctx = (sign_u == 2) * 3 + sign_v;
+ b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.cfl_alpha[ctx], 15) + 1;
+ if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
+ } else {
+ b->cfl_alpha[0] = 0;
+ }
+ if (sign_v) {
+ const int ctx = (sign_v == 2) * 3 + sign_u;
+ b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.cfl_alpha[ctx], 15) + 1;
+ if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
+ } else {
+ b->cfl_alpha[1] = 0;
+ }
+#undef SIGN
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uvalphas[%d/%d]: r=%d\n",
+ b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng);
+ } else if (b_dim[2] + b_dim[3] >= 2 && b->uv_mode >= VERT_PRED &&
+ b->uv_mode <= VERT_LEFT_PRED)
+ {
+ uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
+ const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
+ b->uv_angle = angle - 3;
+ }
+ }
+
+ b->pal_sz[0] = b->pal_sz[1] = 0;
+ if (f->frame_hdr->allow_screen_content_tools &&
+ imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4)
+ {
+ const int sz_ctx = b_dim[2] + b_dim[3] - 2;
+ if (b->y_mode == DC_PRED) {
+ const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0);
+ const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
+ if (use_y_pal)
+ read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
+ }
+
+ if (has_chroma && b->uv_mode == DC_PRED) {
+ const int pal_ctx = b->pal_sz[0] > 0;
+ const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.pal_uv[pal_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
+ if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
+ read_pal_uv(t, b, sz_ctx, bx4, by4);
+ }
+ }
+
+ if (b->y_mode == DC_PRED && !b->pal_sz[0] &&
+ imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra)
+ {
+ const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.use_filter_intra[bs]);
+ if (is_filter) {
+ b->y_mode = FILTER_PRED;
+ b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.filter_intra, 4);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-filterintramode[%d/%d]: r=%d\n",
+ b->y_mode, b->y_angle, ts->msac.rng);
+ }
+
+ if (b->pal_sz[0]) {
+ uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
+ } else
+ pal_idx = t->scratch.pal_idx;
+ read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
+ }
+
+ if (has_chroma && b->pal_sz[1]) {
+ uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
+ } else
+ pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+ read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
+ }
+
+ const TxfmInfo *t_dim;
+ if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
+ b->tx = b->uvtx = (int) TX_4X4;
+ t_dim = &dav1d_txfm_dimensions[TX_4X4];
+ } else {
+ b->tx = dav1d_max_txfm_size_for_bs[bs][0];
+ b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+ t_dim = &dav1d_txfm_dimensions[b->tx];
+ if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
+ const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
+ uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
+ int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
+ imin(t_dim->max, 2));
+
+ while (depth--) {
+ b->tx = t_dim->sub;
+ t_dim = &dav1d_txfm_dimensions[b->tx];
+ }
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-tx[%d]: r=%d\n", b->tx, ts->msac.rng);
+ }
+
+ // reconstruction
+ if (t->frame_thread.pass == 1) {
+ f->bd_fn.read_coef_blocks(t, bs, b);
+ } else {
+ f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
+ }
+
+ if (f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1])
+ {
+ dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
+ (const uint8_t (*)[8][2])
+ &ts->lflvl[b->seg_id][0][0][0],
+ t->bx, t->by, f->w4, f->h4, bs,
+ b->tx, b->uvtx, f->cur.p.layout,
+ &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+ has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+ has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+ }
+
+ // update contexts
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+ rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+ rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+ rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, 0); \
+ rep_macro(type, t->dir intra, off, mul); \
+ rep_macro(type, t->dir skip, off, mul * b->skip); \
+ /* see aomedia bug 2183 for why we use luma coordinates here */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
+ rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
+ rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
+ rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
+ rep_macro(type, t->dir filter[0], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
+ rep_macro(type, t->dir filter[1], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
+ }
+ const enum IntraPredMode y_mode_nofilt =
+ b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (b->pal_sz[0]) {
+ uint16_t *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
+ for (int x = 0; x < bw4; x++)
+ memcpy(t->al_pal[0][bx4 + x][0], pal, 16);
+ for (int y = 0; y < bh4; y++)
+ memcpy(t->al_pal[1][by4 + y][0], pal, 16);
+ }
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ if (b->pal_sz[1]) {
+ const uint16_t (*const pal)[8] = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) *
+ (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] :
+ t->scratch.pal;
+ // see aomedia bug 2183 for why we use luma coordinates here
+ for (int pl = 1; pl <= 2; pl++) {
+ for (int x = 0; x < bw4; x++)
+ memcpy(t->al_pal[0][bx4 + x][pl], pal[pl], 16);
+ for (int y = 0; y < bh4; y++)
+ memcpy(t->al_pal[1][by4 + y][pl], pal[pl], 16);
+ }
+ }
+ }
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)
+ splat_intraref(f->c, t, bs, bw4, bh4);
+ } else if (IS_KEY_OR_INTRA(f->frame_hdr)) {
+ // intra block copy
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = { 0, -1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ if (mvstack[0].mv.mv[0].n)
+ b->mv[0] = mvstack[0].mv.mv[0];
+ else if (mvstack[1].mv.mv[0].n)
+ b->mv[0] = mvstack[1].mv.mv[0];
+ else {
+ if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
+ b->mv[0].y = 0;
+ b->mv[0].x = -(512 << f->seq_hdr->sb128) - 2048;
+ } else {
+ b->mv[0].y = -(512 << f->seq_hdr->sb128);
+ b->mv[0].x = 0;
+ }
+ }
+
+ const union mv ref = b->mv[0];
+ read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0);
+
+ // clip intrabc motion vector to decoded parts of current tile
+ int border_left = ts->tiling.col_start * 4;
+ int border_top = ts->tiling.row_start * 4;
+ if (has_chroma) {
+ if (bw4 < 2 && ss_hor)
+ border_left += 4;
+ if (bh4 < 2 && ss_ver)
+ border_top += 4;
+ }
+ int src_left = t->bx * 4 + (b->mv[0].x >> 3);
+ int src_top = t->by * 4 + (b->mv[0].y >> 3);
+ int src_right = src_left + bw4 * 4;
+ int src_bottom = src_top + bh4 * 4;
+ const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
+
+ // check against left or right tile boundary and adjust if necessary
+ if (src_left < border_left) {
+ src_right += border_left - src_left;
+ src_left += border_left - src_left;
+ } else if (src_right > border_right) {
+ src_left -= src_right - border_right;
+ src_right -= src_right - border_right;
+ }
+ // check against top tile boundary and adjust if necessary
+ if (src_top < border_top) {
+ src_bottom += border_top - src_top;
+ src_top += border_top - src_top;
+ }
+
+ const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
+ const int sby = (t->by >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
+ const int sb_size = 1 << (6 + f->seq_hdr->sb128);
+ // check for overlap with current superblock
+ if (src_bottom > sby && src_right > sbx) {
+ if (src_top - border_top >= src_bottom - sby) {
+ // if possible, move src up into the previous superblock row
+ src_top -= src_bottom - sby;
+ src_bottom -= src_bottom - sby;
+ } else if (src_left - border_left >= src_right - sbx) {
+ // if possible, move src left into the previous superblock
+ src_left -= src_right - sbx;
+ src_right -= src_right - sbx;
+ }
+ }
+ // move src up if it is below current superblock row
+ if (src_bottom > sby + sb_size) {
+ src_top -= src_bottom - (sby + sb_size);
+ src_bottom -= src_bottom - (sby + sb_size);
+ }
+ // error out if mv still overlaps with the current superblock
+ if (src_bottom > sby && src_right > sbx)
+ return -1;
+
+ b->mv[0].x = (src_left - t->bx * 4) * 8;
+ b->mv[0].y = (src_top - t->by * 4) * 8;
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n",
+ b->mv[0].y, b->mv[0].x, ref.y, ref.x,
+ mvstack[0].mv.mv[0].y, mvstack[0].mv.mv[0].x, ts->msac.rng);
+ read_vartx_tree(t, b, bs, bx4, by4);
+
+ // reconstruction
+ if (t->frame_thread.pass == 1) {
+ f->bd_fn.read_coef_blocks(t, bs, b);
+ b->filter2d = FILTER_2D_BILINEAR;
+ } else {
+ if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+ }
+
+ splat_intrabc_mv(f->c, t, bs, b, bw4, bh4);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+ rep_macro(type, t->dir mode, off, mul * DC_PRED); \
+ rep_macro(type, t->dir pal_sz, off, 0); \
+ /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, 0); \
+ rep_macro(type, t->dir intra, off, 0); \
+ rep_macro(type, t->dir skip, off, mul * b->skip)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ } else {
+ // inter-specific mode/mv coding
+ int is_comp, has_subpel_filter;
+
+ if (b->skip_mode) {
+ is_comp = 1;
+ } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
+ f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1)
+ {
+ const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ is_comp = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp[ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
+ } else {
+ is_comp = 0;
+ }
+
+ if (b->skip_mode) {
+ b->ref[0] = f->frame_hdr->skip_mode_refs[0];
+ b->ref[1] = f->frame_hdr->skip_mode_refs[1];
+ b->comp_type = COMP_INTER_AVG;
+ b->inter_mode = NEARESTMV_NEARESTMV;
+ b->drl_idx = NEAREST_DRL;
+ has_subpel_filter = 0;
+
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = {
+ b->ref[0] + 1, b->ref[1] + 1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ b->mv[0] = mvstack[0].mv.mv[0];
+ b->mv[1] = mvstack[0].mv.mv[1];
+ fix_mv_precision(f->frame_hdr, &b->mv[0]);
+ fix_mv_precision(f->frame_hdr, &b->mv[1]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n",
+ b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
+ b->ref[0], b->ref[1]);
+ } else if (is_comp) {
+ const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_dir[dir_ctx]))
+ {
+ // bidir - first reference (fw)
+ const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_fwd_ref[0][ctx1]))
+ {
+ const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_fwd_ref[2][ctx2]);
+ } else {
+ const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_fwd_ref[1][ctx2]);
+ }
+
+ // second reference (bw)
+ const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_bwd_ref[0][ctx3]))
+ {
+ b->ref[1] = 6;
+ } else {
+ const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_bwd_ref[1][ctx4]);
+ }
+ } else {
+ // unidir
+ const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_uni_ref[0][uctx_p]))
+ {
+ b->ref[0] = 4;
+ b->ref[1] = 6;
+ } else {
+ const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 0;
+ b->ref[1] = 1 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_uni_ref[1][uctx_p1]);
+ if (b->ref[1] == 2) {
+ const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.comp_uni_ref[2][uctx_p2]);
+ }
+ }
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-refs[%d/%d]: r=%d\n",
+ b->ref[0], b->ref[1], ts->msac.rng);
+
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = {
+ b->ref[0] + 1, b->ref[1] + 1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.comp_inter_mode[ctx],
+ N_COMP_INTER_PRED_MODES - 1);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
+ b->inter_mode, ctx, n_mvs, ts->msac.rng);
+
+ const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
+ b->drl_idx = NEAREST_DRL;
+ if (b->inter_mode == NEWMV_NEWMV) {
+ if (n_mvs > 1) { // NEARER, NEAR or NEARISH
+ const int drl_ctx_v1 = get_drl_context(mvstack, 0);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v1]);
+ if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
+ b->drl_idx, n_mvs, ts->msac.rng);
+ }
+ } else if (im[0] == NEARMV || im[1] == NEARMV) {
+ b->drl_idx = NEARER_DRL;
+ if (n_mvs > 2) { // NEAR or NEARISH
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
+ const int drl_ctx_v3 = get_drl_context(mvstack, 2);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v3]);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
+ b->drl_idx, n_mvs, ts->msac.rng);
+ }
+ }
+ assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+
+#define assign_comp_mv(idx) \
+ switch (im[idx]) { \
+ case NEARMV: \
+ case NEARESTMV: \
+ b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
+ fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
+ break; \
+ case GLOBALMV: \
+ has_subpel_filter |= \
+ f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
+ b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
+ t->bx, t->by, bw4, bh4, f->frame_hdr); \
+ break; \
+ case NEWMV: \
+ b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
+ read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \
+ !f->frame_hdr->force_integer_mv); \
+ break; \
+ }
+ has_subpel_filter = imin(bw4, bh4) == 1 ||
+ b->inter_mode != GLOBALMV_GLOBALMV;
+ assign_comp_mv(0);
+ assign_comp_mv(1);
+#undef assign_comp_mv
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n",
+ b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
+ ts->msac.rng);
+
+ // jnt_comp vs. seg vs. wedge
+ int is_segwedge = 0;
+ if (f->seq_hdr->masked_compound) {
+ const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4);
+
+ is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.mask_comp[mask_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n",
+ is_segwedge, mask_ctx, ts->msac.rng);
+ }
+
+ if (!is_segwedge) {
+ if (f->seq_hdr->jnt_comp) {
+ const int jnt_ctx =
+ get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits,
+ f->cur.frame_hdr->frame_offset,
+ f->refp[b->ref[0]].p.frame_hdr->frame_offset,
+ f->refp[b->ref[1]].p.frame_hdr->frame_offset,
+ t->a, &t->l, by4, bx4);
+ b->comp_type = COMP_INTER_WEIGHTED_AVG +
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.jnt_comp[jnt_ctx]);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n",
+ b->comp_type == COMP_INTER_AVG,
+ jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4],
+ t->l.comp_type[by4], t->l.ref[0][by4],
+ ts->msac.rng);
+ } else {
+ b->comp_type = COMP_INTER_AVG;
+ }
+ } else {
+ if (wedge_allowed_mask & (1 << bs)) {
+ const int ctx = dav1d_wedge_ctx_lut[bs];
+ b->comp_type = COMP_INTER_WEDGE -
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.wedge_comp[ctx]);
+ if (b->comp_type == COMP_INTER_WEDGE)
+ b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.wedge_idx[ctx], 15);
+ } else {
+ b->comp_type = COMP_INTER_SEG;
+ }
+ b->mask_sign = dav1d_msac_decode_bool_equi(&ts->msac);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
+ b->comp_type == COMP_INTER_WEDGE,
+ b->wedge_idx, b->mask_sign, ts->msac.rng);
+ }
+ } else {
+ b->comp_type = COMP_INTER_NONE;
+
+ // ref
+ if (seg && seg->ref > 0) {
+ b->ref[0] = seg->ref - 1;
+ } else if (seg && (seg->globalmv || seg->skip)) {
+ b->ref[0] = 0;
+ } else {
+ const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[0][ctx1]))
+ {
+ const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[1][ctx2]))
+ {
+ b->ref[0] = 6;
+ } else {
+ const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[5][ctx3]);
+ }
+ } else {
+ const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[2][ctx2]))
+ {
+ const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[4][ctx3]);
+ } else {
+ const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
+ have_top, have_left);
+ b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.ref[3][ctx3]);
+ }
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
+ }
+ b->ref[1] = -1;
+
+ refmvs_candidate mvstack[8];
+ int n_mvs, ctx;
+ dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
+ (union refmvs_refpair) { .ref = { b->ref[0] + 1, -1 }},
+ bs, intra_edge_flags, t->by, t->bx);
+
+ // mode parsing and mv derivation from ref_mvs
+ if ((seg && (seg->skip || seg->globalmv)) ||
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.newmv_mode[ctx & 7]))
+ {
+ if ((seg && (seg->skip || seg->globalmv)) ||
+ !dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
+ {
+ b->inter_mode = GLOBALMV;
+ b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]],
+ t->bx, t->by, bw4, bh4, f->frame_hdr);
+ has_subpel_filter = imin(bw4, bh4) == 1 ||
+ f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION;
+ } else {
+ has_subpel_filter = 1;
+ if (dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
+ { // NEAREST, NEARER, NEAR or NEARISH
+ b->inter_mode = NEARMV;
+ b->drl_idx = NEARER_DRL;
+ if (n_mvs > 2) { // NEARER, NEAR or NEARISH
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
+ const int drl_ctx_v3 =
+ get_drl_context(mvstack, 2);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v3]);
+ }
+ }
+ } else {
+ b->inter_mode = NEARESTMV;
+ b->drl_idx = NEAREST_DRL;
+ }
+ assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+ b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
+ if (b->drl_idx < NEAR_DRL)
+ fix_mv_precision(f->frame_hdr, &b->mv[0]);
+ }
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intermode[%d,drl=%d,mv=y:%d,x:%d,n_mvs=%d]: r=%d\n",
+ b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs,
+ ts->msac.rng);
+ } else {
+ has_subpel_filter = 1;
+ b->inter_mode = NEWMV;
+ b->drl_idx = NEAREST_DRL;
+ if (n_mvs > 1) { // NEARER, NEAR or NEARISH
+ const int drl_ctx_v1 = get_drl_context(mvstack, 0);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v1]);
+ if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
+ const int drl_ctx_v2 = get_drl_context(mvstack, 1);
+ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.drl_bit[drl_ctx_v2]);
+ }
+ }
+ assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
+ if (n_mvs > 1) {
+ b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
+ } else {
+ assert(!b->drl_idx);
+ b->mv[0] = mvstack[0].mv.mv[0];
+ fix_mv_precision(f->frame_hdr, &b->mv[0]);
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-intermode[%d,drl=%d]: r=%d\n",
+ b->inter_mode, b->drl_idx, ts->msac.rng);
+ read_mv_residual(t, &b->mv[0], &ts->cdf.mv,
+ !f->frame_hdr->force_integer_mv);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
+ b->mv[0].y, b->mv[0].x, ts->msac.rng);
+ }
+
+ // interintra flags
+ const int ii_sz_grp = dav1d_ymode_size_context[bs];
+ if (f->seq_hdr->inter_intra &&
+ interintra_allowed_mask & (1 << bs) &&
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.interintra[ii_sz_grp]))
+ {
+ b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.interintra_mode[ii_sz_grp],
+ N_INTER_INTRA_PRED_MODES - 1);
+ const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
+ b->interintra_type = INTER_INTRA_BLEND +
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.interintra_wedge[wedge_ctx]);
+ if (b->interintra_type == INTER_INTRA_WEDGE)
+ b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.wedge_idx[wedge_ctx], 15);
+ } else {
+ b->interintra_type = INTER_INTRA_NONE;
+ }
+ if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra &&
+ interintra_allowed_mask & (1 << bs))
+ {
+ printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n",
+ b->interintra_type, b->interintra_mode,
+ b->wedge_idx, ts->msac.rng);
+ }
+
+ // motion variation
+ if (f->frame_hdr->switchable_motion_mode &&
+ b->interintra_type == INTER_INTRA_NONE && imin(bw4, bh4) >= 2 &&
+ // is not warped global motion
+ !(!f->frame_hdr->force_integer_mv && b->inter_mode == GLOBALMV &&
+ f->frame_hdr->gmv[b->ref[0]].type > DAV1D_WM_TYPE_TRANSLATION) &&
+ // has overlappable neighbours
+ ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) ||
+ (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1))))
+ {
+ // reaching here means the block allows obmc - check warp by
+ // finding matching-ref blocks in top/left edges
+ uint64_t mask[2] = { 0, 0 };
+ find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
+ have_left, have_top, b->ref[0], mask);
+ const int allow_warp = !f->svc[b->ref[0]][0].scale &&
+ !f->frame_hdr->force_integer_mv &&
+ f->frame_hdr->warp_motion && (mask[0] | mask[1]);
+
+ b->motion_mode = allow_warp ?
+ dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.motion_mode[bs], 2) :
+ dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
+ if (b->motion_mode == MM_WARP) {
+ has_subpel_filter = 0;
+ derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
+#define signabs(v) v < 0 ? '-' : ' ', abs(v)
+ if (DEBUG_BLOCK_INFO)
+ printf("[ %c%x %c%x %c%x\n %c%x %c%x %c%x ]\n"
+ "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, "
+ "mv=y:%d,x:%d\n",
+ signabs(t->warpmv.matrix[0]),
+ signabs(t->warpmv.matrix[1]),
+ signabs(t->warpmv.matrix[2]),
+ signabs(t->warpmv.matrix[3]),
+ signabs(t->warpmv.matrix[4]),
+ signabs(t->warpmv.matrix[5]),
+ signabs(t->warpmv.u.p.alpha),
+ signabs(t->warpmv.u.p.beta),
+ signabs(t->warpmv.u.p.gamma),
+ signabs(t->warpmv.u.p.delta),
+ b->mv[0].y, b->mv[0].x);
+#undef signabs
+ if (t->frame_thread.pass) {
+ if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) {
+ b->matrix[0] = t->warpmv.matrix[2] - 0x10000;
+ b->matrix[1] = t->warpmv.matrix[3];
+ b->matrix[2] = t->warpmv.matrix[4];
+ b->matrix[3] = t->warpmv.matrix[5] - 0x10000;
+ } else {
+ b->matrix[0] = SHRT_MIN;
+ }
+ }
+ }
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIx64 "/0x%"
+ PRIx64 "]\n", b->motion_mode, ts->msac.rng, mask[0],
+ mask[1]);
+ } else {
+ b->motion_mode = MM_TRANSLATION;
+ }
+ }
+
+ // subpel filter
+ enum Dav1dFilterMode filter[2];
+ if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) {
+ if (has_subpel_filter) {
+ const int comp = b->comp_type != COMP_INTER_NONE;
+ const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
+ by4, bx4);
+ filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.filter[0][ctx1],
+ DAV1D_N_SWITCHABLE_FILTERS - 1);
+ if (f->seq_hdr->dual_filter) {
+ const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
+ b->ref[0], by4, bx4);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
+ filter[0], ctx1, ts->msac.rng);
+ filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.filter[1][ctx2],
+ DAV1D_N_SWITCHABLE_FILTERS - 1);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
+ filter[1], ctx2, ts->msac.rng);
+ } else {
+ filter[1] = filter[0];
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n",
+ filter[0], ctx1, ts->msac.rng);
+ }
+ } else {
+ filter[0] = filter[1] = DAV1D_FILTER_8TAP_REGULAR;
+ }
+ } else {
+ filter[0] = filter[1] = f->frame_hdr->subpel_filter_mode;
+ }
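+ // fold the two per-direction filters into the single 2D filter enum
+ // consumed by the motion compensation code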
+ b->filter2d = dav1d_filter_2d[filter[1]][filter[0]];
+
+ read_vartx_tree(t, b, bs, bx4, by4);
+
+ // reconstruction
+ if (t->frame_thread.pass == 1) {
+ f->bd_fn.read_coef_blocks(t, bs, b);
+ } else {
+ if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
+ }
+
+ if (f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1])
+ {
+ const int is_globalmv =
+ b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
+ const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
+ &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
+ const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+ enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx;
+ if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
+ ytx = (enum RectTxfmSize) TX_4X4;
+ uvtx = (enum RectTxfmSize) TX_4X4;
+ }
+ dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
+ t->bx, t->by, f->w4, f->h4, b->skip, bs,
+ ytx, tx_split, uvtx, f->cur.p.layout,
+ &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
+ has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
+ has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
+ }
+
+ // context updates
+ if (is_comp)
+ splat_tworef_mv(f->c, t, bs, b, bw4, bh4);
+ else
+ splat_oneref_mv(f->c, t, bs, b, bw4, bh4);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+ rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
+ rep_macro(type, t->dir intra, off, 0); \
+ rep_macro(type, t->dir skip, off, mul * b->skip); \
+ rep_macro(type, t->dir pal_sz, off, 0); \
+ /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+ rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+ rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+ rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
+ rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+ rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+ rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
+ rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
+ rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ }
+
+ // update contexts
+ if (f->frame_hdr->segmentation.enabled &&
+ f->frame_hdr->segmentation.update_map)
+ {
+ uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < bh4; y++) { \
+ rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
+ seg_ptr += f->b4_stride; \
+ }
+ case_set(bw4, NULL, 0, 0);
+#undef set_ctx
+ }
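+ // The no-skip mask has one entry per pair of 4x4 rows; each entry holds
+ // two 16-bit halves covering the left/right 64px of a 128px superblock,
+ // one bit per 4px column, so 32-wide (128px) blocks span both halves.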
+ if (!b->skip) {
+ uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
+ const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
+ const int bx_idx = (bx4 & 16) >> 4;
+ for (int y = 0; y < bh4; y += 2, noskip_mask++) {
+ (*noskip_mask)[bx_idx] |= mask;
+ if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
+ (*noskip_mask)[1] |= mask;
+ }
+ }
+
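+ // In pass 1 of 2-pass frame threading, record per reference and per
+ // plane the lowest source pixel row that this block's inter prediction
+ // may read, so the task scheduler can gate reconstruction on how far
+ // each reference frame has progressed.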
+ if (t->frame_thread.pass == 1 && !b->intra && IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
+ int (*const lowest_px)[2] = ts->lowest_pixel[sby];
+
+ // keep track of motion vectors for each reference
+ if (b->comp_type == COMP_INTER_NONE) {
+ // y
+ if (imin(bw4, bh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ affine_lowest_px_luma(t, &lowest_px[b->ref[0]][0], b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y,
+ 0, &f->svc[b->ref[0]][1]);
+ if (b->motion_mode == MM_OBMC) {
+ obmc_lowest_px(t, lowest_px, 0, b_dim, bx4, by4, w4, h4);
+ }
+ }
+
+ // uv
+ if (has_chroma) {
+ // sub8x8 derivation
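+ // (with chroma subsampling, blocks smaller than the chroma unit share
+ // a chroma block with their left/top neighbours, so those neighbours'
+ // references and MVs have to be tracked here as well)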
+ int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+ refmvs_block *const *r;
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ r = &t->rt.r[(t->by & 31) + 5];
+ if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
+ if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
+ if (bw4 == 1 && bh4 == ss_ver)
+ is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
+ }
+
+ // chroma prediction
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ if (bw4 == 1 && bh4 == ss_ver) {
+ const refmvs_block *const rr = &r[-1][t->bx - 1];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ if (bw4 == 1) {
+ const refmvs_block *const rr = &r[0][t->bx - 1];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ if (bh4 == ss_ver) {
+ const refmvs_block *const rr = &r[-1][t->bx];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ mc_lowest_px(&lowest_px[b->ref[0]][1], t->by, bh4,
+ b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
+ } else {
+ if (imin(cbw4, cbh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ affine_lowest_px_chroma(t, &lowest_px[b->ref[0]][1], b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[0]][1],
+ t->by & ~ss_ver, bh4 << (bh4 == ss_ver),
+ b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
+ if (b->motion_mode == MM_OBMC) {
+ obmc_lowest_px(t, lowest_px, 1, b_dim, bx4, by4, w4, h4);
+ }
+ }
+ }
+ }
+ } else {
+ // y
+ for (int i = 0; i < 2; i++) {
+ if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
+ affine_lowest_px_luma(t, &lowest_px[b->ref[i]][0], b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4,
+ b->mv[i].y, 0, &f->svc[b->ref[i]][1]);
+ }
+ }
+
+ // uv
+ if (has_chroma) for (int i = 0; i < 2; i++) {
+ if (b->inter_mode == GLOBALMV_GLOBALMV &&
+ imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
+ {
+ affine_lowest_px_chroma(t, &lowest_px[b->ref[i]][1], b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4,
+ b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]);
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+#if __has_feature(memory_sanitizer)
+
+#include <sanitizer/msan_interface.h>
+
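+// MSan-only wrapper around decode_b(): once a block has been fully
+// reconstructed (i.e. not during the coefficient-only pass), verify that
+// every pixel row written for it is initialized, to catch uninitialized
+// reads in the reconstruction paths.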
+static int checked_decode_b(Dav1dTaskContext *const t,
+ const enum BlockLevel bl,
+ const enum BlockSize bs,
+ const enum BlockPartition bp,
+ const enum EdgeFlags intra_edge_flags)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const int err = decode_b(t, bl, bs, bp, intra_edge_flags);
+
+ if (err == 0 && !(t->frame_thread.pass & 1)) {
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int has_chroma = f->seq_hdr->layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+
+ for (int p = 0; p < 1 + 2 * has_chroma; p++) {
+ const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const ptrdiff_t stride = f->cur.stride[!!p];
+ const int bx = t->bx & ~ss_hor;
+ const int by = t->by & ~ss_ver;
+ const int width = w4 << (2 - ss_hor + (bw4 == ss_hor));
+ const int height = h4 << (2 - ss_ver + (bh4 == ss_ver));
+
+ const uint8_t *data = f->cur.data[p] + (by << (2 - ss_ver)) * stride +
+ (bx << (2 - ss_hor + !!f->seq_hdr->hbd));
+
+ for (int y = 0; y < height; data += stride, y++) {
+ const size_t line_sz = width << !!f->seq_hdr->hbd;
+ if (__msan_test_shadow(data, line_sz) != -1) {
+ fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n",
+ p, bx, by, w4, h4, y);
+ __msan_check_mem_is_initialized(data, line_sz);
+ }
+ }
+ }
+ }
+
+ return err;
+}
+
+#define decode_b checked_decode_b
+
+#endif /* __has_feature(memory_sanitizer) */
+
+static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl,
+ const EdgeNode *const node)
+{
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+ const int hsz = 16 >> bl;
+ const int have_h_split = f->bw > t->bx + hsz;
+ const int have_v_split = f->bh > t->by + hsz;
+
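+ // hsz is half the block size at this level in 4x4 units (16 at the
+ // 128x128 level); if neither split position lies inside the frame,
+ // only the top-left quadrant exists, so recurse into it without
+ // coding a partition symbol.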
+ if (!have_h_split && !have_v_split) {
+ assert(bl < BL_8X8);
+ return decode_sb(t, bl + 1, ((const EdgeBranch *) node)->split[0]);
+ }
+
+ uint16_t *pc;
+ enum BlockPartition bp;
+ int ctx, bx8, by8;
+ if (t->frame_thread.pass != 2) {
+ if (0 && bl == BL_64X64)
+ printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ts->msac.rng);
+ bx8 = (t->bx & 31) >> 1;
+ by8 = (t->by & 31) >> 1;
+ ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8);
+ pc = ts->cdf.m.partition[bl][ctx];
+ }
+
+ if (have_h_split && have_v_split) {
+ if (t->frame_thread.pass == 2) {
+ const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+ bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
+ } else {
+ bp = dav1d_msac_decode_symbol_adapt16(&ts->msac, pc,
+ dav1d_partition_type_count[bl]);
+ if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
+ (bp == PARTITION_V || bp == PARTITION_V4 ||
+ bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
+ {
+ return 1;
+ }
+ if (DEBUG_BLOCK_INFO)
+ printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp,
+ ts->msac.rng);
+ }
+ const uint8_t *const b = dav1d_block_sizes[bl][bp];
+
+ switch (bp) {
+ case PARTITION_NONE:
+ if (decode_b(t, bl, b[0], PARTITION_NONE, node->o))
+ return -1;
+ break;
+ case PARTITION_H:
+ if (decode_b(t, bl, b[0], PARTITION_H, node->h[0]))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_H, node->h[1]))
+ return -1;
+ t->by -= hsz;
+ break;
+ case PARTITION_V:
+ if (decode_b(t, bl, b[0], PARTITION_V, node->v[0]))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_V, node->v[1]))
+ return -1;
+ t->bx -= hsz;
+ break;
+ case PARTITION_SPLIT:
+ if (bl == BL_8X8) {
+ const EdgeTip *const tip = (const EdgeTip *) node;
+ assert(hsz == 1);
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0]))
+ return -1;
+ const enum Filter2d tl_filter = t->tl_4x4_filter;
+ t->bx++;
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1]))
+ return -1;
+ t->bx--;
+ t->by++;
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2]))
+ return -1;
+ t->bx++;
+ t->tl_4x4_filter = tl_filter;
+ if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[3]))
+ return -1;
+ t->bx--;
+ t->by--;
+#if ARCH_X86_64
+ if (t->frame_thread.pass) {
+ /* In 8-bit mode with 2-pass decoding the coefficient buffer
+ * can end up misaligned due to skips here. Work around
+ * the issue by explicitly realigning the buffer. */
+ const int p = t->frame_thread.pass & 1;
+ ts->frame_thread[p].cf =
+ (void*)(((uintptr_t)ts->frame_thread[p].cf + 63) & ~63);
+ }
+#endif
+ } else {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_sb(t, bl + 1, branch->split[0]))
+ return 1;
+ t->bx += hsz;
+ if (decode_sb(t, bl + 1, branch->split[1]))
+ return 1;
+ t->bx -= hsz;
+ t->by += hsz;
+ if (decode_sb(t, bl + 1, branch->split[2]))
+ return 1;
+ t->bx += hsz;
+ if (decode_sb(t, bl + 1, branch->split[3]))
+ return 1;
+ t->bx -= hsz;
+ t->by -= hsz;
+ }
+ break;
+ case PARTITION_T_TOP_SPLIT: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[0]))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[1]))
+ return -1;
+ t->bx -= hsz;
+ t->by += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, branch->tts[2]))
+ return -1;
+ t->by -= hsz;
+ break;
+ }
+ case PARTITION_T_BOTTOM_SPLIT: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, branch->tbs[0]))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[1]))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[2]))
+ return -1;
+ t->bx -= hsz;
+ t->by -= hsz;
+ break;
+ }
+ case PARTITION_T_LEFT_SPLIT: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[0]))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[1]))
+ return -1;
+ t->by -= hsz;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, branch->tls[2]))
+ return -1;
+ t->bx -= hsz;
+ break;
+ }
+ case PARTITION_T_RIGHT_SPLIT: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, branch->trs[0]))
+ return -1;
+ t->bx += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[1]))
+ return -1;
+ t->by += hsz;
+ if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[2]))
+ return -1;
+ t->by -= hsz;
+ t->bx -= hsz;
+ break;
+ }
+ case PARTITION_H4: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[0]))
+ return -1;
+ t->by += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[1]))
+ return -1;
+ t->by += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[2]))
+ return -1;
+ t->by += hsz >> 1;
+ if (t->by < f->bh)
+ if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[3]))
+ return -1;
+ t->by -= hsz * 3 >> 1;
+ break;
+ }
+ case PARTITION_V4: {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[0]))
+ return -1;
+ t->bx += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[1]))
+ return -1;
+ t->bx += hsz >> 1;
+ if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[2]))
+ return -1;
+ t->bx += hsz >> 1;
+ if (t->bx < f->bw)
+ if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[3]))
+ return -1;
+ t->bx -= hsz * 3 >> 1;
+ break;
+ }
+ default: assert(0);
+ }
+ } else if (have_h_split) {
+ unsigned is_split;
+ if (t->frame_thread.pass == 2) {
+ const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+ is_split = b->bl != bl;
+ } else {
+ is_split = dav1d_msac_decode_bool(&ts->msac,
+ gather_top_partition_prob(pc, bl));
+ if (DEBUG_BLOCK_INFO)
+ printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+ is_split ? PARTITION_SPLIT : PARTITION_H, ts->msac.rng);
+ }
+
+ assert(bl < BL_8X8);
+ if (is_split) {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ bp = PARTITION_SPLIT;
+ if (decode_sb(t, bl + 1, branch->split[0])) return 1;
+ t->bx += hsz;
+ if (decode_sb(t, bl + 1, branch->split[1])) return 1;
+ t->bx -= hsz;
+ } else {
+ bp = PARTITION_H;
+ if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_H][0],
+ PARTITION_H, node->h[0]))
+ return -1;
+ }
+ } else {
+ assert(have_v_split);
+ unsigned is_split;
+ if (t->frame_thread.pass == 2) {
+ const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
+ is_split = b->bl != bl;
+ } else {
+ is_split = dav1d_msac_decode_bool(&ts->msac,
+ gather_left_partition_prob(pc, bl));
+ if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
+ return 1;
+ if (DEBUG_BLOCK_INFO)
+ printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
+ f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+ is_split ? PARTITION_SPLIT : PARTITION_V, ts->msac.rng);
+ }
+
+ assert(bl < BL_8X8);
+ if (is_split) {
+ const EdgeBranch *const branch = (const EdgeBranch *) node;
+ bp = PARTITION_SPLIT;
+ if (decode_sb(t, bl + 1, branch->split[0])) return 1;
+ t->by += hsz;
+ if (decode_sb(t, bl + 1, branch->split[2])) return 1;
+ t->by -= hsz;
+ } else {
+ bp = PARTITION_V;
+ if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_V][0],
+ PARTITION_V, node->v[0]))
+ return -1;
+ }
+ }
+
+ if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
+ rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
+ case_set_upto16(hsz,,,);
+#undef set_ctx
+ }
+
+ return 0;
+}
+
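+// Reset the above/left block contexts to their tile-start defaults; in
+// pass 2 (reconstruction only) just the intra/mode fields need resetting.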
+static void reset_context(BlockContext *const ctx, const int keyframe, const int pass) {
+ memset(ctx->intra, keyframe, sizeof(ctx->intra));
+ memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode));
+ if (keyframe)
+ memset(ctx->mode, DC_PRED, sizeof(ctx->mode));
+
+ if (pass == 2) return;
+
+ memset(ctx->partition, 0, sizeof(ctx->partition));
+ memset(ctx->skip, 0, sizeof(ctx->skip));
+ memset(ctx->skip_mode, 0, sizeof(ctx->skip_mode));
+ memset(ctx->tx_lpf_y, 2, sizeof(ctx->tx_lpf_y));
+ memset(ctx->tx_lpf_uv, 1, sizeof(ctx->tx_lpf_uv));
+ memset(ctx->tx_intra, -1, sizeof(ctx->tx_intra));
+ memset(ctx->tx, TX_64X64, sizeof(ctx->tx));
+ if (!keyframe) {
+ memset(ctx->ref, -1, sizeof(ctx->ref));
+ memset(ctx->comp_type, 0, sizeof(ctx->comp_type));
+ memset(ctx->mode, NEARESTMV, sizeof(ctx->mode));
+ }
+ memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef));
+ memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef));
+ memset(ctx->filter, DAV1D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
+ memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred));
+ memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
+}
+
+// { Y+U+V, Y+U } * 4
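+// e.g. 4:2:0: (1 + 1/4 + 1/4) * 4 = 6 and (1 + 1/4) * 4 = 5,
+//      4:2:2: (1 + 1/2 + 1/2) * 4 = 8 and (1 + 1/2) * 4 = 6,
+//      4:4:4: (1 + 1 + 1) * 4 = 12 and (1 + 1) * 4 = 8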
+static const uint8_t ss_size_mul[4][2] = {
+ [DAV1D_PIXEL_LAYOUT_I400] = { 4, 4 },
+ [DAV1D_PIXEL_LAYOUT_I420] = { 6, 5 },
+ [DAV1D_PIXEL_LAYOUT_I422] = { 8, 6 },
+ [DAV1D_PIXEL_LAYOUT_I444] = { 12, 8 },
+};
+
+static void setup_tile(Dav1dTileState *const ts,
+ const Dav1dFrameContext *const f,
+ const uint8_t *const data, const size_t sz,
+ const int tile_row, const int tile_col,
+ const int tile_start_off)
+{
+ const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
+ const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
+ const int col_sb_end = f->frame_hdr->tiling.col_start_sb[tile_col + 1];
+ const int row_sb_start = f->frame_hdr->tiling.row_start_sb[tile_row];
+ const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];
+ const int sb_shift = f->sb_shift;
+
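+ // Point this tile's coefficient and palette-index pointers into the
+ // frame-wide buffers; tile_start_off (computed in dav1d_decode_frame_init())
+ // is scaled by the layout-dependent ss_size_mul entries.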
+ const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+ for (int p = 0; p < 2; p++) {
+ ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
+ &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
+ NULL;
+ ts->frame_thread[p].cf = f->frame_thread.cf ?
+ (uint8_t*)f->frame_thread.cf +
+ (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
+ NULL;
+ }
+
+ dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
+ ts->last_qidx = f->frame_hdr->quant.yac;
+ memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
+
+ dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
+
+ ts->tiling.row = tile_row;
+ ts->tiling.col = tile_col;
+ ts->tiling.col_start = col_sb_start << sb_shift;
+ ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
+ ts->tiling.row_start = row_sb_start << sb_shift;
+ ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);
+
+ // Reference Restoration Unit (used for exp coding)
+ int sb_idx, unit_idx;
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ // vertical components only
+ sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
+ unit_idx = (ts->tiling.row_start & 16) >> 3;
+ } else {
+ sb_idx = (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start;
+ unit_idx = ((ts->tiling.row_start & 16) >> 3) +
+ ((ts->tiling.col_start & 16) >> 4);
+ }
+ for (int p = 0; p < 3; p++) {
+ if (!((f->lf.restore_planes >> p) & 1U))
+ continue;
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int d = f->frame_hdr->super_res.width_scale_denominator;
+ const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
+ const int rnd = (8 << unit_size_log2) - 1, shift = unit_size_log2 + 3;
+ const int x = ((4 * ts->tiling.col_start * d >> ss_hor) + rnd) >> shift;
+ const int px_x = x << (unit_size_log2 + ss_hor);
+ const int u_idx = unit_idx + ((px_x & 64) >> 6);
+ const int sb128x = px_x >> 7;
+ if (sb128x >= f->sr_sb128w) continue;
+ ts->lr_ref[p] = &f->lf.lr_mask[sb_idx + sb128x].lr[p][u_idx];
+ } else {
+ ts->lr_ref[p] = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+ }
+
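+ // Seed the reference unit with baseline Wiener taps and SGR weights;
+ // read_restoration_info() codes each unit as a delta against this
+ // reference and then updates it.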
+ ts->lr_ref[p]->filter_v[0] = 3;
+ ts->lr_ref[p]->filter_v[1] = -7;
+ ts->lr_ref[p]->filter_v[2] = 15;
+ ts->lr_ref[p]->filter_h[0] = 3;
+ ts->lr_ref[p]->filter_h[1] = -7;
+ ts->lr_ref[p]->filter_h[2] = 15;
+ ts->lr_ref[p]->sgr_weights[0] = -32;
+ ts->lr_ref[p]->sgr_weights[1] = 31;
+ }
+
+ if (f->c->n_tc > 1) {
+ for (int p = 0; p < 2; p++)
+ atomic_init(&ts->progress[p], row_sb_start);
+ }
+}
+
+static void read_restoration_info(Dav1dTaskContext *const t,
+ Av1RestorationUnit *const lr, const int p,
+ const enum Dav1dRestorationType frame_type)
+{
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+
+ if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
+ const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.restore_switchable, 2);
+ lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
+ DAV1D_RESTORATION_WIENER :
+ DAV1D_RESTORATION_NONE;
+ } else {
+ const unsigned type =
+ dav1d_msac_decode_bool_adapt(&ts->msac,
+ frame_type == DAV1D_RESTORATION_WIENER ?
+ ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj);
+ lr->type = type ? frame_type : DAV1D_RESTORATION_NONE;
+ }
+
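+ // Wiener taps and SGR weights are coded as subexponential deltas
+ // against the reference unit (ts->lr_ref), each recentred around its
+ // reference value with per-coefficient range and precision.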
+ if (lr->type == DAV1D_RESTORATION_WIENER) {
+ lr->filter_v[0] = p ? 0 :
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5;
+ lr->filter_v[1] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23;
+ lr->filter_v[2] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17;
+
+ lr->filter_h[0] = p ? 0 :
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5;
+ lr->filter_h[1] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23;
+ lr->filter_h[2] =
+ dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17;
+ memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
+ ts->lr_ref[p] = lr;
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
+ p, lr->filter_v[0], lr->filter_v[1],
+ lr->filter_v[2], lr->filter_h[0],
+ lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
+ } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
+ const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
+ const uint16_t *const sgr_params = dav1d_sgr_params[idx];
+ lr->sgr_idx = idx;
+ lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
+ lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
+ ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
+ memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
+ memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
+ ts->lr_ref[p] = lr;
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
+ p, lr->sgr_idx, lr->sgr_weights[0],
+ lr->sgr_weights[1], ts->msac.rng);
+ }
+}
+
+int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
+ const Dav1dFrameContext *const f = t->f;
+ const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dContext *const c = f->c;
+ const int sb_step = f->sb_step;
+ const int tile_row = ts->tiling.row, tile_col = ts->tiling.col;
+ const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
+ const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
+
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
+ dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
+ ts->tiling.col_end, ts->tiling.row_start,
+ ts->tiling.row_end, t->by >> f->sb_shift,
+ ts->tiling.row, t->frame_thread.pass);
+ }
+
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) {
+ const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
+ int (*const lowest_px)[2] = ts->lowest_pixel[sby];
+ for (int n = 0; n < 7; n++)
+ for (int m = 0; m < 2; m++)
+ lowest_px[n][m] = INT_MIN;
+ }
+
+ reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), t->frame_thread.pass);
+ if (t->frame_thread.pass == 2) {
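+ // With both frame and tile threading active, a second copy of the
+ // above-context array (see the a_sz allocation in
+ // dav1d_decode_frame_init()) is used for the reconstruction pass.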
+ const int off_2pass = c->n_tc > 1 ? f->sb128w * f->frame_hdr->tiling.rows : 0;
+ for (t->bx = ts->tiling.col_start,
+ t->a = f->a + off_2pass + col_sb128_start + tile_row * f->sb128w;
+ t->bx < ts->tiling.col_end; t->bx += sb_step)
+ {
+ if (atomic_load_explicit(c->flush, memory_order_acquire))
+ return 1;
+ if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
+ return 1;
+ if (t->bx & 16 || f->seq_hdr->sb128)
+ t->a++;
+ }
+ f->bd_fn.backup_ipred_edge(t);
+ return 0;
+ }
+
+ // error out on symbol decoder overread
+ if (ts->msac.cnt < -15) return 1;
+
+ if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
+ f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row,
+ ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+ t->by >> 1, (t->by + sb_step) >> 1);
+ }
+ memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
+ const int sb128y = t->by >> 5;
+ for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
+ t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
+ t->bx < ts->tiling.col_end; t->bx += sb_step)
+ {
+ if (atomic_load_explicit(c->flush, memory_order_acquire))
+ return 1;
+ if (root_bl == BL_128X128) {
+ t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
+ t->cur_sb_cdef_idx_ptr[0] = -1;
+ t->cur_sb_cdef_idx_ptr[1] = -1;
+ t->cur_sb_cdef_idx_ptr[2] = -1;
+ t->cur_sb_cdef_idx_ptr[3] = -1;
+ } else {
+ t->cur_sb_cdef_idx_ptr =
+ &t->lf_mask->cdef_idx[((t->bx & 16) >> 4) +
+ ((t->by & 16) >> 3)];
+ t->cur_sb_cdef_idx_ptr[0] = -1;
+ }
+ // Restoration filter
+ for (int p = 0; p < 3; p++) {
+ if (!((f->lf.restore_planes >> p) & 1U))
+ continue;
+
+ const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
+ const int y = t->by * 4 >> ss_ver;
+ const int h = (f->cur.p.h + ss_ver) >> ss_ver;
+
+ const int unit_size = 1 << unit_size_log2;
+ const unsigned mask = unit_size - 1;
+ if (y & mask) continue;
+ const int half_unit = unit_size >> 1;
+ // Round half up at frame boundaries, if there's more than one
+ // restoration unit
+ if (y && y + half_unit > h) continue;
+
+ const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
+
+ const int d = f->frame_hdr->super_res.width_scale_denominator;
+ const int rnd = unit_size * 8 - 1, shift = unit_size_log2 + 3;
+ const int x0 = ((4 * t->bx * d >> ss_hor) + rnd) >> shift;
+ const int x1 = ((4 * (t->bx + sb_step) * d >> ss_hor) + rnd) >> shift;
+
+ for (int x = x0; x < imin(x1, n_units); x++) {
+ const int px_x = x << (unit_size_log2 + ss_hor);
+ const int sb_idx = (t->by >> 5) * f->sr_sb128w + (px_x >> 7);
+ const int unit_idx = ((t->by & 16) >> 3) + ((px_x & 64) >> 6);
+ Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+
+ read_restoration_info(t, lr, p, frame_type);
+ }
+ } else {
+ const int x = 4 * t->bx >> ss_hor;
+ if (x & mask) continue;
+ const int w = (f->cur.p.w + ss_hor) >> ss_hor;
+ // Round half up at frame boundaries, if there's more than one
+ // restoration unit
+ if (x && x + half_unit > w) continue;
+ const int sb_idx = (t->by >> 5) * f->sr_sb128w + (t->bx >> 5);
+ const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4);
+ Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
+
+ read_restoration_info(t, lr, p, frame_type);
+ }
+ }
+ if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
+ return 1;
+ if (t->bx & 16 || f->seq_hdr->sb128) {
+ t->a++;
+ t->lf_mask++;
+ }
+ }
+
+ if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
+ ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
+ t->by >> 1, (t->by + sb_step) >> 1);
+ }
+
+ // back up pre-loopfilter pixels for intra prediction of the next sbrow
+ if (t->frame_thread.pass != 1)
+ f->bd_fn.backup_ipred_edge(t);
+
+ // back up t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
+ // up the initial value in neighbour tiles when running the loopfilter
+ int align_h = (f->bh + 31) & ~31;
+ memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by],
+ &t->l.tx_lpf_y[t->by & 16], sb_step);
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ align_h >>= ss_ver;
+ memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
+ &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
+
+ return 0;
+}
+
+int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+ int retval = DAV1D_ERR(ENOMEM);
+
+ if (f->sbh > f->lf.start_of_tile_row_sz) {
+ free(f->lf.start_of_tile_row);
+ f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t));
+ if (!f->lf.start_of_tile_row) {
+ f->lf.start_of_tile_row_sz = 0;
+ goto error;
+ }
+ f->lf.start_of_tile_row_sz = f->sbh;
+ }
+ int sby = 0;
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ f->lf.start_of_tile_row[sby++] = tile_row;
+ while (sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1])
+ f->lf.start_of_tile_row[sby++] = 0;
+ }
+
+ const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+ if (n_ts != f->n_ts) {
+ if (c->n_fc > 1) {
+ freep(&f->frame_thread.tile_start_off);
+ f->frame_thread.tile_start_off =
+ malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);
+ if (!f->frame_thread.tile_start_off) {
+ f->n_ts = 0;
+ goto error;
+ }
+ }
+ dav1d_free_aligned(f->ts);
+ f->ts = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
+ if (!f->ts) goto error;
+ f->n_ts = n_ts;
+ }
+
+ const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
+ if (a_sz != f->a_sz) {
+ freep(&f->a);
+ f->a = malloc(sizeof(*f->a) * a_sz);
+ if (!f->a) {
+ f->a_sz = 0;
+ goto error;
+ }
+ f->a_sz = a_sz;
+ }
+
+ const int num_sb128 = f->sb128w * f->sb128h;
+ const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
+ const int hbd = !!f->seq_hdr->hbd;
+ if (c->n_fc > 1) {
+ int tile_idx = 0;
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ int row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
+ f->sb_step * 4 * f->sb128w * 128;
+ int b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
+ f->frame_hdr->tiling.row_start_sb[tile_row]) * f->sb_step * 4;
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff *
+ f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4;
+ }
+ }
+
+ const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
+ if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
+ free(f->tile_thread.lowest_pixel_mem);
+ f->tile_thread.lowest_pixel_mem =
+ malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem));
+ if (!f->tile_thread.lowest_pixel_mem) {
+ f->tile_thread.lowest_pixel_mem_sz = 0;
+ goto error;
+ }
+ f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz;
+ }
+ int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem;
+ for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.rows;
+ tile_row++, tile_row_base += f->frame_hdr->tiling.cols)
+ {
+ const int tile_row_sb_h = f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
+ f->frame_hdr->tiling.row_start_sb[tile_row];
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ f->ts[tile_row_base + tile_col].lowest_pixel = lowest_pixel_ptr;
+ lowest_pixel_ptr += tile_row_sb_h;
+ }
+ }
+
+ const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
+ if (cf_sz != f->frame_thread.cf_sz) {
+ dav1d_freep_aligned(&f->frame_thread.cf);
+ f->frame_thread.cf =
+ dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 64);
+ if (!f->frame_thread.cf) {
+ f->frame_thread.cf_sz = 0;
+ goto error;
+ }
+ memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2);
+ f->frame_thread.cf_sz = cf_sz;
+ }
+
+ if (f->frame_hdr->allow_screen_content_tools) {
+ if (num_sb128 != f->frame_thread.pal_sz) {
+ dav1d_freep_aligned(&f->frame_thread.pal);
+ f->frame_thread.pal =
+ dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
+ num_sb128 * 16 * 16, 64);
+ if (!f->frame_thread.pal) {
+ f->frame_thread.pal_sz = 0;
+ goto error;
+ }
+ f->frame_thread.pal_sz = num_sb128;
+ }
+
+ const int pal_idx_sz = num_sb128 * size_mul[1];
+ if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
+ dav1d_freep_aligned(&f->frame_thread.pal_idx);
+ f->frame_thread.pal_idx =
+ dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
+ pal_idx_sz * 128 * 128 / 4, 64);
+ if (!f->frame_thread.pal_idx) {
+ f->frame_thread.pal_idx_sz = 0;
+ goto error;
+ }
+ f->frame_thread.pal_idx_sz = pal_idx_sz;
+ }
+ } else if (f->frame_thread.pal) {
+ dav1d_freep_aligned(&f->frame_thread.pal);
+ dav1d_freep_aligned(&f->frame_thread.pal_idx);
+ f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0;
+ }
+ }
+
+ // update allocation of block contexts for above
+ ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
+ const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const int need_cdef_lpf_copy = c->n_tc > 1 && has_resize;
+ if (y_stride * f->sbh * 4 != f->lf.cdef_buf_plane_sz[0] ||
+ uv_stride * f->sbh * 8 != f->lf.cdef_buf_plane_sz[1] ||
+ need_cdef_lpf_copy != f->lf.need_cdef_lpf_copy ||
+ f->sbh != f->lf.cdef_buf_sbh)
+ {
+ dav1d_free_aligned(f->lf.cdef_line_buf);
+ size_t alloc_sz = 64;
+ alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy;
+ alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy;
+ uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
+ if (!ptr) {
+ f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0;
+ goto error;
+ }
+
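+ // Strides may be negative for bottom-up pictures; in that case each
+ // line pointer is anchored at the last row of its sub-buffer so that
+ // offsets computed with the (negative) stride stay inside the
+ // allocation.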
+ ptr += 32;
+ if (y_stride < 0) {
+ f->lf.cdef_line[0][0] = ptr - y_stride * (f->sbh * 4 - 1);
+ f->lf.cdef_line[1][0] = ptr - y_stride * (f->sbh * 4 - 3);
+ } else {
+ f->lf.cdef_line[0][0] = ptr + y_stride * 0;
+ f->lf.cdef_line[1][0] = ptr + y_stride * 2;
+ }
+ ptr += llabs(y_stride) * f->sbh * 4;
+ if (uv_stride < 0) {
+ f->lf.cdef_line[0][1] = ptr - uv_stride * (f->sbh * 8 - 1);
+ f->lf.cdef_line[0][2] = ptr - uv_stride * (f->sbh * 8 - 3);
+ f->lf.cdef_line[1][1] = ptr - uv_stride * (f->sbh * 8 - 5);
+ f->lf.cdef_line[1][2] = ptr - uv_stride * (f->sbh * 8 - 7);
+ } else {
+ f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
+ f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
+ f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
+ f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
+ }
+
+ if (need_cdef_lpf_copy) {
+ ptr += llabs(uv_stride) * f->sbh * 8;
+ if (y_stride < 0)
+ f->lf.cdef_lpf_line[0] = ptr - y_stride * (f->sbh * 4 - 1);
+ else
+ f->lf.cdef_lpf_line[0] = ptr;
+ ptr += llabs(y_stride) * f->sbh * 4;
+ if (uv_stride < 0) {
+ f->lf.cdef_lpf_line[1] = ptr - uv_stride * (f->sbh * 4 - 1);
+ f->lf.cdef_lpf_line[2] = ptr - uv_stride * (f->sbh * 8 - 1);
+ } else {
+ f->lf.cdef_lpf_line[1] = ptr;
+ f->lf.cdef_lpf_line[2] = ptr + uv_stride * f->sbh * 4;
+ }
+ }
+
+ f->lf.cdef_buf_plane_sz[0] = (int) y_stride * f->sbh * 4;
+ f->lf.cdef_buf_plane_sz[1] = (int) uv_stride * f->sbh * 8;
+ f->lf.need_cdef_lpf_copy = need_cdef_lpf_copy;
+ f->lf.cdef_buf_sbh = f->sbh;
+ }
+
+ const int sb128 = f->seq_hdr->sb128;
+ const int num_lines = c->n_tc > 1 ? f->sbh * 4 << sb128 : 12;
+ y_stride = f->sr_cur.p.stride[0], uv_stride = f->sr_cur.p.stride[1];
+ if (y_stride * num_lines != f->lf.lr_buf_plane_sz[0] ||
+ uv_stride * num_lines * 2 != f->lf.lr_buf_plane_sz[1])
+ {
+ dav1d_free_aligned(f->lf.lr_line_buf);
+ // lr simd may overread the input, so slightly over-allocate the lpf buffer
+ size_t alloc_sz = 128;
+ alloc_sz += (size_t)llabs(y_stride) * num_lines;
+ alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2;
+ uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(alloc_sz, 64);
+ if (!ptr) {
+ f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0;
+ goto error;
+ }
+
+ ptr += 64;
+ if (y_stride < 0)
+ f->lf.lr_lpf_line[0] = ptr - y_stride * (num_lines - 1);
+ else
+ f->lf.lr_lpf_line[0] = ptr;
+ ptr += llabs(y_stride) * num_lines;
+ if (uv_stride < 0) {
+ f->lf.lr_lpf_line[1] = ptr - uv_stride * (num_lines * 1 - 1);
+ f->lf.lr_lpf_line[2] = ptr - uv_stride * (num_lines * 2 - 1);
+ } else {
+ f->lf.lr_lpf_line[1] = ptr;
+ f->lf.lr_lpf_line[2] = ptr + uv_stride * num_lines;
+ }
+
+ f->lf.lr_buf_plane_sz[0] = (int) y_stride * num_lines;
+ f->lf.lr_buf_plane_sz[1] = (int) uv_stride * num_lines * 2;
+ }
+
+ // update allocation for loopfilter masks
+ if (num_sb128 != f->lf.mask_sz) {
+ freep(&f->lf.mask);
+ freep(&f->lf.level);
+ f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128);
+ // over-allocate by 3 bytes since some of the SIMD implementations
+ // index this from the level type and can thus over-read by up to 3
+ f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
+ if (!f->lf.mask || !f->lf.level) {
+ f->lf.mask_sz = 0;
+ goto error;
+ }
+ if (c->n_fc > 1) {
+ freep(&f->frame_thread.b);
+ freep(&f->frame_thread.cbi);
+ f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
+ num_sb128 * 32 * 32);
+ f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
+ num_sb128 * 32 * 32);
+ if (!f->frame_thread.b || !f->frame_thread.cbi) {
+ f->lf.mask_sz = 0;
+ goto error;
+ }
+ }
+ f->lf.mask_sz = num_sb128;
+ }
+
+ f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
+ const int lr_mask_sz = f->sr_sb128w * f->sb128h;
+ if (lr_mask_sz != f->lf.lr_mask_sz) {
+ freep(&f->lf.lr_mask);
+ f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz);
+ if (!f->lf.lr_mask) {
+ f->lf.lr_mask_sz = 0;
+ goto error;
+ }
+ f->lf.lr_mask_sz = lr_mask_sz;
+ }
+ f->lf.restore_planes =
+ ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
+ ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
+ ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
+ if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
+ dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
+ f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
+ }
+ dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
+ memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128);
+
+ const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
+ if (ipred_edge_sz != f->ipred_edge_sz) {
+ dav1d_freep_aligned(&f->ipred_edge[0]);
+ uint8_t *ptr = f->ipred_edge[0] =
+ dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 64);
+ if (!ptr) {
+ f->ipred_edge_sz = 0;
+ goto error;
+ }
+ f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1;
+ f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2;
+ f->ipred_edge_sz = ipred_edge_sz;
+ }
+
+ const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
+ if (re_sz != f->lf.re_sz) {
+ freep(&f->lf.tx_lpf_right_edge[0]);
+ f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2);
+ if (!f->lf.tx_lpf_right_edge[0]) {
+ f->lf.re_sz = 0;
+ goto error;
+ }
+ f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32;
+ f->lf.re_sz = re_sz;
+ }
+
+ // init ref mvs
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
+ const int ret =
+ dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
+ f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs,
+ f->c->n_tc, f->c->n_fc);
+ if (ret < 0) goto error;
+ }
+
+ // setup dequant tables
+ init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
+ if (f->frame_hdr->quant.qm)
+ for (int i = 0; i < N_RECT_TX_SIZES; i++) {
+ f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
+ f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
+ f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
+ }
+ else
+ memset(f->qm, 0, sizeof(f->qm));
+
+ // setup jnt_comp weights
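+ // (distance-weighted compound prediction: d0/d1 are the POC distances
+ // of the two references from the current frame, clamped to 31; k picks
+ // the bucket matching their ratio, and the lookup table yields a weight
+ // pair summing to 16 for the weighted-average MC path)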
+ if (f->frame_hdr->switchable_comp_refs) {
+ for (int i = 0; i < 7; i++) {
+ const unsigned ref0poc = f->refp[i].p.frame_hdr->frame_offset;
+
+ for (int j = i + 1; j < 7; j++) {
+ const unsigned ref1poc = f->refp[j].p.frame_hdr->frame_offset;
+
+ const unsigned d1 =
+ imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref0poc,
+ f->cur.frame_hdr->frame_offset)), 31);
+ const unsigned d0 =
+ imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref1poc,
+ f->cur.frame_hdr->frame_offset)), 31);
+ const int order = d0 <= d1;
+
+ static const uint8_t quant_dist_weight[3][2] = {
+ { 2, 3 }, { 2, 5 }, { 2, 7 }
+ };
+ static const uint8_t quant_dist_lookup_table[4][2] = {
+ { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 }
+ };
+
+ int k;
+ for (k = 0; k < 3; k++) {
+ const int c0 = quant_dist_weight[k][order];
+ const int c1 = quant_dist_weight[k][!order];
+ const int d0_c0 = d0 * c0;
+ const int d1_c1 = d1 * c1;
+ if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
+ }
+
+ f->jnt_weights[i][j] = quant_dist_lookup_table[k][order];
+ }
+ }
+ }
+
+ /* Init loopfilter pointers. Increasing NULL pointers is technically UB,
+ * so just point the chroma pointers in 4:0:0 to the luma plane here to
+ * avoid having additional in-loop branches in various places. We never
+ * dereference those pointers so it doesn't really matter what they
+ * point at, as long as the pointers are valid. */
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ f->lf.mask_ptr = f->lf.mask;
+ f->lf.p[0] = f->cur.data[0];
+ f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
+ f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
+ f->lf.sr_p[0] = f->sr_cur.p.data[0];
+ f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
+ f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
+
+ retval = 0;
+error:
+ return retval;
+}
+
+int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+ int retval = DAV1D_ERR(EINVAL);
+
+ if (f->frame_hdr->refresh_context)
+ dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
+
+ // parse individual tiles per tile group
+ int tile_row = 0, tile_col = 0;
+ f->task_thread.update_set = 0;
+ for (int i = 0; i < f->n_tile_data; i++) {
+ const uint8_t *data = f->tile[i].data.data;
+ size_t size = f->tile[i].data.sz;
+
+ for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
+ size_t tile_sz;
+ if (j == f->tile[i].end) {
+ tile_sz = size;
+ } else {
+ if (f->frame_hdr->tiling.n_bytes > size) goto error;
+ tile_sz = 0;
+ for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++)
+ tile_sz |= (unsigned)*data++ << (k * 8);
+ tile_sz++;
+ size -= f->frame_hdr->tiling.n_bytes;
+ if (tile_sz > size) goto error;
+ }
+
+ setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++,
+ c->n_fc > 1 ? f->frame_thread.tile_start_off[j] : 0);
+
+ if (tile_col == f->frame_hdr->tiling.cols) {
+ tile_col = 0;
+ tile_row++;
+ }
+ if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context)
+ f->task_thread.update_set = 1;
+ data += tile_sz;
+ size -= tile_sz;
+ }
+ }
+
+ if (c->n_tc > 1) {
+ const int uses_2pass = c->n_fc > 1;
+ for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows * (1 + uses_2pass); n++)
+ reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr),
+ uses_2pass ? 1 + (n >= f->sb128w * f->frame_hdr->tiling.rows) : 0);
+ }
+
+ retval = 0;
+error:
+ return retval;
+}
+
+int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+ int retval = DAV1D_ERR(EINVAL);
+
+ assert(f->c->n_tc == 1);
+
+ Dav1dTaskContext *const t = &c->tc[f - c->fc];
+ t->f = f;
+ t->frame_thread.pass = 0;
+
+ for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
+ reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), 0);
+
+ // no threading - we explicitly interleave tile/sbrow decoding
+ // and post-filtering, so that the full process runs in-line
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ const int sbh_end =
+ imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
+ for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
+ sby < sbh_end; sby++)
+ {
+ t->by = sby << (4 + f->seq_hdr->sb128);
+ const int by_end = (t->by + f->sb_step) >> 1;
+ if (f->frame_hdr->use_ref_frame_mvs) {
+ f->c->refmvs_dsp.load_tmvs(&f->rf, tile_row,
+ 0, f->bw >> 1, t->by >> 1, by_end);
+ }
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
+ if (dav1d_decode_tile_sbrow(t)) goto error;
+ }
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
+ 0, f->bw >> 1, t->by >> 1, by_end);
+ }
+
+ // loopfilter + cdef + restoration
+ f->bd_fn.filter_sbrow(f, sby);
+ }
+ }
+
+ retval = 0;
+error:
+ return retval;
+}
+
+void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) {
+ const Dav1dContext *const c = f->c;
+
+ if (f->sr_cur.p.data[0])
+ atomic_init(&f->task_thread.error, 0);
+
+ if (c->n_fc > 1 && retval && f->frame_thread.cf) {
+ memset(f->frame_thread.cf, 0,
+ (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
+ }
+ for (int i = 0; i < 7; i++) {
+ if (f->refp[i].p.frame_hdr)
+ dav1d_thread_picture_unref(&f->refp[i]);
+ dav1d_ref_dec(&f->ref_mvs_ref[i]);
+ }
+
+ dav1d_picture_unref_internal(&f->cur);
+ dav1d_thread_picture_unref(&f->sr_cur);
+ dav1d_cdf_thread_unref(&f->in_cdf);
+ if (f->frame_hdr && f->frame_hdr->refresh_context) {
+ if (f->out_cdf.progress)
+ atomic_store(f->out_cdf.progress, retval == 0 ? 1 : TILE_ERROR);
+ dav1d_cdf_thread_unref(&f->out_cdf);
+ }
+ dav1d_ref_dec(&f->cur_segmap_ref);
+ dav1d_ref_dec(&f->prev_segmap_ref);
+ dav1d_ref_dec(&f->mvs_ref);
+ dav1d_ref_dec(&f->seq_hdr_ref);
+ dav1d_ref_dec(&f->frame_hdr_ref);
+
+ for (int i = 0; i < f->n_tile_data; i++)
+ dav1d_data_unref_internal(&f->tile[i].data);
+ f->task_thread.retval = retval;
+}
+
+int dav1d_decode_frame(Dav1dFrameContext *const f) {
+ assert(f->c->n_fc == 1);
+ // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
+ // threads also. Not sure it makes a measurable difference.
+ int res = dav1d_decode_frame_init(f);
+ if (!res) res = dav1d_decode_frame_init_cdf(f);
+ // wait until all threads have completed
+ if (!res) {
+ if (f->c->n_tc > 1) {
+ res = dav1d_task_create_tile_sbrow(f, 0, 1);
+ pthread_mutex_lock(&f->task_thread.ttd->lock);
+ pthread_cond_signal(&f->task_thread.ttd->cond);
+ if (!res) {
+ while (!f->task_thread.done[0] ||
+ atomic_load(&f->task_thread.task_counter) > 0)
+ {
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
+ }
+ }
+ pthread_mutex_unlock(&f->task_thread.ttd->lock);
+ res = f->task_thread.retval;
+ } else {
+ res = dav1d_decode_frame_main(f);
+ if (!res && f->frame_hdr->refresh_context && f->task_thread.update_set) {
+ dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
+ &f->ts[f->frame_hdr->tiling.update].cdf);
+ }
+ }
+ }
+ dav1d_decode_frame_exit(f, res);
+ f->n_tile_data = 0;
+ return res;
+}
+
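+// Initial horizontal source offset for the super-res upscaler, in the same
+// 14-bit fixed-point units as 'step'; the correction terms keep the sampling
+// grid centred and split the accumulated rounding error of 'step' evenly
+// across the row, and only the fractional part (mod 2^14) is returned.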
+static int get_upscale_x0(const int in_w, const int out_w, const int step) {
+ const int err = out_w * step - (in_w << 14);
+ const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2);
+ return x0 & 0x3fff;
+}
+
+int dav1d_submit_frame(Dav1dContext *const c) {
+ Dav1dFrameContext *f;
+ int res = -1;
+
+ // wait for c->out_delayed[next] and move into c->out if visible
+ Dav1dThreadPicture *out_delayed;
+ if (c->n_fc > 1) {
+ pthread_mutex_lock(&c->task_thread.lock);
+ const unsigned next = c->frame_thread.next++;
+ if (c->frame_thread.next == c->n_fc)
+ c->frame_thread.next = 0;
+
+ f = &c->fc[next];
+ while (f->n_tile_data > 0)
+ pthread_cond_wait(&f->task_thread.cond,
+ &c->task_thread.lock);
+ out_delayed = &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
+ atomic_fetch_add(&c->task_thread.first, 1U);
+ else
+ atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
+ if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
+ c->task_thread.cur--;
+ }
+ const int error = f->task_thread.retval;
+ if (error) {
+ f->task_thread.retval = 0;
+ c->cached_error = error;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ } else if (out_delayed->p.data[0]) {
+ const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
+ memory_order_relaxed);
+ if ((out_delayed->visible || c->output_invisible_frames) &&
+ progress != FRAME_ERROR)
+ {
+ dav1d_thread_picture_ref(&c->out, out_delayed);
+ c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
+ }
+ dav1d_thread_picture_unref(out_delayed);
+ }
+ } else {
+ f = c->fc;
+ }
+
+ f->seq_hdr = c->seq_hdr;
+ f->seq_hdr_ref = c->seq_hdr_ref;
+ dav1d_ref_inc(f->seq_hdr_ref);
+ f->frame_hdr = c->frame_hdr;
+ f->frame_hdr_ref = c->frame_hdr_ref;
+ c->frame_hdr = NULL;
+ c->frame_hdr_ref = NULL;
+ f->dsp = &c->dsp[f->seq_hdr->hbd];
+
+ const int bpc = 8 + 2 * f->seq_hdr->hbd;
+
+ if (!f->dsp->ipred.intra_pred[DC_PRED]) {
+ Dav1dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd];
+
+ switch (bpc) {
+#define assign_bitdepth_case(bd) \
+ dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
+ dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
+ dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
+ dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
+ dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
+ dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
+ dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
+ break
+#if CONFIG_8BPC
+ case 8:
+ assign_bitdepth_case(8);
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ assign_bitdepth_case(16);
+#endif
+#undef assign_bitdepth_case
+ default:
+ dav1d_log(c, "Compiled without support for %d-bit decoding\n",
+ 8 + 2 * f->seq_hdr->hbd);
+ res = DAV1D_ERR(ENOPROTOOPT);
+ goto error;
+ }
+ }
+
+#define assign_bitdepth_case(bd) \
+ f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
+ f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
+ f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
+ f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \
+ f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \
+ f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
+ f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
+ f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
+ f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
+ f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
+ if (!f->seq_hdr->hbd) {
+#if CONFIG_8BPC
+ assign_bitdepth_case(8);
+#endif
+ } else {
+#if CONFIG_16BPC
+ assign_bitdepth_case(16);
+#endif
+ }
+#undef assign_bitdepth_case
+
+ int ref_coded_width[7];
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
+ const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+ if (!c->refs[pri_ref].p.p.data[0]) {
+ res = DAV1D_ERR(EINVAL);
+ goto error;
+ }
+ }
+ for (int i = 0; i < 7; i++) {
+ const int refidx = f->frame_hdr->refidx[i];
+ if (!c->refs[refidx].p.p.data[0] ||
+ f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w ||
+ f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h ||
+ f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 ||
+ f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 ||
+ f->seq_hdr->layout != c->refs[refidx].p.p.p.layout ||
+ bpc != c->refs[refidx].p.p.p.bpc)
+ {
+ for (int j = 0; j < i; j++)
+ dav1d_thread_picture_unref(&f->refp[j]);
+ res = DAV1D_ERR(EINVAL);
+ goto error;
+ }
+ dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
+ ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0];
+ if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w ||
+ f->frame_hdr->height != c->refs[refidx].p.p.p.h)
+ {
+#define scale_fac(ref_sz, this_sz) \
+ ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
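+ // 14-bit fixed-point ratio ref_sz/this_sz, rounded to nearest; 'step'
+ // below keeps 10 fractional bits for the per-pixel increment.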
+ f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,
+ f->frame_hdr->width[0]);
+ f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,
+ f->frame_hdr->height);
+ f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
+ f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
+ } else {
+ f->svc[i][0].scale = f->svc[i][1].scale = 0;
+ }
+ f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
+ !f->frame_hdr->force_integer_mv &&
+ !dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
+ !f->svc[i][0].scale;
+ }
+ }
+
+ // setup entropy
+ if (f->frame_hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ dav1d_cdf_thread_init_static(&f->in_cdf, f->frame_hdr->quant.yac);
+ } else {
+ const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+ dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
+ }
+ if (f->frame_hdr->refresh_context) {
+ res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1);
+ if (res < 0) goto error;
+ }
+
+ // FIXME qsort so tiles are in order (for frame threading)
+ if (f->n_tile_data_alloc < c->n_tile_data) {
+ freep(&f->tile);
+ assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
+ f->tile = malloc(c->n_tile_data * sizeof(*f->tile));
+ if (!f->tile) {
+ f->n_tile_data_alloc = f->n_tile_data = 0;
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->n_tile_data_alloc = c->n_tile_data;
+ }
+ memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
+ memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
+ f->n_tile_data = c->n_tile_data;
+ c->n_tile_data = 0;
+
+ // allocate frame
+ res = dav1d_thread_picture_alloc(c, f, bpc);
+ if (res < 0) goto error;
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
+ if (res < 0) goto error;
+ } else {
+ dav1d_picture_ref(&f->cur, &f->sr_cur.p);
+ }
+
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
+ f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
+ const int out_cw = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ f->resize_step[1] = scale_fac(in_cw, out_cw);
+#undef scale_fac
+ f->resize_start[0] = get_upscale_x0(f->cur.p.w, f->sr_cur.p.p.w, f->resize_step[0]);
+ f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
+ }
+
+ // move f->cur into output queue
+ if (c->n_fc == 1) {
+ if (f->frame_hdr->show_frame || c->output_invisible_frames) {
+ dav1d_thread_picture_ref(&c->out, &f->sr_cur);
+ c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur);
+ }
+ } else {
+ dav1d_thread_picture_ref(out_delayed, &f->sr_cur);
+ }
+
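+ // Derived frame dimensions: w4/h4 in 4x4 units, bw/bh in 4x4 units
+ // rounded up to a multiple of 2, sb128w/h in 128px superblocks, and
+ // b4_stride padded to a whole 128px superblock row.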
+ f->w4 = (f->frame_hdr->width[0] + 3) >> 2;
+ f->h4 = (f->frame_hdr->height + 3) >> 2;
+ f->bw = ((f->frame_hdr->width[0] + 7) >> 3) << 1;
+ f->bh = ((f->frame_hdr->height + 7) >> 3) << 1;
+ f->sb128w = (f->bw + 31) >> 5;
+ f->sb128h = (f->bh + 31) >> 5;
+ f->sb_shift = 4 + f->seq_hdr->sb128;
+ f->sb_step = 16 << f->seq_hdr->sb128;
+ f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
+ f->b4_stride = (f->bw + 31) & ~31;
+ f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
+ atomic_init(&f->task_thread.error, 0);
+ const int uses_2pass = c->n_fc > 1;
+ const int cols = f->frame_hdr->tiling.cols;
+ const int rows = f->frame_hdr->tiling.rows;
+ atomic_store(&f->task_thread.task_counter,
+ (cols * rows + f->sbh) << uses_2pass);
+
+ // ref_mvs
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
+ f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
+ sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
+ if (!f->mvs_ref) {
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->mvs = f->mvs_ref->data;
+ if (!f->frame_hdr->allow_intrabc) {
+ for (int i = 0; i < 7; i++)
+ f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset;
+ } else {
+ memset(f->refpoc, 0, sizeof(f->refpoc));
+ }
+ if (f->frame_hdr->use_ref_frame_mvs) {
+ for (int i = 0; i < 7; i++) {
+ const int refidx = f->frame_hdr->refidx[i];
+ const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1;
+ const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1;
+ if (c->refs[refidx].refmvs != NULL &&
+ ref_w == f->bw && ref_h == f->bh)
+ {
+ f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
+ dav1d_ref_inc(f->ref_mvs_ref[i]);
+ f->ref_mvs[i] = c->refs[refidx].refmvs->data;
+ } else {
+ f->ref_mvs[i] = NULL;
+ f->ref_mvs_ref[i] = NULL;
+ }
+ memcpy(f->refrefpoc[i], c->refs[refidx].refpoc,
+ sizeof(*f->refrefpoc));
+ }
+ } else {
+ memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
+ }
+ } else {
+ f->mvs_ref = NULL;
+ memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
+ }
+
+ // segmap
+ if (f->frame_hdr->segmentation.enabled) {
+ // By default, the previous segmentation map is not initialised.
+ f->prev_segmap_ref = NULL;
+ f->prev_segmap = NULL;
+
+ // We might need a previous frame's segmentation map. This
+ // happens if there is either no update or a temporal update.
+ if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) {
+ const int pri_ref = f->frame_hdr->primary_ref_frame;
+ assert(pri_ref != DAV1D_PRIMARY_REF_NONE);
+ const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1;
+ const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1;
+ if (ref_w == f->bw && ref_h == f->bh) {
+ f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap;
+ if (f->prev_segmap_ref) {
+ dav1d_ref_inc(f->prev_segmap_ref);
+ f->prev_segmap = f->prev_segmap_ref->data;
+ }
+ }
+ }
+
+ if (f->frame_hdr->segmentation.update_map) {
+ // We're updating an existing map, but need somewhere to
+ // put the new values. Allocate them here (the data
+ // actually gets set elsewhere)
+ f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool,
+ sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h);
+ if (!f->cur_segmap_ref) {
+ dav1d_ref_dec(&f->prev_segmap_ref);
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->cur_segmap = f->cur_segmap_ref->data;
+ } else if (f->prev_segmap_ref) {
+ // We're not updating an existing map, and we have a valid
+ // reference. Use that.
+ f->cur_segmap_ref = f->prev_segmap_ref;
+ dav1d_ref_inc(f->cur_segmap_ref);
+ f->cur_segmap = f->prev_segmap_ref->data;
+ } else {
+ // We need to make a new map. Allocate one here and zero it out.
+ const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h;
+ f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, segmap_size);
+ if (!f->cur_segmap_ref) {
+ res = DAV1D_ERR(ENOMEM);
+ goto error;
+ }
+ f->cur_segmap = f->cur_segmap_ref->data;
+ memset(f->cur_segmap, 0, segmap_size);
+ }
+ } else {
+ f->cur_segmap = NULL;
+ f->cur_segmap_ref = NULL;
+ f->prev_segmap_ref = NULL;
+ }
+
+ // update references etc.
+ const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
+ for (int i = 0; i < 8; i++) {
+ if (refresh_frame_flags & (1 << i)) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur);
+
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ if (f->frame_hdr->refresh_context) {
+ dav1d_cdf_thread_ref(&c->cdf[i], &f->out_cdf);
+ } else {
+ dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf);
+ }
+
+ dav1d_ref_dec(&c->refs[i].segmap);
+ c->refs[i].segmap = f->cur_segmap_ref;
+ if (f->cur_segmap_ref)
+ dav1d_ref_inc(f->cur_segmap_ref);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ if (!f->frame_hdr->allow_intrabc) {
+ c->refs[i].refmvs = f->mvs_ref;
+ if (f->mvs_ref)
+ dav1d_ref_inc(f->mvs_ref);
+ }
+ memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
+ }
+ }
+
+ if (c->n_fc == 1) {
+ if ((res = dav1d_decode_frame(f)) < 0) {
+ dav1d_thread_picture_unref(&c->out);
+ for (int i = 0; i < 8; i++) {
+ if (refresh_frame_flags & (1 << i)) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ dav1d_ref_dec(&c->refs[i].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ }
+ }
+ goto error;
+ }
+ } else {
+ dav1d_task_frame_init(f);
+ pthread_mutex_unlock(&c->task_thread.lock);
+ }
+
+ return 0;
+error:
+ atomic_init(&f->task_thread.error, 1);
+ dav1d_cdf_thread_unref(&f->in_cdf);
+ if (f->frame_hdr->refresh_context)
+ dav1d_cdf_thread_unref(&f->out_cdf);
+ for (int i = 0; i < 7; i++) {
+ if (f->refp[i].p.frame_hdr)
+ dav1d_thread_picture_unref(&f->refp[i]);
+ dav1d_ref_dec(&f->ref_mvs_ref[i]);
+ }
+ if (c->n_fc == 1)
+ dav1d_thread_picture_unref(&c->out);
+ else
+ dav1d_thread_picture_unref(out_delayed);
+ dav1d_picture_unref_internal(&f->cur);
+ dav1d_thread_picture_unref(&f->sr_cur);
+ dav1d_ref_dec(&f->mvs_ref);
+ dav1d_ref_dec(&f->seq_hdr_ref);
+ dav1d_ref_dec(&f->frame_hdr_ref);
+ dav1d_data_props_copy(&c->cached_error_props, &c->in.m);
+
+ for (int i = 0; i < f->n_tile_data; i++)
+ dav1d_data_unref_internal(&f->tile[i].data);
+ f->n_tile_data = 0;
+
+ if (c->n_fc > 1)
+ pthread_mutex_unlock(&c->task_thread.lock);
+
+ return res;
+}
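
The block-geometry fields (w4, bw, sb128w, b4_stride) and the scale_fac() / step values computed in dav1d_submit_frame() above are plain integer arithmetic, so they can be checked in isolation. A minimal standalone C sketch follows; the 1920x1080 frame and the 2x-downscaled 1920-wide reference are made-up example values, not anything taken from the decoder:

    #include <assert.h>
    #include <stdio.h>

    /* Same rounding division as the scale_fac macro above: the ratio between
     * reference and current frame size in 14-bit fixed point. */
    static int scale_fac(int ref_sz, int this_sz) {
        return ((ref_sz << 14) + (this_sz >> 1)) / this_sz;
    }

    int main(void) {
        /* Example 1920x1080 frame, mirroring the w4/bw/sb128w math above. */
        const int w = 1920, h = 1080;
        const int w4 = (w + 3) >> 2;            /* 480 4x4-block columns */
        const int h4 = (h + 3) >> 2;            /* 270 4x4-block rows */
        const int bw = ((w + 7) >> 3) << 1;     /* 480: 8-px columns times 2 */
        const int bh = ((h + 7) >> 3) << 1;     /* 270 */
        const int sb128w = (bw + 31) >> 5;      /* 15 128x128-superblock columns */
        const int b4_stride = (bw + 31) & ~31;  /* 480, padded to a multiple of 32 */
        printf("w4=%d h4=%d bw=%d bh=%d sb128w=%d b4_stride=%d\n",
               w4, h4, bw, bh, sb128w, b4_stride);

        /* A 1920-wide reference used by a 960-wide frame gives a scale of
         * exactly 2.0 in 14-bit fixed point and a step of 2.0 in 10-bit. */
        const int scale = scale_fac(1920, 960);
        const int step = (scale + 8) >> 4;
        assert(scale == 2 << 14);  /* 32768 */
        assert(step == 2 << 10);   /* 2048 */
        return 0;
    }
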
diff --git a/third_party/dav1d/src/decode.h b/third_party/dav1d/src/decode.h
new file mode 100644
index 0000000000..1eae5850a5
--- /dev/null
+++ b/third_party/dav1d/src/decode.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DECODE_H
+#define DAV1D_SRC_DECODE_H
+
+#include "src/internal.h"
+
+int dav1d_submit_frame(Dav1dContext *c);
+
+#endif /* DAV1D_SRC_DECODE_H */
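
dav1d_submit_frame() declared here is an internal entry point: it is invoked from the library's OBU-parsing path once a frame's headers and tile data have been received, not called directly by applications. For orientation, a rough sketch of the public API that ultimately drives it is shown below; read_obu_packet() is a hypothetical stand-in for whatever demuxer supplies AV1 temporal units, and error handling is heavily simplified:

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #include <dav1d/dav1d.h>

    /* Hypothetical demuxer callback returning one temporal unit of OBU data,
     * or 0 at end of stream. */
    extern int read_obu_packet(uint8_t *buf, size_t cap);

    int decode_stream(void) {
        Dav1dSettings s;
        Dav1dContext *ctx;
        dav1d_default_settings(&s);
        if (dav1d_open(&ctx, &s) < 0) return -1;

        uint8_t tmp[65536];
        int n, res = 0;
        while ((n = read_obu_packet(tmp, sizeof(tmp))) > 0) {
            Dav1dData data = { 0 };
            uint8_t *const dst = dav1d_data_create(&data, (size_t)n);
            if (!dst) { res = -1; break; }
            memcpy(dst, tmp, (size_t)n);

            do {
                /* Parsing inside send_data/get_picture is what reaches
                 * dav1d_submit_frame() for each complete frame. */
                res = dav1d_send_data(ctx, &data);
                if (res < 0 && res != DAV1D_ERR(EAGAIN)) break;

                Dav1dPicture pic;
                memset(&pic, 0, sizeof(pic));
                res = dav1d_get_picture(ctx, &pic);
                if (res == 0)
                    dav1d_picture_unref(&pic); /* a real app would consume pic here */
                else if (res != DAV1D_ERR(EAGAIN))
                    break;
            } while (data.sz > 0);

            dav1d_data_unref(&data);
            if (res < 0 && res != DAV1D_ERR(EAGAIN)) break;
        }
        dav1d_close(&ctx);
        return (res < 0 && res != DAV1D_ERR(EAGAIN)) ? -1 : 0;
    }

With multiple frame threads (c->n_fc > 1 in the code above), the submitted frame is handed to worker tasks and queued as a delayed output, which is why decoded pictures are polled with dav1d_get_picture() rather than assumed to be ready after each send.
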
diff --git a/third_party/dav1d/src/dequant_tables.c b/third_party/dav1d/src/dequant_tables.c
new file mode 100644
index 0000000000..520d727b03
--- /dev/null
+++ b/third_party/dav1d/src/dequant_tables.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/dequant_tables.h"
+
+const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = {
+ {
+ { 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, },
+ { 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, },
+ { 13, 15, }, { 14, 16, }, { 15, 17, }, { 16, 18, },
+ { 17, 19, }, { 18, 20, }, { 19, 21, }, { 19, 22, },
+ { 20, 23, }, { 21, 24, }, { 22, 25, }, { 23, 26, },
+ { 24, 27, }, { 25, 28, }, { 26, 29, }, { 26, 30, },
+ { 27, 31, }, { 28, 32, }, { 29, 33, }, { 30, 34, },
+ { 31, 35, }, { 32, 36, }, { 32, 37, }, { 33, 38, },
+ { 34, 39, }, { 35, 40, }, { 36, 41, }, { 37, 42, },
+ { 38, 43, }, { 38, 44, }, { 39, 45, }, { 40, 46, },
+ { 41, 47, }, { 42, 48, }, { 43, 49, }, { 43, 50, },
+ { 44, 51, }, { 45, 52, }, { 46, 53, }, { 47, 54, },
+ { 48, 55, }, { 48, 56, }, { 49, 57, }, { 50, 58, },
+ { 51, 59, }, { 52, 60, }, { 53, 61, }, { 53, 62, },
+ { 54, 63, }, { 55, 64, }, { 56, 65, }, { 57, 66, },
+ { 57, 67, }, { 58, 68, }, { 59, 69, }, { 60, 70, },
+ { 61, 71, }, { 62, 72, }, { 62, 73, }, { 63, 74, },
+ { 64, 75, }, { 65, 76, }, { 66, 77, }, { 66, 78, },
+ { 67, 79, }, { 68, 80, }, { 69, 81, }, { 70, 82, },
+ { 70, 83, }, { 71, 84, }, { 72, 85, }, { 73, 86, },
+ { 74, 87, }, { 74, 88, }, { 75, 89, }, { 76, 90, },
+ { 77, 91, }, { 78, 92, }, { 78, 93, }, { 79, 94, },
+ { 80, 95, }, { 81, 96, }, { 81, 97, }, { 82, 98, },
+ { 83, 99, }, { 84, 100, }, { 85, 101, }, { 85, 102, },
+ { 87, 104, }, { 88, 106, }, { 90, 108, }, { 92, 110, },
+ { 93, 112, }, { 95, 114, }, { 96, 116, }, { 98, 118, },
+ { 99, 120, }, { 101, 122, }, { 102, 124, }, { 104, 126, },
+ { 105, 128, }, { 107, 130, }, { 108, 132, }, { 110, 134, },
+ { 111, 136, }, { 113, 138, }, { 114, 140, }, { 116, 142, },
+ { 117, 144, }, { 118, 146, }, { 120, 148, }, { 121, 150, },
+ { 123, 152, }, { 125, 155, }, { 127, 158, }, { 129, 161, },
+ { 131, 164, }, { 134, 167, }, { 136, 170, }, { 138, 173, },
+ { 140, 176, }, { 142, 179, }, { 144, 182, }, { 146, 185, },
+ { 148, 188, }, { 150, 191, }, { 152, 194, }, { 154, 197, },
+ { 156, 200, }, { 158, 203, }, { 161, 207, }, { 164, 211, },
+ { 166, 215, }, { 169, 219, }, { 172, 223, }, { 174, 227, },
+ { 177, 231, }, { 180, 235, }, { 182, 239, }, { 185, 243, },
+ { 187, 247, }, { 190, 251, }, { 192, 255, }, { 195, 260, },
+ { 199, 265, }, { 202, 270, }, { 205, 275, }, { 208, 280, },
+ { 211, 285, }, { 214, 290, }, { 217, 295, }, { 220, 300, },
+ { 223, 305, }, { 226, 311, }, { 230, 317, }, { 233, 323, },
+ { 237, 329, }, { 240, 335, }, { 243, 341, }, { 247, 347, },
+ { 250, 353, }, { 253, 359, }, { 257, 366, }, { 261, 373, },
+ { 265, 380, }, { 269, 387, }, { 272, 394, }, { 276, 401, },
+ { 280, 408, }, { 284, 416, }, { 288, 424, }, { 292, 432, },
+ { 296, 440, }, { 300, 448, }, { 304, 456, }, { 309, 465, },
+ { 313, 474, }, { 317, 483, }, { 322, 492, }, { 326, 501, },
+ { 330, 510, }, { 335, 520, }, { 340, 530, }, { 344, 540, },
+ { 349, 550, }, { 354, 560, }, { 359, 571, }, { 364, 582, },
+ { 369, 593, }, { 374, 604, }, { 379, 615, }, { 384, 627, },
+ { 389, 639, }, { 395, 651, }, { 400, 663, }, { 406, 676, },
+ { 411, 689, }, { 417, 702, }, { 423, 715, }, { 429, 729, },
+ { 435, 743, }, { 441, 757, }, { 447, 771, }, { 454, 786, },
+ { 461, 801, }, { 467, 816, }, { 475, 832, }, { 482, 848, },
+ { 489, 864, }, { 497, 881, }, { 505, 898, }, { 513, 915, },
+ { 522, 933, }, { 530, 951, }, { 539, 969, }, { 549, 988, },
+ { 559, 1007, }, { 569, 1026, }, { 579, 1046, }, { 590, 1066, },
+ { 602, 1087, }, { 614, 1108, }, { 626, 1129, }, { 640, 1151, },
+ { 654, 1173, }, { 668, 1196, }, { 684, 1219, }, { 700, 1243, },
+ { 717, 1267, }, { 736, 1292, }, { 755, 1317, }, { 775, 1343, },
+ { 796, 1369, }, { 819, 1396, }, { 843, 1423, }, { 869, 1451, },
+ { 896, 1479, }, { 925, 1508, }, { 955, 1537, }, { 988, 1567, },
+ { 1022, 1597, }, { 1058, 1628, }, { 1098, 1660, }, { 1139, 1692, },
+ { 1184, 1725, }, { 1232, 1759, }, { 1282, 1793, }, { 1336, 1828, },
+ }, {
+ { 4, 4, }, { 9, 9, }, { 10, 11, }, { 13, 13, },
+ { 15, 16, }, { 17, 18, }, { 20, 21, }, { 22, 24, },
+ { 25, 27, }, { 28, 30, }, { 31, 33, }, { 34, 37, },
+ { 37, 40, }, { 40, 44, }, { 43, 48, }, { 47, 51, },
+ { 50, 55, }, { 53, 59, }, { 57, 63, }, { 60, 67, },
+ { 64, 71, }, { 68, 75, }, { 71, 79, }, { 75, 83, },
+ { 78, 88, }, { 82, 92, }, { 86, 96, }, { 90, 100, },
+ { 93, 105, }, { 97, 109, }, { 101, 114, }, { 105, 118, },
+ { 109, 122, }, { 113, 127, }, { 116, 131, }, { 120, 136, },
+ { 124, 140, }, { 128, 145, }, { 132, 149, }, { 136, 154, },
+ { 140, 158, }, { 143, 163, }, { 147, 168, }, { 151, 172, },
+ { 155, 177, }, { 159, 181, }, { 163, 186, }, { 166, 190, },
+ { 170, 195, }, { 174, 199, }, { 178, 204, }, { 182, 208, },
+ { 185, 213, }, { 189, 217, }, { 193, 222, }, { 197, 226, },
+ { 200, 231, }, { 204, 235, }, { 208, 240, }, { 212, 244, },
+ { 215, 249, }, { 219, 253, }, { 223, 258, }, { 226, 262, },
+ { 230, 267, }, { 233, 271, }, { 237, 275, }, { 241, 280, },
+ { 244, 284, }, { 248, 289, }, { 251, 293, }, { 255, 297, },
+ { 259, 302, }, { 262, 306, }, { 266, 311, }, { 269, 315, },
+ { 273, 319, }, { 276, 324, }, { 280, 328, }, { 283, 332, },
+ { 287, 337, }, { 290, 341, }, { 293, 345, }, { 297, 349, },
+ { 300, 354, }, { 304, 358, }, { 307, 362, }, { 310, 367, },
+ { 314, 371, }, { 317, 375, }, { 321, 379, }, { 324, 384, },
+ { 327, 388, }, { 331, 392, }, { 334, 396, }, { 337, 401, },
+ { 343, 409, }, { 350, 417, }, { 356, 425, }, { 362, 433, },
+ { 369, 441, }, { 375, 449, }, { 381, 458, }, { 387, 466, },
+ { 394, 474, }, { 400, 482, }, { 406, 490, }, { 412, 498, },
+ { 418, 506, }, { 424, 514, }, { 430, 523, }, { 436, 531, },
+ { 442, 539, }, { 448, 547, }, { 454, 555, }, { 460, 563, },
+ { 466, 571, }, { 472, 579, }, { 478, 588, }, { 484, 596, },
+ { 490, 604, }, { 499, 616, }, { 507, 628, }, { 516, 640, },
+ { 525, 652, }, { 533, 664, }, { 542, 676, }, { 550, 688, },
+ { 559, 700, }, { 567, 713, }, { 576, 725, }, { 584, 737, },
+ { 592, 749, }, { 601, 761, }, { 609, 773, }, { 617, 785, },
+ { 625, 797, }, { 634, 809, }, { 644, 825, }, { 655, 841, },
+ { 666, 857, }, { 676, 873, }, { 687, 889, }, { 698, 905, },
+ { 708, 922, }, { 718, 938, }, { 729, 954, }, { 739, 970, },
+ { 749, 986, }, { 759, 1002, }, { 770, 1018, }, { 782, 1038, },
+ { 795, 1058, }, { 807, 1078, }, { 819, 1098, }, { 831, 1118, },
+ { 844, 1138, }, { 856, 1158, }, { 868, 1178, }, { 880, 1198, },
+ { 891, 1218, }, { 906, 1242, }, { 920, 1266, }, { 933, 1290, },
+ { 947, 1314, }, { 961, 1338, }, { 975, 1362, }, { 988, 1386, },
+ { 1001, 1411, }, { 1015, 1435, }, { 1030, 1463, }, { 1045, 1491, },
+ { 1061, 1519, }, { 1076, 1547, }, { 1090, 1575, }, { 1105, 1603, },
+ { 1120, 1631, }, { 1137, 1663, }, { 1153, 1695, }, { 1170, 1727, },
+ { 1186, 1759, }, { 1202, 1791, }, { 1218, 1823, }, { 1236, 1859, },
+ { 1253, 1895, }, { 1271, 1931, }, { 1288, 1967, }, { 1306, 2003, },
+ { 1323, 2039, }, { 1342, 2079, }, { 1361, 2119, }, { 1379, 2159, },
+ { 1398, 2199, }, { 1416, 2239, }, { 1436, 2283, }, { 1456, 2327, },
+ { 1476, 2371, }, { 1496, 2415, }, { 1516, 2459, }, { 1537, 2507, },
+ { 1559, 2555, }, { 1580, 2603, }, { 1601, 2651, }, { 1624, 2703, },
+ { 1647, 2755, }, { 1670, 2807, }, { 1692, 2859, }, { 1717, 2915, },
+ { 1741, 2971, }, { 1766, 3027, }, { 1791, 3083, }, { 1817, 3143, },
+ { 1844, 3203, }, { 1871, 3263, }, { 1900, 3327, }, { 1929, 3391, },
+ { 1958, 3455, }, { 1990, 3523, }, { 2021, 3591, }, { 2054, 3659, },
+ { 2088, 3731, }, { 2123, 3803, }, { 2159, 3876, }, { 2197, 3952, },
+ { 2236, 4028, }, { 2276, 4104, }, { 2319, 4184, }, { 2363, 4264, },
+ { 2410, 4348, }, { 2458, 4432, }, { 2508, 4516, }, { 2561, 4604, },
+ { 2616, 4692, }, { 2675, 4784, }, { 2737, 4876, }, { 2802, 4972, },
+ { 2871, 5068, }, { 2944, 5168, }, { 3020, 5268, }, { 3102, 5372, },
+ { 3188, 5476, }, { 3280, 5584, }, { 3375, 5692, }, { 3478, 5804, },
+ { 3586, 5916, }, { 3702, 6032, }, { 3823, 6148, }, { 3953, 6268, },
+ { 4089, 6388, }, { 4236, 6512, }, { 4394, 6640, }, { 4559, 6768, },
+ { 4737, 6900, }, { 4929, 7036, }, { 5130, 7172, }, { 5347, 7312, },
+ }, {
+ { 4, 4 }, { 12, 13 }, { 18, 19 }, { 25, 27 },
+ { 33, 35 }, { 41, 44 }, { 50, 54 }, { 60, 64 },
+ { 70, 75 }, { 80, 87 }, { 91, 99 }, { 103, 112 },
+ { 115, 126 }, { 127, 139 }, { 140, 154 }, { 153, 168 },
+ { 166, 183 }, { 180, 199 }, { 194, 214 }, { 208, 230 },
+ { 222, 247 }, { 237, 263 }, { 251, 280 }, { 266, 297 },
+ { 281, 314 }, { 296, 331 }, { 312, 349 }, { 327, 366 },
+ { 343, 384 }, { 358, 402 }, { 374, 420 }, { 390, 438 },
+ { 405, 456 }, { 421, 475 }, { 437, 493 }, { 453, 511 },
+ { 469, 530 }, { 484, 548 }, { 500, 567 }, { 516, 586 },
+ { 532, 604 }, { 548, 623 }, { 564, 642 }, { 580, 660 },
+ { 596, 679 }, { 611, 698 }, { 627, 716 }, { 643, 735 },
+ { 659, 753 }, { 674, 772 }, { 690, 791 }, { 706, 809 },
+ { 721, 828 }, { 737, 846 }, { 752, 865 }, { 768, 884 },
+ { 783, 902 }, { 798, 920 }, { 814, 939 }, { 829, 957 },
+ { 844, 976 }, { 859, 994 }, { 874, 1012 }, { 889, 1030 },
+ { 904, 1049 }, { 919, 1067 }, { 934, 1085 }, { 949, 1103 },
+ { 964, 1121 }, { 978, 1139 }, { 993, 1157 }, { 1008, 1175 },
+ { 1022, 1193 }, { 1037, 1211 }, { 1051, 1229 }, { 1065, 1246 },
+ { 1080, 1264 }, { 1094, 1282 }, { 1108, 1299 }, { 1122, 1317 },
+ { 1136, 1335 }, { 1151, 1352 }, { 1165, 1370 }, { 1179, 1387 },
+ { 1192, 1405 }, { 1206, 1422 }, { 1220, 1440 }, { 1234, 1457 },
+ { 1248, 1474 }, { 1261, 1491 }, { 1275, 1509 }, { 1288, 1526 },
+ { 1302, 1543 }, { 1315, 1560 }, { 1329, 1577 }, { 1342, 1595 },
+ { 1368, 1627 }, { 1393, 1660 }, { 1419, 1693 }, { 1444, 1725 },
+ { 1469, 1758 }, { 1494, 1791 }, { 1519, 1824 }, { 1544, 1856 },
+ { 1569, 1889 }, { 1594, 1922 }, { 1618, 1954 }, { 1643, 1987 },
+ { 1668, 2020 }, { 1692, 2052 }, { 1717, 2085 }, { 1741, 2118 },
+ { 1765, 2150 }, { 1789, 2183 }, { 1814, 2216 }, { 1838, 2248 },
+ { 1862, 2281 }, { 1885, 2313 }, { 1909, 2346 }, { 1933, 2378 },
+ { 1957, 2411 }, { 1992, 2459 }, { 2027, 2508 }, { 2061, 2556 },
+ { 2096, 2605 }, { 2130, 2653 }, { 2165, 2701 }, { 2199, 2750 },
+ { 2233, 2798 }, { 2267, 2847 }, { 2300, 2895 }, { 2334, 2943 },
+ { 2367, 2992 }, { 2400, 3040 }, { 2434, 3088 }, { 2467, 3137 },
+ { 2499, 3185 }, { 2532, 3234 }, { 2575, 3298 }, { 2618, 3362 },
+ { 2661, 3426 }, { 2704, 3491 }, { 2746, 3555 }, { 2788, 3619 },
+ { 2830, 3684 }, { 2872, 3748 }, { 2913, 3812 }, { 2954, 3876 },
+ { 2995, 3941 }, { 3036, 4005 }, { 3076, 4069 }, { 3127, 4149 },
+ { 3177, 4230 }, { 3226, 4310 }, { 3275, 4390 }, { 3324, 4470 },
+ { 3373, 4550 }, { 3421, 4631 }, { 3469, 4711 }, { 3517, 4791 },
+ { 3565, 4871 }, { 3621, 4967 }, { 3677, 5064 }, { 3733, 5160 },
+ { 3788, 5256 }, { 3843, 5352 }, { 3897, 5448 }, { 3951, 5544 },
+ { 4005, 5641 }, { 4058, 5737 }, { 4119, 5849 }, { 4181, 5961 },
+ { 4241, 6073 }, { 4301, 6185 }, { 4361, 6297 }, { 4420, 6410 },
+ { 4479, 6522 }, { 4546, 6650 }, { 4612, 6778 }, { 4677, 6906 },
+ { 4742, 7034 }, { 4807, 7162 }, { 4871, 7290 }, { 4942, 7435 },
+ { 5013, 7579 }, { 5083, 7723 }, { 5153, 7867 }, { 5222, 8011 },
+ { 5291, 8155 }, { 5367, 8315 }, { 5442, 8475 }, { 5517, 8635 },
+ { 5591, 8795 }, { 5665, 8956 }, { 5745, 9132 }, { 5825, 9308 },
+ { 5905, 9484 }, { 5984, 9660 }, { 6063, 9836 }, { 6149, 10028 },
+ { 6234, 10220 }, { 6319, 10412 }, { 6404, 10604 }, { 6495, 10812 },
+ { 6587, 11020 }, { 6678, 11228 }, { 6769, 11437 }, { 6867, 11661 },
+ { 6966, 11885 }, { 7064, 12109 }, { 7163, 12333 }, { 7269, 12573 },
+ { 7376, 12813 }, { 7483, 13053 }, { 7599, 13309 }, { 7715, 13565 },
+ { 7832, 13821 }, { 7958, 14093 }, { 8085, 14365 }, { 8214, 14637 },
+ { 8352, 14925 }, { 8492, 15213 }, { 8635, 15502 }, { 8788, 15806 },
+ { 8945, 16110 }, { 9104, 16414 }, { 9275, 16734 }, { 9450, 17054 },
+ { 9639, 17390 }, { 9832, 17726 }, { 10031, 18062 }, { 10245, 18414 },
+ { 10465, 18766 }, { 10702, 19134 }, { 10946, 19502 }, { 11210, 19886 },
+ { 11482, 20270 }, { 11776, 20670 }, { 12081, 21070 }, { 12409, 21486 },
+ { 12750, 21902 }, { 13118, 22334 }, { 13501, 22766 }, { 13913, 23214 },
+ { 14343, 23662 }, { 14807, 24126 }, { 15290, 24590 }, { 15812, 25070 },
+ { 16356, 25551 }, { 16943, 26047 }, { 17575, 26559 }, { 18237, 27071 },
+ { 18949, 27599 }, { 19718, 28143 }, { 20521, 28687 }, { 21387, 29247 },
+ }
+};
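
The three outer tables correspond to 8-, 10- and 12-bit content, and each of the 256 rows holds a { DC, AC } dequantizer pair for one quantizer index, matching the AV1 dc/ac quantizer lookup tables. As a hypothetical illustration of how a lookup reads (it assumes linking against the table above; the helper name is made up):

    #include <stdint.h>
    #include <stdio.h>

    #define QINDEX_RANGE 256
    extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];

    /* Illustrative only: the first index selects 8-/10-/12-bit (0/1/2), the
     * middle index is a quantizer index in [0, 255], and the last picks
     * DC (0) or AC (1). */
    static void print_dequant(const int bitdepth_idx, const int qindex) {
        printf("bpc_idx=%d q=%d -> dc=%u ac=%u\n", bitdepth_idx, qindex,
               dav1d_dq_tbl[bitdepth_idx][qindex][0],
               dav1d_dq_tbl[bitdepth_idx][qindex][1]);
    }

    int main(void) {
        print_dequant(0, 0);   /* 8-bit, q=0:        dc=4, ac=4 */
        print_dequant(0, 100); /* 8-bit, mid range:  dc=99, ac=120 */
        print_dequant(2, 255); /* 12-bit, coarsest:  dc=21387, ac=29247 */
        return 0;
    }
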
diff --git a/third_party/dav1d/src/dequant_tables.h b/third_party/dav1d/src/dequant_tables.h
new file mode 100644
index 0000000000..17763377bc
--- /dev/null
+++ b/third_party/dav1d/src/dequant_tables.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_DEQUANT_TABLES_H
+#define DAV1D_SRC_DEQUANT_TABLES_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+
+EXTERN const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
+
+#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
diff --git a/third_party/dav1d/src/env.h b/third_party/dav1d/src/env.h
new file mode 100644
index 0000000000..7b91c4cab6
--- /dev/null
+++ b/third_party/dav1d/src/env.h
@@ -0,0 +1,521 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ENV_H
+#define DAV1D_SRC_ENV_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "src/levels.h"
+#include "src/refmvs.h"
+#include "src/tables.h"
+
+typedef struct BlockContext {
+ uint8_t ALIGN(mode[32], 8);
+ uint8_t ALIGN(lcoef[32], 8);
+ uint8_t ALIGN(ccoef[2][32], 8);
+ uint8_t ALIGN(seg_pred[32], 8);
+ uint8_t ALIGN(skip[32], 8);
+ uint8_t ALIGN(skip_mode[32], 8);
+ uint8_t ALIGN(intra[32], 8);
+ uint8_t ALIGN(comp_type[32], 8);
+ int8_t ALIGN(ref[2][32], 8); // -1 means intra
+ uint8_t ALIGN(filter[2][32], 8); // 3 means unset
+ int8_t ALIGN(tx_intra[32], 8);
+ int8_t ALIGN(tx[32], 8);
+ uint8_t ALIGN(tx_lpf_y[32], 8);
+ uint8_t ALIGN(tx_lpf_uv[32], 8);
+ uint8_t ALIGN(partition[16], 8);
+ uint8_t ALIGN(uvmode[32], 8);
+ uint8_t ALIGN(pal_sz[32], 8);
+} BlockContext;
+
+static inline int get_intra_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ if (have_left) {
+ if (have_top) {
+ const int ctx = l->intra[yb4] + a->intra[xb4];
+ return ctx + (ctx == 2);
+ } else
+ return l->intra[yb4] * 2;
+ } else {
+ return have_top ? a->intra[xb4] * 2 : 0;
+ }
+}
+
+static inline int get_tx_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const TxfmInfo *const max_tx,
+ const int yb4, const int xb4)
+{
+ return (l->tx_intra[yb4] >= max_tx->lh) + (a->tx_intra[xb4] >= max_tx->lw);
+}
+
+static inline int get_partition_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const enum BlockLevel bl,
+ const int yb8, const int xb8)
+{
+ return ((a->partition[xb8] >> (4 - bl)) & 1) +
+ (((l->partition[yb8] >> (4 - bl)) & 1) << 1);
+}
+
+static inline unsigned gather_left_partition_prob(const uint16_t *const in,
+ const enum BlockLevel bl)
+{
+ unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
+ // Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
+ // PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
+ out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
+ if (bl != BL_128X128)
+ out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
+ return out;
+}
+
+static inline unsigned gather_top_partition_prob(const uint16_t *const in,
+ const enum BlockLevel bl)
+{
+ // Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
+ // PARTITION_T_TOP_SPLIT are neighbors.
+ unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
+ // Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
+ // PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
+ // PARTITION_V4 is always zero, and the probability for
+ // PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
+ out += in[PARTITION_T_LEFT_SPLIT - 1];
+ if (bl != BL_128X128)
+ out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
+ return out;
+}
+
+static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
+ const enum TxfmType ytxtp)
+{
+ if (uvt_dim->max == TX_32X32)
+ return ytxtp == IDTX ? IDTX : DCT_DCT;
+ if (uvt_dim->min == TX_16X16 &&
+ ((1 << ytxtp) & ((1 << H_FLIPADST) | (1 << V_FLIPADST) |
+ (1 << H_ADST) | (1 << V_ADST))))
+ {
+ return DCT_DCT;
+ }
+
+ return ytxtp;
+}
+
+static inline int get_filter_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int comp, const int dir, const int ref,
+ const int yb4, const int xb4)
+{
+ const int a_filter = (a->ref[0][xb4] == ref || a->ref[1][xb4] == ref) ?
+ a->filter[dir][xb4] : DAV1D_N_SWITCHABLE_FILTERS;
+ const int l_filter = (l->ref[0][yb4] == ref || l->ref[1][yb4] == ref) ?
+ l->filter[dir][yb4] : DAV1D_N_SWITCHABLE_FILTERS;
+
+ if (a_filter == l_filter) {
+ return comp * 4 + a_filter;
+ } else if (a_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+ return comp * 4 + l_filter;
+ } else if (l_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+ return comp * 4 + a_filter;
+ } else {
+ return comp * 4 + DAV1D_N_SWITCHABLE_FILTERS;
+ }
+}
+
+static inline int get_comp_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ if (have_top) {
+ if (have_left) {
+ if (a->comp_type[xb4]) {
+ if (l->comp_type[yb4]) {
+ return 4;
+ } else {
+ // 4U means intra (-1) or bwd (>= 4)
+ return 2 + ((unsigned)l->ref[0][yb4] >= 4U);
+ }
+ } else if (l->comp_type[yb4]) {
+ // 4U means intra (-1) or bwd (>= 4)
+ return 2 + ((unsigned)a->ref[0][xb4] >= 4U);
+ } else {
+ return (l->ref[0][yb4] >= 4) ^ (a->ref[0][xb4] >= 4);
+ }
+ } else {
+ return a->comp_type[xb4] ? 3 : a->ref[0][xb4] >= 4;
+ }
+ } else if (have_left) {
+ return l->comp_type[yb4] ? 3 : l->ref[0][yb4] >= 4;
+ } else {
+ return 1;
+ }
+}
+
+static inline int get_comp_dir_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+#define has_uni_comp(edge, off) \
+ ((edge->ref[0][off] < 4) == (edge->ref[1][off] < 4))
+
+ if (have_top && have_left) {
+ const int a_intra = a->intra[xb4], l_intra = l->intra[yb4];
+
+ if (a_intra && l_intra) return 2;
+ if (a_intra || l_intra) {
+ const BlockContext *const edge = a_intra ? l : a;
+ const int off = a_intra ? yb4 : xb4;
+
+ if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+ return 1 + 2 * has_uni_comp(edge, off);
+ }
+
+ const int a_comp = a->comp_type[xb4] != COMP_INTER_NONE;
+ const int l_comp = l->comp_type[yb4] != COMP_INTER_NONE;
+ const int a_ref0 = a->ref[0][xb4], l_ref0 = l->ref[0][yb4];
+
+ if (!a_comp && !l_comp) {
+ return 1 + 2 * ((a_ref0 >= 4) == (l_ref0 >= 4));
+ } else if (!a_comp || !l_comp) {
+ const BlockContext *const edge = a_comp ? a : l;
+ const int off = a_comp ? xb4 : yb4;
+
+ if (!has_uni_comp(edge, off)) return 1;
+ return 3 + ((a_ref0 >= 4) == (l_ref0 >= 4));
+ } else {
+ const int a_uni = has_uni_comp(a, xb4), l_uni = has_uni_comp(l, yb4);
+
+ if (!a_uni && !l_uni) return 0;
+ if (!a_uni || !l_uni) return 2;
+ return 3 + ((a_ref0 == 4) == (l_ref0 == 4));
+ }
+ } else if (have_top || have_left) {
+ const BlockContext *const edge = have_left ? l : a;
+ const int off = have_left ? yb4 : xb4;
+
+ if (edge->intra[off]) return 2;
+ if (edge->comp_type[off] == COMP_INTER_NONE) return 2;
+ return 4 * has_uni_comp(edge, off);
+ } else {
+ return 2;
+ }
+}
+
+static inline int get_poc_diff(const int order_hint_n_bits,
+ const int poc0, const int poc1)
+{
+ if (!order_hint_n_bits) return 0;
+ const int mask = 1 << (order_hint_n_bits - 1);
+ const int diff = poc0 - poc1;
+ return (diff & (mask - 1)) - (diff & mask);
+}
+
+static inline int get_jnt_comp_ctx(const int order_hint_n_bits,
+ const unsigned poc, const unsigned ref0poc,
+ const unsigned ref1poc,
+ const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4)
+{
+ const unsigned d0 = abs(get_poc_diff(order_hint_n_bits, ref0poc, poc));
+ const unsigned d1 = abs(get_poc_diff(order_hint_n_bits, poc, ref1poc));
+ const int offset = d0 == d1;
+ const int a_ctx = a->comp_type[xb4] >= COMP_INTER_AVG ||
+ a->ref[0][xb4] == 6;
+ const int l_ctx = l->comp_type[yb4] >= COMP_INTER_AVG ||
+ l->ref[0][yb4] == 6;
+
+ return 3 * offset + a_ctx + l_ctx;
+}
+
+static inline int get_mask_comp_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4)
+{
+ const int a_ctx = a->comp_type[xb4] >= COMP_INTER_SEG ? 1 :
+ a->ref[0][xb4] == 6 ? 3 : 0;
+ const int l_ctx = l->comp_type[yb4] >= COMP_INTER_SEG ? 1 :
+ l->ref[0][yb4] == 6 ? 3 : 0;
+
+ return imin(a_ctx + l_ctx, 5);
+}
+
+#define av1_get_ref_2_ctx av1_get_bwd_ref_ctx
+#define av1_get_ref_3_ctx av1_get_fwd_ref_ctx
+#define av1_get_ref_4_ctx av1_get_fwd_ref_1_ctx
+#define av1_get_ref_5_ctx av1_get_fwd_ref_2_ctx
+#define av1_get_ref_6_ctx av1_get_bwd_ref_1_ctx
+#define av1_get_uni_p_ctx av1_get_ref_ctx
+#define av1_get_uni_p2_ctx av1_get_fwd_ref_2_ctx
+
+static inline int av1_get_ref_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ int have_top, int have_left)
+{
+ int cnt[2] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ cnt[a->ref[0][xb4] >= 4]++;
+ if (a->comp_type[xb4]) cnt[a->ref[1][xb4] >= 4]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ cnt[l->ref[0][yb4] >= 4]++;
+ if (l->comp_type[yb4]) cnt[l->ref[1][yb4] >= 4]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[4] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] < 4) cnt[a->ref[0][xb4]]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] < 4) cnt[a->ref[1][xb4]]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] < 4) cnt[l->ref[0][yb4]]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] < 4) cnt[l->ref[1][yb4]]++;
+ }
+
+ cnt[0] += cnt[1];
+ cnt[2] += cnt[3];
+
+ return cnt[0] == cnt[2] ? 1 : cnt[0] < cnt[2] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_1_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[2] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] < 2) cnt[a->ref[0][xb4]]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] < 2) cnt[a->ref[1][xb4]]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] < 2) cnt[l->ref[0][yb4]]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] < 2) cnt[l->ref[1][yb4]]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_fwd_ref_2_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[2] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if ((a->ref[0][xb4] ^ 2U) < 2) cnt[a->ref[0][xb4] - 2]++;
+ if (a->comp_type[xb4] && (a->ref[1][xb4] ^ 2U) < 2) cnt[a->ref[1][xb4] - 2]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if ((l->ref[0][yb4] ^ 2U) < 2) cnt[l->ref[0][yb4] - 2]++;
+ if (l->comp_type[yb4] && (l->ref[1][yb4] ^ 2U) < 2) cnt[l->ref[1][yb4] - 2]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_bwd_ref_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[3] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
+ }
+
+ cnt[1] += cnt[0];
+
+ return cnt[2] == cnt[1] ? 1 : cnt[1] < cnt[2] ? 0 : 2;
+}
+
+static inline int av1_get_bwd_ref_1_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[3] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] >= 4) cnt[a->ref[0][xb4] - 4]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] >= 4) cnt[a->ref[1][xb4] - 4]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] >= 4) cnt[l->ref[0][yb4] - 4]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] >= 4) cnt[l->ref[1][yb4] - 4]++;
+ }
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int av1_get_uni_p1_ctx(const BlockContext *const a,
+ const BlockContext *const l,
+ const int yb4, const int xb4,
+ const int have_top, const int have_left)
+{
+ int cnt[3] = { 0 };
+
+ if (have_top && !a->intra[xb4]) {
+ if (a->ref[0][xb4] - 1U < 3) cnt[a->ref[0][xb4] - 1]++;
+ if (a->comp_type[xb4] && a->ref[1][xb4] - 1U < 3) cnt[a->ref[1][xb4] - 1]++;
+ }
+
+ if (have_left && !l->intra[yb4]) {
+ if (l->ref[0][yb4] - 1U < 3) cnt[l->ref[0][yb4] - 1]++;
+ if (l->comp_type[yb4] && l->ref[1][yb4] - 1U < 3) cnt[l->ref[1][yb4] - 1]++;
+ }
+
+ cnt[1] += cnt[2];
+
+ return cnt[0] == cnt[1] ? 1 : cnt[0] < cnt[1] ? 0 : 2;
+}
+
+static inline int get_drl_context(const refmvs_candidate *const ref_mv_stack,
+ const int ref_idx)
+{
+ if (ref_mv_stack[ref_idx].weight >= 640)
+ return ref_mv_stack[ref_idx + 1].weight < 640;
+
+ return ref_mv_stack[ref_idx + 1].weight < 640 ? 2 : 0;
+}
+
+static inline unsigned get_cur_frame_segid(const int by, const int bx,
+ const int have_top,
+ const int have_left,
+ int *const seg_ctx,
+ const uint8_t *cur_seg_map,
+ const ptrdiff_t stride)
+{
+ cur_seg_map += bx + by * stride;
+ if (have_left && have_top) {
+ const int l = cur_seg_map[-1];
+ const int a = cur_seg_map[-stride];
+ const int al = cur_seg_map[-(stride + 1)];
+
+ if (l == a && al == l) *seg_ctx = 2;
+ else if (l == a || al == l || a == al) *seg_ctx = 1;
+ else *seg_ctx = 0;
+ return a == al ? a : l;
+ } else {
+ *seg_ctx = 0;
+ return have_left ? cur_seg_map[-1] : have_top ? cur_seg_map[-stride] : 0;
+ }
+}
+
+static inline void fix_int_mv_precision(mv *const mv) {
+ mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U;
+ mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U;
+}
+
+static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr,
+ mv *const mv)
+{
+ if (hdr->force_integer_mv) {
+ fix_int_mv_precision(mv);
+ } else if (!hdr->hp) {
+ mv->x = (mv->x - (mv->x >> 15)) & ~1U;
+ mv->y = (mv->y - (mv->y >> 15)) & ~1U;
+ }
+}
+
+static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
+ const int bx4, const int by4,
+ const int bw4, const int bh4,
+ const Dav1dFrameHeader *const hdr)
+{
+ switch (gmv->type) {
+ case DAV1D_WM_TYPE_ROT_ZOOM:
+ assert(gmv->matrix[5] == gmv->matrix[2]);
+ assert(gmv->matrix[4] == -gmv->matrix[3]);
+ // fall-through
+ default:
+ case DAV1D_WM_TYPE_AFFINE: {
+ const int x = bx4 * 4 + bw4 * 2 - 1;
+ const int y = by4 * 4 + bh4 * 2 - 1;
+ const int xc = (gmv->matrix[2] - (1 << 16)) * x +
+ gmv->matrix[3] * y + gmv->matrix[0];
+ const int yc = (gmv->matrix[5] - (1 << 16)) * y +
+ gmv->matrix[4] * x + gmv->matrix[1];
+ const int shift = 16 - (3 - !hdr->hp);
+ const int round = (1 << shift) >> 1;
+ mv res = (mv) {
+ .y = apply_sign(((abs(yc) + round) >> shift) << !hdr->hp, yc),
+ .x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc),
+ };
+ if (hdr->force_integer_mv)
+ fix_int_mv_precision(&res);
+ return res;
+ }
+ case DAV1D_WM_TYPE_TRANSLATION: {
+ mv res = (mv) {
+ .y = gmv->matrix[0] >> 13,
+ .x = gmv->matrix[1] >> 13,
+ };
+ if (hdr->force_integer_mv)
+ fix_int_mv_precision(&res);
+ return res;
+ }
+ case DAV1D_WM_TYPE_IDENTITY:
+ return (mv) { .x = 0, .y = 0 };
+ }
+}
+
+#endif /* DAV1D_SRC_ENV_H */
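
Two of the helpers above are easy to sanity-check numerically: get_poc_diff() folds the difference of two frame order hints back into the signed range implied by the sequence header's order-hint bit count, and fix_int_mv_precision() rounds eighth-pel motion vectors to whole pixels. A small self-contained sketch, with an assumed 7-bit hint width and arbitrary sample values, re-implements just those two expressions:

    #include <assert.h>
    #include <stdint.h>

    /* Same arithmetic as get_poc_diff() and fix_int_mv_precision() in env.h,
     * copied here so it can be checked in isolation. */
    static int poc_diff(int order_hint_n_bits, int poc0, int poc1) {
        if (!order_hint_n_bits) return 0;
        const int mask = 1 << (order_hint_n_bits - 1);
        const int diff = poc0 - poc1;
        return (diff & (mask - 1)) - (diff & mask);
    }

    static int16_t round_to_fullpel(int16_t v) { /* v is in 1/8-pel units */
        return (int16_t)((v - (v >> 15) + 3) & ~7U);
    }

    int main(void) {
        /* With 7-bit order hints, hint values wrap at 128: frame 2 is four
         * frames after frame 126 (126 -> 127 -> 0 -> 1 -> 2), not 124 before. */
        assert(poc_diff(7, 2, 126) ==  4);
        assert(poc_diff(7, 126, 2) == -4);
        assert(poc_diff(7, 10, 3)  ==  7);

        /* 13/8 pel rounds to 2 pel (16/8); -12/8, an exact half, goes to -1. */
        assert(round_to_fullpel(13)  ==  16);
        assert(round_to_fullpel(-13) == -16);
        assert(round_to_fullpel(-12) ==  -8);
        return 0;
    }

The masked subtraction in poc_diff() is simply a sign extension: the low order_hint_n_bits bits of the difference are reinterpreted as a signed value of that width.
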
diff --git a/third_party/dav1d/src/ext/x86/x86inc.asm b/third_party/dav1d/src/ext/x86/x86inc.asm
new file mode 100644
index 0000000000..68b1f74f4b
--- /dev/null
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -0,0 +1,1902 @@
+;*****************************************************************************
+;* x86inc.asm: x86 abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2022 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Henrik Gramner <henrik@gramner.com>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Fiona Glaser <fiona@x264.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x86inc.asm abstraction layer, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used.
+
+%ifndef private_prefix
+ %error private_prefix not defined
+%endif
+
+%ifndef public_prefix
+ %define public_prefix private_prefix
+%endif
+
+%ifndef STACK_ALIGNMENT
+ %if ARCH_X86_64
+ %define STACK_ALIGNMENT 16
+ %else
+ %define STACK_ALIGNMENT 4
+ %endif
+%endif
+
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,x64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%define FORMAT_ELF 0
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,elf
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,macho
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+ %define FORMAT_MACHO 1
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; Use VEX-encoding even in non-AVX functions
+%ifndef FORCE_VEX_ENCODING
+ %define FORCE_VEX_ENCODING 0
+%endif
+
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,win32
+ SECTION .rdata align=%1
+ %elif WIN64
+ SECTION .rdata align=%1
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define PIC 1 ; always use PIC on x86-64
+ default rel
+%elifidn __OUTPUT_FORMAT__,win32
+ %define PIC 0 ; PIC isn't used on 32-bit Windows
+%elifndef PIC
+ %define PIC 0
+%endif
+
+%define HAVE_PRIVATE_EXTERN 1
+%ifdef __NASM_VER__
+ %use smartalign
+ %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+ %define HAVE_PRIVATE_EXTERN 0
+ %endif
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most use cases.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+; allocating the specified stack size. If the required stack alignment is
+; larger than the known stack alignment the stack will be manually aligned
+; and an extra register will be allocated to hold the original stack
+; pointer (to not invalidate r0m etc.). To prevent the use of an extra
+; register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %define %2q %2
+ %if %0 == 2
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp qword r %+ %1 %+ m
+ %else
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp dword r %+ %1 %+ m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+ %if ARCH_X86_64 == 0
+ %define r%1 e%1
+ %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro LEA 2
+%if ARCH_X86_64
+ lea %1, [%2]
+%elif PIC
+ call $+5 ; special-cased to not affect the RSB on most CPU:s
+ pop %1
+ add %1, (%2)-$+1
+%else
+ mov %1, %2
+%endif
+%endmacro
+
+; Repeats an instruction/operation for multiple arguments.
+; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
+%macro REPX 2-* ; operation, args
+ %xdefine %%f(x) %1
+ %rep %0 - 1
+ %rotate 1
+ %%f(%1)
+ %endrep
+%endmacro
+
+%macro PUSH 1
+ push %1
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset+gprsize
+ %endif
+%endmacro
+
+%macro POP 1
+ pop %1
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset-gprsize
+ %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%if ARCH_X86_64 == 0
+ %define movsxd movifnidn
+%endif
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assertion ``%1'' failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
+
+; Large stack allocations on Windows need to use stack probing in order
+; to guarantee that all stack memory is committed before accessing it.
+; This is done by ensuring that the guard page(s) at the end of the
+; currently committed pages are touched prior to any pages beyond that.
+%if WIN64
+ %assign STACK_PROBE_SIZE 8192
+%elifidn __OUTPUT_FORMAT__, win32
+ %assign STACK_PROBE_SIZE 4096
+%else
+ %assign STACK_PROBE_SIZE 0
+%endif
+
+%macro PROBE_STACK 1 ; stack_size
+ %if STACK_PROBE_SIZE
+ %assign %%i STACK_PROBE_SIZE
+ %rep %1 / STACK_PROBE_SIZE
+ mov eax, [rsp-%%i]
+ %assign %%i %%i+STACK_PROBE_SIZE
+ %endrep
+ %endif
+%endmacro
+
+%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
+ %ifnum %1
+ %if %1 != 0
+ %assign %%pad 0
+ %assign stack_size %1
+ %if stack_size < 0
+ %assign stack_size -stack_size
+ %endif
+ %if WIN64
+ %assign %%pad %%pad + 32 ; shadow space
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %if xmm_regs_used > 8
+ %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+ %endif
+ %endif
+ %endif
+ %if required_stack_alignment <= STACK_ALIGNMENT
+ ; maintain the current stack alignment
+ %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ PROBE_STACK stack_size_padded
+ SUB rsp, stack_size_padded
+ %else
+ %assign %%reg_num (regs_used - 1)
+ %xdefine rstk r %+ %%reg_num
+ ; align stack, and save original stack location directly above
+ ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+ ; stack in a single instruction (i.e. mov rsp, rstk or mov
+ ; rsp, [rsp+stack_size_padded])
+ %if %1 < 0 ; need to store rsp on stack
+ %xdefine rstkm [rsp + stack_size + %%pad]
+ %assign %%pad %%pad + gprsize
+ %else ; can keep rsp in rstk during whole function
+ %xdefine rstkm rstk
+ %endif
+ %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+ PROBE_STACK stack_size_padded
+ mov rstk, rsp
+ and rsp, ~(required_stack_alignment-1)
+ sub rsp, stack_size_padded
+ movifnidn rstkm, rstk
+ %endif
+ WIN64_PUSH_XMM
+ %endif
+ %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 0-1 0
+ %ifnum %1
+ %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+ %if %1 > 0
+ ; Reserve an additional register for storing the original stack pointer, but avoid using
+ ; eax/rax for this purpose since it can potentially get overwritten as a return value.
+ %assign regs_used (regs_used + 1)
+ %if ARCH_X86_64 && regs_used == 7
+ %assign regs_used 8
+ %elif ARCH_X86_64 == 0 && regs_used == 1
+ %assign regs_used 2
+ %endif
+ %endif
+ %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+ ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+ ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
+ %assign regs_used 5 + UNIX64 * 3
+ %endif
+ %endif
+ %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
+
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4, %3
+ %if mmsize != 8 && stack_size == 0
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ %if %0 > 4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %else
+ DEFINE_ARGS %4, %5
+ %endif
+ %elifnnum %4
+ DEFINE_ARGS %4
+ %endif
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+ ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+ %if xmm_regs_used > 6 + high_mm_regs
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if xmm_regs_used > 7 + high_mm_regs
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i 8
+ %rep %%xmm_regs_on_stack
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
+ %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %endif
+ WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 0
+ %assign %%pad_size 0
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack
+ %assign %%i %%i-1
+ movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
+ %endrep
+ %endif
+ %if stack_size_padded > 0
+ %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %assign %%pad_size stack_size_padded
+ %endif
+ %endif
+ %if xmm_regs_used > 7 + high_mm_regs
+ movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
+ %endif
+ %if xmm_regs_used > 6 + high_mm_regs
+ movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 0
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign stack_offset (stack_offset-stack_size_padded)
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
+
+%macro RET 0
+ WIN64_RESTORE_XMM_INTERNAL
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
+
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ %assign xmm_regs_used %3
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ %if %0 > 4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %else
+ DEFINE_ARGS %4, %5
+ %endif
+ %elifnnum %4
+ DEFINE_ARGS %4
+ %endif
+%endmacro
+
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
+
+%macro RET 0
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [rstk + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ %if num_args > 7
+ %assign num_args 7
+ %endif
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3, 4, 5, 6
+ ALLOC_STACK %4
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ %if %0 > 4
+ %ifnum %4
+ DEFINE_ARGS %5
+ %else
+ DEFINE_ARGS %4, %5
+ %endif
+ %elifnnum %4
+ DEFINE_ARGS %4
+ %endif
+%endmacro
+
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
+
+%macro RET 0
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 6, 5, 4, 3
+ %if vzeroupper_required
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+ %macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ %endmacro
+ %macro WIN64_RESTORE_XMM 0
+ %assign xmm_regs_used 0
+ %endmacro
+ %macro WIN64_PUSH_XMM 0
+ %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0
+ %if has_epilogue || cpuflag(ssse3)
+ RET
+ %else
+ rep ret
+ %endif
+ annotate_function_size
+%endmacro
+
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+ %if notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+ %endif
+ ret
+ annotate_function_size
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+ %rep %0
+ %macro %1 1-2 %1
+ %2 %1
+ %if notcpuflag(ssse3)
+ %%branch_instr equ $
+ %xdefine last_branch_adr %%branch_instr
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
+ %if has_epilogue
+ call %1
+ RET
+ %elif %2
+ jmp %1
+ %endif
+ annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 1, %1 %+ SUFFIX, %2
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+
+ annotate_function_size
+ %ifndef cglobaled_%2
+ %if %1
+ %xdefine %2 mangle(private_prefix %+ _ %+ %2)
+ %else
+ %xdefine %2 mangle(public_prefix %+ _ %+ %2)
+ %endif
+ %xdefine %2.skip_prologue %2 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %2, 1
+ %endif
+ %xdefine current_function %2
+ %xdefine current_function_section __SECT__
+ %if FORMAT_ELF
+ %if %1
+ global %2:function hidden
+ %else
+ global %2:function
+ %endif
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+ global %2:private_extern
+ %else
+ global %2
+ %endif
+ align function_align
+ %2:
+ RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+ %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+ %assign stack_offset 0 ; stack pointer offset relative to the return address
+ %assign stack_size 0 ; amount of stack space that can be freely used inside a function
+ %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
+ %ifnidn %3, ""
+ PROLOGUE %3
+ %endif
+%endmacro
+
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+ %if FORMAT_ELF
+ global current_function %+ %1:function hidden
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+ global current_function %+ %1:private_extern
+ %else
+ global current_function %+ %1
+ %endif
+ %1:
+%endmacro
+
+%macro cextern 1
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+ %ifdef PREFIX
+ %xdefine %1 mangle(%1)
+ %endif
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 1-2+
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ %if FORMAT_ELF
+ global %1:data hidden
+ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+ global %1:private_extern
+ %else
+ global %1
+ %endif
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+ [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+ %ifdef __YASM_VER__
+ %ifdef current_function
+ %if FORMAT_ELF
+ current_function_section
+ %%ecf equ $
+ size current_function %%ecf - current_function
+ __SECT__
+ %endif
+ %endif
+ %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx (1<<0)
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3
+%assign cpuflags_sse42 (1<<11) | cpuflags_sse4
+%assign cpuflags_aesni (1<<12) | cpuflags_sse42
+%assign cpuflags_gfni (1<<13) | cpuflags_sse42
+%assign cpuflags_avx (1<<14) | cpuflags_sse42
+%assign cpuflags_xop (1<<15) | cpuflags_avx
+%assign cpuflags_fma4 (1<<16) | cpuflags_avx
+%assign cpuflags_fma3 (1<<17) | cpuflags_avx
+%assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1
+%assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
+
+%assign cpuflags_cache32 (1<<23)
+%assign cpuflags_cache64 (1<<24)
+%assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<26)
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
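+; For example, in a function declared after "INIT_XMM ssse3", cpuflag(sse2)
+; evaluates to 1 (sse2 is implied by ssse3) and cpuflag(avx) evaluates to 0.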
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-*
+ %xdefine SUFFIX
+ %undef cpuname
+ %assign cpuflags 0
+
+ %if %0 >= 1
+ %rep %0
+ %ifdef cpuname
+ %xdefine cpuname cpuname %+ _%1
+ %else
+ %xdefine cpuname %1
+ %endif
+ %assign cpuflags cpuflags | cpuflags_%1
+ %rotate 1
+ %endrep
+ %xdefine SUFFIX _ %+ cpuname
+
+ %if cpuflag(avx)
+ %assign avx_enabled 1
+ %endif
+ %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elif cpuflag(sse3) && notcpuflag(ssse3)
+ %define movu lddqu
+ %endif
+ %endif
+
+ %if ARCH_X86_64 || cpuflag(sse2)
+ %ifdef __NASM_VER__
+ ALIGNMODE p6
+ %else
+ CPU amdnop
+ %endif
+ %else
+ %ifdef __NASM_VER__
+ ALIGNMODE nop
+ %else
+ CPU basicnop
+ %endif
+ %endif
+%endmacro
+
+; Merge mmx, sse*, and avx*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
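+;
+; For example, under INIT_YMM, m0 refers to ymm0 while xm0 refers to xmm0;
+; under INIT_XMM, m0, xm0 and ym0 all refer to xmm0.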
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro DEFINE_MMREGS 1 ; mmtype
+ %assign %%prev_mmregs 0
+ %ifdef num_mmregs
+ %assign %%prev_mmregs num_mmregs
+ %endif
+
+ %assign num_mmregs 8
+ %if ARCH_X86_64 && mmsize >= 16
+ %assign num_mmregs 16
+ %if cpuflag(avx512) || mmsize == 64
+ %assign num_mmregs 32
+ %endif
+ %endif
+
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1 %+ %%i
+ CAT_XDEFINE nn%1, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %if %%prev_mmregs > num_mmregs
+ %rep %%prev_mmregs - num_mmregs
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nn %+ mmtype, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+ %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro INIT_MMX 0-1+
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS mm
+%endmacro
+
+%macro INIT_XMM 0-1+
+ %assign avx_enabled FORCE_VEX_ENCODING
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS xmm
+ %if WIN64
+ AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+ %endif
+ %xdefine bcstd 1to4
+ %xdefine bcstq 1to2
+%endmacro
+
+%macro INIT_YMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS ymm
+ AVX512_MM_PERMUTATION
+ %xdefine bcstd 1to8
+ %xdefine bcstq 1to4
+%endmacro
+
+%macro INIT_ZMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS zmm
+ AVX512_MM_PERMUTATION
+ %xdefine bcstd 1to16
+ %xdefine bcstq 1to8
+%endmacro
+
+INIT_XMM
+
+%macro DECLARE_MMCAST 1
+ %define mmmm%1 mm%1
+ %define mmxmm%1 mm%1
+ %define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
+ %define xmmmm%1 mm%1
+ %define xmmxmm%1 xmm%1
+ %define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
+ %define ymmmm%1 mm%1
+ %define ymmxmm%1 xmm%1
+ %define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
+ %define xm%1 xmm %+ m%1
+ %define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 32
+ DECLARE_MMCAST i
+ %assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
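+;
+; For example, after "SWAP 0, 1" every subsequent use of m0 refers to the
+; register previously named m1 and vice versa; no mov is emitted, the register
+; names are simply exchanged at assembly time.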
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+ %rep %0/2
+ %xdefine %%tmp%2 m%2
+ %rotate 2
+ %endrep
+ %rep %0/2
+ %xdefine m%1 %%tmp%2
+ CAT_XDEFINE nn, m%1, %1
+ %rotate 2
+ %endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+ %ifnum %1 ; SWAP 0, 1, ...
+ SWAP_INTERNAL_NUM %1, %2
+ %else ; SWAP m0, m1, ...
+ SWAP_INTERNAL_NAME %1, %2
+ %endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-*
+ %rep %0-1
+ %xdefine %%tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 %%tmp
+ CAT_XDEFINE nn, m%1, %1
+ CAT_XDEFINE nn, m%2, %2
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-*
+ %xdefine %%args nn %+ %1
+ %rep %0-1
+ %xdefine %%args %%args, nn %+ %2
+ %rotate 1
+ %endrep
+ SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs
+ %xdefine %%tmp m %+ %%i
+ CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %xdefine %%tmp %%f %+ 0
+ %ifnum %%tmp
+ DEFINE_MMREGS mmtype
+ %assign %%i 0
+ %rep num_mmregs
+ %xdefine %%tmp %%f %+ %%i
+ CAT_XDEFINE %%m, %%i, m %+ %%tmp
+ %assign %%i %%i+1
+ %endrep
+ %rep num_mmregs
+ %assign %%i %%i-1
+ CAT_XDEFINE m, %%i, %%m %+ %%i
+ CAT_XDEFINE nn, m %+ %%i, %%i
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+ %ifid %1
+ call_internal %1 %+ SUFFIX, %1
+ %else
+ call %1
+ %endif
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %2
+ %ifndef cglobaled_%2
+ %ifdef cglobaled_%1
+ %xdefine %%i %1
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
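+
+; For example, "add eax, 128" requires a 4-byte immediate (signed imm8 only
+; covers -128..127), whereas the equivalent "sub eax, -128" fits in a 1-byte
+; sign-extended immediate, so the macros above rewrite it that way.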
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 32
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
+ %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-avx emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+ %ifnum sizeof%7
+ %assign __sizeofreg sizeof%7
+ %elifnum sizeof%6
+ %assign __sizeofreg sizeof%6
+ %else
+ %assign __sizeofreg mmsize
+ %endif
+ %assign __emulate_avx 0
+ %if avx_enabled && __sizeofreg >= 16
+ %xdefine __instr v%1
+ %else
+ %xdefine __instr %1
+ %if %0 >= 8+%4
+ %assign __emulate_avx 1
+ %endif
+ %endif
+ %ifnidn %2, fnord
+ %ifdef cpuname
+ %if notcpuflag(%2)
+ %error use of ``%1'' %2 instruction in cpuname function: current_function
+ %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
+ %error use of ``%1'' sse2 instruction in cpuname function: current_function
+ %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+ %error use of ``%1'' avx2 instruction in cpuname function: current_function
+ %elif __sizeofreg == 16 && notcpuflag(sse)
+ %error use of ``%1'' sse instruction in cpuname function: current_function
+ %elif __sizeofreg == 32 && notcpuflag(avx)
+ %error use of ``%1'' avx instruction in cpuname function: current_function
+ %elif __sizeofreg == 64 && notcpuflag(avx512)
+ %error use of ``%1'' avx512 instruction in cpuname function: current_function
+ %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+ %ifnid %6 ; but sse4 is required for memory operands
+ %if notcpuflag(sse4)
+ %error use of ``%1'' sse4 instruction in cpuname function: current_function
+ %endif
+ %endif
+ %endif
+ %endif
+ %endif
+
+ %if __emulate_avx
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %if %5 && %4 == 0
+ %ifnidn %6, %7
+ %ifidn %6, %8
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %elifnnum sizeof%8
+ ; 3-operand AVX instructions with a memory arg can only have it in src2,
+ ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+ ; So, if the instruction is commutative with a memory arg, swap them.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %endif
+ %ifnidn %6, __src1
+ %if %0 >= 9
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+ %endif
+ %if __sizeofreg == 8
+ MOVQ %6, __src1
+ %elif %3
+ MOVAPS %6, __src1
+ %else
+ MOVDQA %6, __src1
+ %endif
+ %endif
+ %if %0 >= 9
+ %1 %6, __src2, %9
+ %else
+ %1 %6, __src2
+ %endif
+ %elif %0 >= 9
+ %if avx_enabled && __sizeofreg >= 16 && %4 == 1
+ %ifnnum regnumof%7
+ %if %3
+ vmovaps %6, %7
+ %else
+ vmovdqa %6, %7
+ %endif
+ __instr %6, %6, %8, %9
+ %else
+ __instr %6, %7, %8, %9
+ %endif
+ %else
+ __instr %6, %7, %8, %9
+ %endif
+ %elif %0 == 8
+ %if avx_enabled && __sizeofreg >= 16 && %4 == 0
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %if %5
+ %ifnum regnumof%7
+ %ifnum regnumof%8
+ %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+ ; Most VEX-encoded instructions require an additional byte to encode when
+ ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+ ; we can swap src1 and src2 when doing so reduces the instruction length.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %elifnum regnumof%8 ; put memory operands in src2 when possible
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %else
+ %assign __emulate_avx 1
+ %endif
+ %elifnnum regnumof%7
+ ; EVEX allows imm8 shift instructions to be used with memory operands,
+ ; but VEX does not. This handles those special cases.
+ %ifnnum %8
+ %assign __emulate_avx 1
+ %elif notcpuflag(avx512)
+ %assign __emulate_avx 1
+ %endif
+ %endif
+ %if __emulate_avx ; a separate load is required
+ %if %3
+ vmovaps %6, %7
+ %else
+ vmovdqa %6, %7
+ %endif
+ __instr %6, %6, %8
+ %else
+ __instr %6, __src1, __src2
+ %endif
+ %else
+ __instr %6, %7, %8
+ %endif
+ %elif %0 == 7
+ %if avx_enabled && __sizeofreg >= 16 && %5
+ %xdefine __src1 %6
+ %xdefine __src2 %7
+ %ifnum regnumof%6
+ %ifnum regnumof%7
+ %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+ %xdefine __src1 %7
+ %xdefine __src2 %6
+ %endif
+ %endif
+ %endif
+ __instr %6, __src1, __src2
+ %else
+ __instr %6, %7
+ %endif
+ %else
+ __instr %6
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
+ %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+ %ifidn %2, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+ %elifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
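+
+; For example, given "AVX_INSTR addps, sse, 1, 0, 1" below, writing
+; "addps m0, m1, m2" emits "vaddps m0, m1, m2" in AVX functions and is
+; emulated as "movaps m0, m1" followed by "addps m0, m2" in pre-AVX functions.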
+
+; Instructions with both VEX/EVEX and legacy encodings
+; Non-destructive instructions are written without parameters
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2, 1, 0, 1
+AVX_INSTR cmpordps, sse, 1, 0, 1
+AVX_INSTR cmpordsd, sse2, 1, 0, 0
+AVX_INSTR cmpordss, sse, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse, 1
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3, 1
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3, 0, 1, 0
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4, 1
+AVX_INSTR roundps, sse4, 1
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse, 1
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1
+AVX_INSTR sqrtps, sse, 1
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse, 1
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2, 1
+AVX_INSTR ucomiss, sse, 1
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow! instructions, for sharing code between AVX, SSE and 3DNow!
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+
+;%1 == instruction
+;%2 == minimal instruction set
+%macro GPR_INSTR 2
+ %macro %1 2-5 fnord, %1, %2
+ %ifdef cpuname
+ %if notcpuflag(%5)
+ %error use of ``%4'' %5 instruction in cpuname function: current_function
+ %endif
+ %endif
+ %ifidn %3, fnord
+ %4 %1, %2
+ %else
+ %4 %1, %2, %3
+ %endif
+ %endmacro
+%endmacro
+
+GPR_INSTR andn, bmi1
+GPR_INSTR bextr, bmi1
+GPR_INSTR blsi, bmi1
+GPR_INSTR blsmsk, bmi1
+GPR_INSTR blsr, bmi1
+GPR_INSTR bzhi, bmi2
+GPR_INSTR mulx, bmi2
+GPR_INSTR pdep, bmi2
+GPR_INSTR pext, bmi2
+GPR_INSTR popcnt, sse42
+GPR_INSTR rorx, bmi2
+GPR_INSTR sarx, bmi2
+GPR_INSTR shlx, bmi2
+GPR_INSTR shrx, bmi2
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+ %assign i i+1
+%endrep
+%undef i
+%undef j
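+; For example, q3120 expands to 0xd8 ((3<<6)|(1<<4)|(2<<2)|0), usable directly
+; as the immediate for pshufd/shufps-style shuffles.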
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %elifnidn %1, %4
+ %6 %1, %2, %3
+ %7 %1, %4
+ %else
+ %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
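+; For example, without XOP "pmacsww m0, m1, m2, m3" is emulated as
+; "pmullw m0, m1, m2" followed by "paddw m0, m3" (the destination must not
+; alias the addend), while with XOP it assembles to "vpmacsww m0, m1, m2, m3".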
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+ %push fma4_instr
+ %xdefine %$prefix %1
+ %rep %0 - 1
+ %macro %$prefix%2 4-6 %$prefix, %2
+ %if notcpuflag(fma3) && notcpuflag(fma4)
+ %error use of ``%5%6'' fma instruction in cpuname function: current_function
+ %elif cpuflag(fma4)
+ v%5%6 %1, %2, %3, %4
+ %elifidn %1, %2
+ ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+ %ifnum sizeof%3
+ v%{5}213%6 %2, %3, %4
+ %else
+ v%{5}132%6 %2, %4, %3
+ %endif
+ %elifidn %1, %3
+ v%{5}213%6 %3, %2, %4
+ %elifidn %1, %4
+ v%{5}231%6 %4, %2, %3
+ %else
+ %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+ %pop
+%endmacro
+
+FMA4_INSTR fmadd, pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub, pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd, pd, ps, sd, ss
+FMA4_INSTR fnmsub, pd, ps, sd, ss
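+; For example, "fmaddps m0, m1, m2, m0" assembles to "vfmaddps m0, m1, m2, m0"
+; with FMA4 and to "vfmadd231ps m0, m1, m2" with FMA3 (dst aliases src3).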
+
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%3
+ %if regnumof%3 >= 16 || sizeof%3 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
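+
+; For example, "vmovdqa m16, m17" is automatically emitted as "vmovdqa32" since
+; registers 16-31 (and 64-byte registers) can only be encoded with EVEX.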
diff --git a/third_party/dav1d/src/fg_apply.h b/third_party/dav1d/src/fg_apply.h
new file mode 100644
index 0000000000..be6685d801
--- /dev/null
+++ b/third_party/dav1d/src/fg_apply.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FG_APPLY_H
+#define DAV1D_SRC_FG_APPLY_H
+
+#include "dav1d/picture.h"
+
+#include "common/bitdepth.h"
+
+#include "src/filmgrain.h"
+
+#ifdef BITDEPTH
+# define array_decl(type, name, sz) type name sz
+#else
+# define array_decl(type, name, sz) void *name
+#endif
+
+bitfn_decls(void dav1d_apply_grain,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in);
+bitfn_decls(void dav1d_prep_grain,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in,
+ array_decl(uint8_t, scaling, [3][SCALING_SIZE]),
+ array_decl(entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]));
+bitfn_decls(void dav1d_apply_grain_row,
+ const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out, const Dav1dPicture *const in,
+ array_decl(const uint8_t, scaling, [3][SCALING_SIZE]),
+ array_decl(const entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]),
+ const int row);
+
+#endif /* DAV1D_SRC_FG_APPLY_H */
diff --git a/third_party/dav1d/src/fg_apply_tmpl.c b/third_party/dav1d/src/fg_apply_tmpl.c
new file mode 100644
index 0000000000..e1b1655f44
--- /dev/null
+++ b/third_party/dav1d/src/fg_apply_tmpl.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "dav1d/common.h"
+#include "dav1d/picture.h"
+
+#include "common/intops.h"
+#include "common/bitdepth.h"
+
+#include "src/fg_apply.h"
+#include "src/ref.h"
+
+static void generate_scaling(const int bitdepth,
+ const uint8_t points[][2], const int num,
+ uint8_t scaling[SCALING_SIZE])
+{
+#if BITDEPTH == 8
+ const int shift_x = 0;
+ const int scaling_size = SCALING_SIZE;
+#else
+ assert(bitdepth > 8);
+ const int shift_x = bitdepth - 8;
+ const int scaling_size = 1 << bitdepth;
+#endif
+
+ if (num == 0) {
+ memset(scaling, 0, scaling_size);
+ return;
+ }
+
+ // Fill up the preceding entries with the initial value
+ memset(scaling, points[0][1], points[0][0] << shift_x);
+
+ // Linearly interpolate the values in the middle
+ for (int i = 0; i < num - 1; i++) {
+ const int bx = points[i][0];
+ const int by = points[i][1];
+ const int ex = points[i+1][0];
+ const int ey = points[i+1][1];
+ const int dx = ex - bx;
+ const int dy = ey - by;
+ assert(dx > 0);
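+        // Interpolate in 16.16 fixed point; d starts at 0.5 (0x8000) so that
+        // (d >> 16) rounds the accumulated value to the nearest integer.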
+ const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+ for (int x = 0, d = 0x8000; x < dx; x++) {
+ scaling[(bx + x) << shift_x] = by + (d >> 16);
+ d += delta;
+ }
+ }
+
+ // Fill up the remaining entries with the final value
+ const int n = points[num - 1][0] << shift_x;
+ memset(&scaling[n], points[num - 1][1], scaling_size - n);
+
+#if BITDEPTH != 8
+ const int pad = 1 << shift_x, rnd = pad >> 1;
+ for (int i = 0; i < num - 1; i++) {
+ const int bx = points[i][0] << shift_x;
+ const int ex = points[i+1][0] << shift_x;
+ const int dx = ex - bx;
+ for (int x = 0; x < dx; x += pad) {
+ const int range = scaling[bx + x + pad] - scaling[bx + x];
+ for (int n = 1, r = rnd; n < pad; n++) {
+ r += range;
+ scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x);
+ }
+ }
+ }
+#endif
+}
+
+#ifndef UNIT_TEST
+void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in,
+ uint8_t scaling[3][SCALING_SIZE],
+ entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH])
+{
+ const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
+#if BITDEPTH != 8
+ const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
+
+ // Generate grain LUTs as needed
+ dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
+ if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
+ dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
+ data, 0 HIGHBD_TAIL_SUFFIX);
+ if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
+ dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
+ data, 1 HIGHBD_TAIL_SUFFIX);
+
+ // Generate scaling LUTs as needed
+ if (data->num_y_points || data->chroma_scaling_from_luma)
+ generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
+ if (data->num_uv_points[0])
+ generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
+ if (data->num_uv_points[1])
+ generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
+
+ // Create new references for the non-modified planes
+ assert(out->stride[0] == in->stride[0]);
+ if (!data->num_y_points) {
+ struct Dav1dRef **out_plane_ref = out->ref->user_data;
+ struct Dav1dRef **in_plane_ref = in->ref->user_data;
+ dav1d_ref_dec(&out_plane_ref[0]);
+ out_plane_ref[0] = in_plane_ref[0];
+ dav1d_ref_inc(out_plane_ref[0]);
+ out->data[0] = in->data[0];
+ }
+
+ if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) {
+ assert(out->stride[1] == in->stride[1]);
+ struct Dav1dRef **out_plane_ref = out->ref->user_data;
+ struct Dav1dRef **in_plane_ref = in->ref->user_data;
+ if (!data->num_uv_points[0]) {
+ dav1d_ref_dec(&out_plane_ref[1]);
+ out_plane_ref[1] = in_plane_ref[1];
+ dav1d_ref_inc(out_plane_ref[1]);
+ out->data[1] = in->data[1];
+ }
+ if (!data->num_uv_points[1]) {
+ dav1d_ref_dec(&out_plane_ref[2]);
+ out_plane_ref[2] = in_plane_ref[2];
+ dav1d_ref_inc(out_plane_ref[2]);
+ out->data[2] = in->data[2];
+ }
+ }
+}
+
+void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in,
+ const uint8_t scaling[3][SCALING_SIZE],
+ const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH],
+ const int row)
+{
+ // Synthesize grain for the affected planes
+ const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
+ const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cpw = (out->p.w + ss_x) >> ss_x;
+ const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
+ pixel *const luma_src =
+ ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
+#if BITDEPTH != 8
+ const int bitdepth_max = (1 << out->p.bpc) - 1;
+#endif
+
+ if (data->num_y_points) {
+ const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
+ dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
+ luma_src, out->stride[0], data,
+ out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
+ }
+
+ if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
+ !data->chroma_scaling_from_luma)
+ {
+ return;
+ }
+
+ const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+
+ // extend padding pixels
+ if (out->p.w & ss_x) {
+ pixel *ptr = luma_src;
+ for (int y = 0; y < bh; y++) {
+ ptr[out->p.w] = ptr[out->p.w - 1];
+ ptr += PXSTRIDE(in->stride[0]) << ss_y;
+ }
+ }
+
+ const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
+ if (data->chroma_scaling_from_luma) {
+ for (int pl = 0; pl < 2; pl++)
+ dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+ ((const pixel *) in->data[1 + pl]) + uv_off,
+ in->stride[1], data, cpw,
+ scaling[0], grain_lut[1 + pl],
+ bh, row, luma_src, in->stride[0],
+ pl, is_id HIGHBD_TAIL_SUFFIX);
+ } else {
+ for (int pl = 0; pl < 2; pl++)
+ if (data->num_uv_points[pl])
+ dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
+ ((const pixel *) in->data[1 + pl]) + uv_off,
+ in->stride[1], data, cpw,
+ scaling[1 + pl], grain_lut[1 + pl],
+ bh, row, luma_src, in->stride[0],
+ pl, is_id HIGHBD_TAIL_SUFFIX);
+ }
+}
+
+void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
+ Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]);
+#if ARCH_X86_64 && BITDEPTH == 8
+ ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]);
+#else
+ uint8_t scaling[3][SCALING_SIZE];
+#endif
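+    // number of BLOCK_SIZE-high (32-pixel) block rows, rounded up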
+ const int rows = (out->p.h + 31) >> 5;
+
+ bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
+ for (int row = 0; row < rows; row++)
+ bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row);
+}
+#endif
diff --git a/third_party/dav1d/src/filmgrain.h b/third_party/dav1d/src/filmgrain.h
new file mode 100644
index 0000000000..a5d6be6d44
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_FILM_GRAIN_H
+#define DAV1D_SRC_FILM_GRAIN_H
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define GRAIN_WIDTH 82
+#define GRAIN_HEIGHT 73
+#define BLOCK_SIZE 32
+#if !defined(BITDEPTH) || BITDEPTH == 8
+#define SCALING_SIZE 256
+typedef int8_t entry;
+#else
+#define SCALING_SIZE 4096
+typedef int16_t entry;
+#endif
+
+#define decl_generate_grain_y_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
+
+#define decl_generate_grain_uv_fn(name) \
+void (name)(entry buf[][GRAIN_WIDTH], \
+ const entry buf_y[][GRAIN_WIDTH], \
+ const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
+typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
+
+#define decl_fgy_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+ const Dav1dFilmGrainData *data, \
+ size_t pw, const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ int bh, int row_num HIGHBD_DECL_SUFFIX)
+typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
+
+#define decl_fguv_32x32xn_fn(name) \
+void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
+ const Dav1dFilmGrainData *data, size_t pw, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
+ const pixel *luma_row, ptrdiff_t luma_stride, \
+ int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
+typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
+
+typedef struct Dav1dFilmGrainDSPContext {
+ generate_grain_y_fn generate_grain_y;
+ generate_grain_uv_fn generate_grain_uv[3];
+
+ fgy_32x32xn_fn fgy_32x32xn;
+ fguv_32x32xn_fn fguv_32x32xn[3];
+} Dav1dFilmGrainDSPContext;
+
+bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
+
+#endif /* DAV1D_SRC_FILM_GRAIN_H */
diff --git a/third_party/dav1d/src/filmgrain_tmpl.c b/third_party/dav1d/src/filmgrain_tmpl.c
new file mode 100644
index 0000000000..0986ac2a58
--- /dev/null
+++ b/third_party/dav1d/src/filmgrain_tmpl.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/filmgrain.h"
+#include "src/tables.h"
+
+#define SUB_GRAIN_WIDTH 44
+#define SUB_GRAIN_HEIGHT 38
+
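+// 16-bit LFSR (taps at bits 0, 1, 3 and 12) used for AV1 film grain synthesis;
+// returns the top 'bits' bits of the updated state.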
+static inline int get_random_number(const int bits, unsigned *const state) {
+ const int r = *state;
+ unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
+ *state = (r >> 1) | (bit << 15);
+
+ return (*state >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+static inline int round2(const int x, const uint64_t shift) {
+ return (x + ((1 << shift) >> 1)) >> shift;
+}
+
+static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data
+ HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ unsigned seed = data->seed;
+ const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ for (int y = 0; y < GRAIN_HEIGHT; y++) {
+ for (int x = 0; x < GRAIN_WIDTH; x++) {
+ const int value = get_random_number(11, &seed);
+ buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+ }
+ }
+
+ const int ar_pad = 3;
+ const int ar_lag = data->ar_coeff_lag;
+
+ for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
+ for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
+ const int8_t *coeff = data->ar_coeffs_y;
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ if (!dx && !dy)
+ break;
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ buf[y][x] = iclip(grain, grain_min, grain_max);
+ }
+ }
+}
+
+static NOINLINE void
+generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
+ const entry buf_y[][GRAIN_WIDTH],
+ const Dav1dFilmGrainData *const data, const intptr_t uv,
+ const int subx, const int suby HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
+ const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
+ const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+ for (int y = 0; y < chromaH; y++) {
+ for (int x = 0; x < chromaW; x++) {
+ const int value = get_random_number(11, &seed);
+ buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
+ }
+ }
+
+ const int ar_pad = 3;
+ const int ar_lag = data->ar_coeff_lag;
+
+ for (int y = ar_pad; y < chromaH; y++) {
+ for (int x = ar_pad; x < chromaW - ar_pad; x++) {
+ const int8_t *coeff = data->ar_coeffs_uv[uv];
+ int sum = 0;
+ for (int dy = -ar_lag; dy <= 0; dy++) {
+ for (int dx = -ar_lag; dx <= ar_lag; dx++) {
+ // For the final (current) pixel, we need to add in the
+ // contribution from the luma grain texture
+ if (!dx && !dy) {
+ if (!data->num_y_points)
+ break;
+ int luma = 0;
+ const int lumaX = ((x - ar_pad) << subx) + ar_pad;
+ const int lumaY = ((y - ar_pad) << suby) + ar_pad;
+ for (int i = 0; i <= suby; i++) {
+ for (int j = 0; j <= subx; j++) {
+ luma += buf_y[lumaY + i][lumaX + j];
+ }
+ }
+ luma = round2(luma, subx + suby);
+ sum += luma * (*coeff);
+ break;
+ }
+
+ sum += *(coeff++) * buf[y + dy][x + dx];
+ }
+ }
+
+ const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
+ buf[y][x] = iclip(grain, grain_min, grain_max);
+ }
+ }
+}
+
+#define gnuv_ss_fn(nm, ss_x, ss_y) \
+static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
+ generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
+}
+
+gnuv_ss_fn(420, 1, 1);
+gnuv_ss_fn(422, 1, 0);
+gnuv_ss_fn(444, 0, 0);
+
+// samples from the correct block of a grain LUT, while taking into account the
+// offsets provided by the offsets cache
+static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
+ const int offsets[2][2], const int subx, const int suby,
+ const int bx, const int by, const int x, const int y)
+{
+ const int randval = offsets[bx][by];
+ const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
+ const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
+ return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
+ [offx + x + (BLOCK_SIZE >> subx) * bx];
+}
+
+static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride,
+ const Dav1dFilmGrainData *const data, const size_t pw,
+ const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH],
+ const int bh, const int row_num HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ int min_value, max_value;
+ if (data->clip_to_restricted_range) {
+ min_value = 16 << bitdepth_min_8;
+ max_value = 235 << bitdepth_min_8;
+ } else {
+ min_value = 0;
+ max_value = BITDEPTH_MAX;
+ }
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in BLOCK_SIZE^2 blocks
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+ const int bw = imin(BLOCK_SIZE, (int) pw - bx);
+
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ // x/y block offsets to compensate for overlapped regions
+ const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0;
+ const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0;
+
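+        // Overlap blend weights (in 1/32 units): position 0 mixes 27 parts old
+        // grain with 17 parts new, position 1 the reverse; see round2(..., 5).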
+ static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
+
+#define add_noise_y(x, y, grain) \
+ const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
+ pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
+ const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
+ *dst = iclip(*src + noise, min_value, max_value);
+
+ for (int y = ystart; y < bh; y++) {
+ // Non-overlapped image region (straightforward)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ add_noise_y(x, y, grain);
+ }
+
+ // Special case for overlapped column
+ for (int x = 0; x < xstart; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+ grain = round2(old * w[x][0] + grain * w[x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+ }
+
+ for (int y = 0; y < ystart; y++) {
+ // Special case for overlapped row (sans corner)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+ grain = round2(old * w[y][0] + grain * w[y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+
+ // Special case for doubly-overlapped corner
+ for (int x = 0; x < xstart; x++) {
+ // Blend the top pixel with the top left block
+ int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+ int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
+ top = round2(old * w[x][0] + top * w[x][1], 5);
+ top = iclip(top, grain_min, grain_max);
+
+ // Blend the current pixel with the left block
+ int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+ old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+ grain = round2(old * w[x][0] + grain * w[x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+
+ // Mix the two rows together and apply grain
+ grain = round2(top * w[y][0] + grain * w[y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_y(x, y, grain);
+ }
+ }
+ }
+}
+
+static NOINLINE void
+fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
+ const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
+ const size_t pw, const uint8_t scaling[SCALING_SIZE],
+ const entry grain_lut[][GRAIN_WIDTH], const int bh,
+ const int row_num, const pixel *const luma_row,
+ const ptrdiff_t luma_stride, const int uv, const int is_id,
+ const int sx, const int sy HIGHBD_DECL_SUFFIX)
+{
+ const int rows = 1 + (data->overlap_flag && row_num > 0);
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int grain_ctr = 128 << bitdepth_min_8;
+ const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
+
+ int min_value, max_value;
+ if (data->clip_to_restricted_range) {
+ min_value = 16 << bitdepth_min_8;
+ max_value = (is_id ? 235 : 240) << bitdepth_min_8;
+ } else {
+ min_value = 0;
+ max_value = BITDEPTH_MAX;
+ }
+
+ // seed[0] contains the current row, seed[1] contains the previous
+ unsigned seed[2];
+ for (int i = 0; i < rows; i++) {
+ seed[i] = data->seed;
+ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
+ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+ }
+
+ assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+
+ int offsets[2 /* col offset */][2 /* row offset */];
+
+ // process this row in BLOCK_SIZE^2 blocks (subsampled)
+ for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
+ const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx));
+ if (data->overlap_flag && bx) {
+ // shift previous offsets left
+ for (int i = 0; i < rows; i++)
+ offsets[1][i] = offsets[0][i];
+ }
+
+ // update current offsets
+ for (int i = 0; i < rows; i++)
+ offsets[0][i] = get_random_number(8, &seed[i]);
+
+ // x/y block offsets to compensate for overlapped regions
+ const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0;
+ const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0;
+
+ static const int w[2 /* sub */][2 /* off */][2] = {
+ { { 27, 17 }, { 17, 27 } },
+ { { 23, 22 } },
+ };
+
+#define add_noise_uv(x, y, grain) \
+ const int lx = (bx + x) << sx; \
+ const int ly = y << sy; \
+ const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
+ pixel avg = luma[0]; \
+ if (sx) \
+ avg = (avg + luma[1] + 1) >> 1; \
+ const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
+ pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
+ int val = avg; \
+ if (!data->chroma_scaling_from_luma) { \
+ const int combined = avg * data->uv_luma_mult[uv] + \
+ *src * data->uv_mult[uv]; \
+ val = iclip_pixel( (combined >> 6) + \
+ (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
+ } \
+ const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
+ *dst = iclip(*src + noise, min_value, max_value);
+
+ for (int y = ystart; y < bh; y++) {
+ // Non-overlapped image region (straightforward)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ add_noise_uv(x, y, grain);
+ }
+
+ // Special case for overlapped column
+ for (int x = 0; x < xstart; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+ grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+ }
+
+ for (int y = 0; y < ystart; y++) {
+ // Special case for overlapped row (sans corner)
+ for (int x = xstart; x < bw; x++) {
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+ grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+
+ // Special case for doubly-overlapped corner
+ for (int x = 0; x < xstart; x++) {
+ // Blend the top pixel with the top left block
+ int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+ int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
+ top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
+ top = iclip(top, grain_min, grain_max);
+
+ // Blend the current pixel with the left block
+ int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+ old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+ grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+
+ // Mix the two rows together and apply to the image
+ grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
+ grain = iclip(grain, grain_min, grain_max);
+ add_noise_uv(x, y, grain);
+ }
+ }
+ }
+}
+
+#define fguv_ss_fn(nm, ss_x, ss_y) \
+static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
+ fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
+ row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+fguv_ss_fn(420, 1, 1);
+fguv_ss_fn(422, 1, 0);
+fguv_ss_fn(444, 0, 0);
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/filmgrain.h"
+#elif ARCH_X86
+#include "src/x86/filmgrain.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
+ c->generate_grain_y = generate_grain_y_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
+
+ c->fgy_32x32xn = fgy_32x32xn_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ film_grain_dsp_init_arm(c);
+#elif ARCH_X86
+ film_grain_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/getbits.c b/third_party/dav1d/src/getbits.c
new file mode 100644
index 0000000000..673070be3d
--- /dev/null
+++ b/third_party/dav1d/src/getbits.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+
+#include "common/intops.h"
+
+#include "src/getbits.h"
+
+void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data,
+ const size_t sz)
+{
+ assert(sz);
+ c->ptr = c->ptr_start = data;
+ c->ptr_end = &c->ptr_start[sz];
+ c->state = 0;
+ c->bits_left = 0;
+ c->error = 0;
+}
+
+unsigned dav1d_get_bit(GetBits *const c) {
+ if (!c->bits_left) {
+ if (c->ptr >= c->ptr_end) {
+ c->error = 1;
+ } else {
+ const unsigned state = *c->ptr++;
+ c->bits_left = 7;
+ c->state = (uint64_t) state << 57;
+ return state >> 7;
+ }
+ }
+
+ const uint64_t state = c->state;
+ c->bits_left--;
+ c->state = state << 1;
+ return (unsigned) (state >> 63);
+}
+
+static inline void refill(GetBits *const c, const int n) {
+ assert(c->bits_left >= 0 && c->bits_left < 32);
+ unsigned state = 0;
+ do {
+ if (c->ptr >= c->ptr_end) {
+ c->error = 1;
+ if (state) break;
+ return;
+ }
+ state = (state << 8) | *c->ptr++;
+ c->bits_left += 8;
+ } while (n > c->bits_left);
+ c->state |= (uint64_t) state << (64 - c->bits_left);
+}
+
+#define GET_BITS(name, type, type64) \
+type name(GetBits *const c, const int n) { \
+ assert(n > 0 && n <= 32); \
+ /* Unsigned cast avoids refill after eob */ \
+ if ((unsigned) n > (unsigned) c->bits_left) \
+ refill(c, n); \
+ const uint64_t state = c->state; \
+ c->bits_left -= n; \
+ c->state = state << n; \
+ return (type) ((type64) state >> (64 - n)); \
+}
+
+GET_BITS(dav1d_get_bits, unsigned, uint64_t)
+GET_BITS(dav1d_get_sbits, int, int64_t)
+
+unsigned dav1d_get_uleb128(GetBits *const c) {
+ uint64_t val = 0;
+ unsigned i = 0, more;
+
+ do {
+ const int v = dav1d_get_bits(c, 8);
+ more = v & 0x80;
+ val |= ((uint64_t) (v & 0x7F)) << i;
+ i += 7;
+ } while (more && i < 56);
+
+ if (val > UINT_MAX || more) {
+ c->error = 1;
+ return 0;
+ }
+
+ return (unsigned) val;
+}
+
+unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) {
+ // Output in range [0..max-1]
+ // max must be > 1, or else nothing is read from the bitstream
+ assert(max > 1);
+ const int l = ulog2(max) + 1;
+ assert(l > 1);
+ const unsigned m = (1U << l) - max;
+ const unsigned v = dav1d_get_bits(c, l - 1);
+ return v < m ? v : (v << 1) - m + dav1d_get_bit(c);
+}
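A worked example of the quasi-uniform read above (illustrative): for max = 5, l = ulog2(5) + 1 = 3 and m = (1 << 3) - 5 = 3, so the initial two-bit value v covers 0..3. A v of 0, 1 or 2 is returned directly (a 2-bit code), while v = 3 consumes one extra bit b and returns (3 << 1) - 3 + b, i.e. 3 or 4 (a 3-bit code). All five symbols 0..4 are therefore reachable, with the shorter codes assigned to the smaller values.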
+
+unsigned dav1d_get_vlc(GetBits *const c) {
+ if (dav1d_get_bit(c))
+ return 0;
+
+ int n_bits = 0;
+ do {
+ if (++n_bits == 32)
+ return 0xFFFFFFFFU;
+ } while (!dav1d_get_bit(c));
+
+ return ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits);
+}
+
+static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref,
+ const unsigned n)
+{
+ unsigned v = 0;
+
+ for (int i = 0;; i++) {
+ const int b = i ? 3 + i - 1 : 3;
+
+ if (n < v + 3 * (1 << b)) {
+ v += dav1d_get_uniform(c, n - v + 1);
+ break;
+ }
+
+ if (!dav1d_get_bit(c)) {
+ v += dav1d_get_bits(c, b);
+ break;
+ }
+
+ v += 1 << b;
+ }
+
+ return ref * 2 <= n ? inv_recenter(ref, v) : n - inv_recenter(n - ref, v);
+}
+
+int dav1d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) {
+ return (int) get_bits_subexp_u(c, ref + (1 << n), 2 << n) - (1 << n);
+}
+
+void dav1d_bytealign_get_bits(GetBits *c) {
+ // bits_left is never more than 7, because it is only incremented
+ // by refill(), which is called by dav1d_get_bits() and never reads
+ // more than 7 bits beyond what it needs.
+ //
+ // If this wasn't true, we would need to work out how many bits to
+ // discard (bits_left % 8), subtract that from bits_left and then
+ // shift state right by that amount.
+ assert(c->bits_left <= 7);
+
+ c->bits_left = 0;
+ c->state = 0;
+}
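A minimal usage sketch of this reader (illustrative; it assumes an in-tree build where "src/getbits.h" resolves and getbits.c is linked in, and the byte values are arbitrary). Bits are consumed MSB-first, since dav1d_get_bit() returns the top bit of the refilled state:

    #include <stdio.h>

    #include "src/getbits.h"

    int main(void) {
        /* 0xB4 = 1011 0100b, followed by a two-byte LEB128 value. */
        static const uint8_t buf[] = { 0xB4, 0x96, 0x01 };
        GetBits gb;
        dav1d_init_get_bits(&gb, buf, sizeof(buf));

        printf("%u\n", dav1d_get_bit(&gb));      /* 1   (top bit of 0xB4) */
        printf("%u\n", dav1d_get_bits(&gb, 3));  /* 3   (011b) */
        printf("%u\n", dav1d_get_bits(&gb, 4));  /* 4   (0100b) */
        /* Byte-aligned again: 0x96 0x01 decodes as LEB128 22 + (1 << 7) = 150. */
        printf("%u\n", dav1d_get_uleb128(&gb));  /* 150 */
        printf("error=%d pos=%u\n", gb.error, dav1d_get_bits_pos(&gb)); /* error=0 pos=24 */
        return 0;
    }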
diff --git a/third_party/dav1d/src/getbits.h b/third_party/dav1d/src/getbits.h
new file mode 100644
index 0000000000..57b80dc714
--- /dev/null
+++ b/third_party/dav1d/src/getbits.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_GETBITS_H
+#define DAV1D_SRC_GETBITS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct GetBits {
+ uint64_t state;
+ int bits_left, error;
+ const uint8_t *ptr, *ptr_start, *ptr_end;
+} GetBits;
+
+void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz);
+unsigned dav1d_get_bit(GetBits *c);
+unsigned dav1d_get_bits(GetBits *c, int n);
+int dav1d_get_sbits(GetBits *c, int n);
+unsigned dav1d_get_uleb128(GetBits *c);
+
+// Output in range 0..max-1
+unsigned dav1d_get_uniform(GetBits *c, unsigned max);
+unsigned dav1d_get_vlc(GetBits *c);
+int dav1d_get_bits_subexp(GetBits *c, int ref, unsigned n);
+
+// Discard bits from the buffer until we're next byte-aligned.
+void dav1d_bytealign_get_bits(GetBits *c);
+
+// Return the current bit position relative to the start of the buffer.
+static inline unsigned dav1d_get_bits_pos(const GetBits *c) {
+ return (unsigned) (c->ptr - c->ptr_start) * 8 - c->bits_left;
+}
+
+#endif /* DAV1D_SRC_GETBITS_H */
diff --git a/third_party/dav1d/src/internal.h b/third_party/dav1d/src/internal.h
new file mode 100644
index 0000000000..b5fd1e18ef
--- /dev/null
+++ b/third_party/dav1d/src/internal.h
@@ -0,0 +1,467 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_INTERNAL_H
+#define DAV1D_SRC_INTERNAL_H
+
+#include <stdatomic.h>
+
+#include "dav1d/data.h"
+
+typedef struct Dav1dFrameContext Dav1dFrameContext;
+typedef struct Dav1dTileState Dav1dTileState;
+typedef struct Dav1dTaskContext Dav1dTaskContext;
+typedef struct Dav1dTask Dav1dTask;
+
+#include "common/attributes.h"
+
+#include "src/cdef.h"
+#include "src/cdf.h"
+#include "src/data.h"
+#include "src/env.h"
+#include "src/filmgrain.h"
+#include "src/intra_edge.h"
+#include "src/ipred.h"
+#include "src/itx.h"
+#include "src/levels.h"
+#include "src/lf_mask.h"
+#include "src/loopfilter.h"
+#include "src/looprestoration.h"
+#include "src/mc.h"
+#include "src/msac.h"
+#include "src/picture.h"
+#include "src/recon.h"
+#include "src/refmvs.h"
+#include "src/thread.h"
+
+typedef struct Dav1dDSPContext {
+ Dav1dFilmGrainDSPContext fg;
+ Dav1dIntraPredDSPContext ipred;
+ Dav1dMCDSPContext mc;
+ Dav1dInvTxfmDSPContext itx;
+ Dav1dLoopFilterDSPContext lf;
+ Dav1dCdefDSPContext cdef;
+ Dav1dLoopRestorationDSPContext lr;
+} Dav1dDSPContext;
+
+struct Dav1dTileGroup {
+ Dav1dData data;
+ int start, end;
+};
+
+enum TaskType {
+ DAV1D_TASK_TYPE_INIT,
+ DAV1D_TASK_TYPE_INIT_CDF,
+ DAV1D_TASK_TYPE_TILE_ENTROPY,
+ DAV1D_TASK_TYPE_ENTROPY_PROGRESS,
+ DAV1D_TASK_TYPE_TILE_RECONSTRUCTION,
+ DAV1D_TASK_TYPE_DEBLOCK_COLS,
+ DAV1D_TASK_TYPE_DEBLOCK_ROWS,
+ DAV1D_TASK_TYPE_CDEF,
+ DAV1D_TASK_TYPE_SUPER_RESOLUTION,
+ DAV1D_TASK_TYPE_LOOP_RESTORATION,
+ DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
+ DAV1D_TASK_TYPE_FG_PREP,
+ DAV1D_TASK_TYPE_FG_APPLY,
+};
+
+struct Dav1dContext {
+ Dav1dFrameContext *fc;
+ unsigned n_fc;
+
+ Dav1dTaskContext *tc;
+ unsigned n_tc;
+
+ // cache of OBUs that make up a single frame before we submit them
+ // to a frame worker to be decoded
+ struct Dav1dTileGroup *tile;
+ int n_tile_data_alloc;
+ int n_tile_data;
+ int n_tiles;
+ Dav1dMemPool *seq_hdr_pool;
+ Dav1dRef *seq_hdr_ref;
+ Dav1dSequenceHeader *seq_hdr;
+ Dav1dMemPool *frame_hdr_pool;
+ Dav1dRef *frame_hdr_ref;
+ Dav1dFrameHeader *frame_hdr;
+
+ Dav1dRef *content_light_ref;
+ Dav1dContentLightLevel *content_light;
+ Dav1dRef *mastering_display_ref;
+ Dav1dMasteringDisplay *mastering_display;
+ Dav1dRef *itut_t35_ref;
+ Dav1dITUTT35 *itut_t35;
+
+ // decoded output picture queue
+ Dav1dData in;
+ Dav1dThreadPicture out, cache;
+ // flush is a pointer (to flush_mem) to prevent compiler errors about
+ // atomic_load() not taking const arguments
+ atomic_int flush_mem, *flush;
+ struct {
+ Dav1dThreadPicture *out_delayed;
+ unsigned next;
+ } frame_thread;
+
+ // task threading (refer to tc[] for per-thread state)
+ struct TaskThreadData {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ atomic_uint first;
+ unsigned cur;
+ // This is used for delayed reset of the task cur pointer when
+ // such an operation is needed but the thread doesn't enter a critical
+ // section (typically when executing the next sbrow task locklessly).
+ // See src/thread_task.c:reset_task_cur().
+ atomic_uint reset_task_cur;
+ atomic_int cond_signaled;
+ struct {
+ int exec;
+ pthread_cond_t cond;
+ const Dav1dPicture *in;
+ Dav1dPicture *out;
+ enum TaskType type;
+ atomic_int progress[2]; /* [0]=started, [1]=completed */
+ union {
+ struct {
+ ALIGN(int8_t grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+ ALIGN(uint8_t scaling_8bpc[3][256], 64);
+ };
+ struct {
+ ALIGN(int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
+ ALIGN(uint8_t scaling_16bpc[3][4096], 64);
+ };
+ };
+ } delayed_fg;
+ int inited;
+ } task_thread;
+
+ // reference/entropy state
+ Dav1dMemPool *segmap_pool;
+ Dav1dMemPool *refmvs_pool;
+ struct {
+ Dav1dThreadPicture p;
+ Dav1dRef *segmap;
+ Dav1dRef *refmvs;
+ unsigned refpoc[7];
+ } refs[8];
+ Dav1dMemPool *cdf_pool;
+ CdfThreadContext cdf[8];
+
+ Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
+ Dav1dRefmvsDSPContext refmvs_dsp;
+
+ // tree to keep track of which edges are available
+ struct {
+ EdgeNode *root[2 /* BL_128X128 vs. BL_64X64 */];
+ EdgeBranch branch_sb128[1 + 4 + 16 + 64];
+ EdgeBranch branch_sb64[1 + 4 + 16];
+ EdgeTip tip_sb128[256];
+ EdgeTip tip_sb64[64];
+ } intra_edge;
+
+ Dav1dPicAllocator allocator;
+ int apply_grain;
+ int operating_point;
+ unsigned operating_point_idc;
+ int all_layers;
+ int max_spatial_id;
+ unsigned frame_size_limit;
+ int strict_std_compliance;
+ int output_invisible_frames;
+ enum Dav1dInloopFilterType inloop_filters;
+ enum Dav1dDecodeFrameType decode_frame_type;
+ int drain;
+ enum PictureFlags frame_flags;
+ enum Dav1dEventFlags event_flags;
+ Dav1dDataProps cached_error_props;
+ int cached_error;
+
+ Dav1dLogger logger;
+
+ Dav1dMemPool *picture_pool;
+};
+
+struct Dav1dTask {
+ unsigned frame_idx; // frame thread id
+ enum TaskType type; // task work
+ int sby; // sbrow
+
+ // task dependencies
+ int recon_progress, deblock_progress;
+ int deps_skip;
+ struct Dav1dTask *next; // only used in task queue
+};
+
+struct Dav1dFrameContext {
+ Dav1dRef *seq_hdr_ref;
+ Dav1dSequenceHeader *seq_hdr;
+ Dav1dRef *frame_hdr_ref;
+ Dav1dFrameHeader *frame_hdr;
+ Dav1dThreadPicture refp[7];
+ Dav1dPicture cur; // during block coding / reconstruction
+ Dav1dThreadPicture sr_cur; // after super-resolution upscaling
+ Dav1dRef *mvs_ref;
+ refmvs_temporal_block *mvs, *ref_mvs[7];
+ Dav1dRef *ref_mvs_ref[7];
+ Dav1dRef *cur_segmap_ref, *prev_segmap_ref;
+ uint8_t *cur_segmap;
+ const uint8_t *prev_segmap;
+ unsigned refpoc[7], refrefpoc[7][7];
+ uint8_t gmv_warp_allowed[7];
+ CdfThreadContext in_cdf, out_cdf;
+ struct Dav1dTileGroup *tile;
+ int n_tile_data_alloc;
+ int n_tile_data;
+
+ // for scalable references
+ struct ScalableMotionParams {
+ int scale; // if no scaling, this is 0
+ int step;
+ } svc[7][2 /* x, y */];
+ int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */];
+
+ const Dav1dContext *c;
+ Dav1dTileState *ts;
+ int n_ts;
+ const Dav1dDSPContext *dsp;
+ struct {
+ recon_b_intra_fn recon_b_intra;
+ recon_b_inter_fn recon_b_inter;
+ filter_sbrow_fn filter_sbrow;
+ filter_sbrow_fn filter_sbrow_deblock_cols;
+ filter_sbrow_fn filter_sbrow_deblock_rows;
+ void (*filter_sbrow_cdef)(Dav1dTaskContext *tc, int sby);
+ filter_sbrow_fn filter_sbrow_resize;
+ filter_sbrow_fn filter_sbrow_lr;
+ backup_ipred_edge_fn backup_ipred_edge;
+ read_coef_blocks_fn read_coef_blocks;
+ } bd_fn;
+
+ int ipred_edge_sz;
+ pixel *ipred_edge[3];
+ ptrdiff_t b4_stride;
+ int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
+ uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+ const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */];
+ BlockContext *a;
+ int a_sz /* w*tile_rows */;
+ refmvs_frame rf;
+ uint8_t jnt_weights[7][7];
+ int bitdepth_max;
+
+ struct {
+ int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
+ atomic_int entropy_progress;
+ atomic_int deblock_progress; // in sby units
+ atomic_uint *frame_progress, *copy_lpf_progress;
+ // indexed using t->by * f->b4_stride + t->bx
+ Av1Block *b;
+ struct CodedBlockInfo {
+ int16_t eob[3 /* plane */];
+ uint8_t txtp[3 /* plane */];
+ } *cbi;
+ // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
+ uint16_t (*pal)[3 /* plane */][8 /* idx */];
+ // iterated over inside tile state
+ uint8_t *pal_idx;
+ coef *cf;
+ int prog_sz;
+ int pal_sz, pal_idx_sz, cf_sz;
+ // start offsets per tile
+ int *tile_start_off;
+ } frame_thread;
+
+ // loopfilter
+ struct {
+ uint8_t (*level)[4];
+ Av1Filter *mask;
+ Av1Restoration *lr_mask;
+ int mask_sz /* w*h */, lr_mask_sz;
+ int cdef_buf_plane_sz[2]; /* stride*sbh*4 */
+ int cdef_buf_sbh;
+ int lr_buf_plane_sz[2]; /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */
+ int re_sz /* h */;
+ ALIGN(Av1FilterLUT lim_lut, 16);
+ int last_sharpness;
+ uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+ uint8_t *tx_lpf_right_edge[2];
+ uint8_t *cdef_line_buf, *lr_line_buf;
+ pixel *cdef_line[2 /* pre, post */][3 /* plane */];
+ pixel *cdef_lpf_line[3 /* plane */];
+ pixel *lr_lpf_line[3 /* plane */];
+
+ // in-loop filter per-frame state keeping
+ uint8_t *start_of_tile_row;
+ int start_of_tile_row_sz;
+ int need_cdef_lpf_copy;
+ pixel *p[3], *sr_p[3];
+ Av1Filter *mask_ptr, *prev_mask_ptr;
+ int restore_planes; // enum LrRestorePlanes
+ } lf;
+
+ struct {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ struct TaskThreadData *ttd;
+ struct Dav1dTask *tasks, *tile_tasks[2], init_task;
+ int num_tasks, num_tile_tasks;
+ atomic_int init_done;
+ atomic_int done[2];
+ int retval;
+ int update_set; // whether we need to update CDF reference
+ atomic_int error;
+ atomic_int task_counter;
+ struct Dav1dTask *task_head, *task_tail;
+ // Points to the task directly before the cur pointer in the queue.
+ // This cur pointer is theoretical here, we actually keep track of the
+ // "prev_t" variable. This is needed to not lose the tasks in
+ // [head;cur-1] when picking one for execution.
+ struct Dav1dTask *task_cur_prev;
+ struct { // async task insertion
+ atomic_int merge;
+ pthread_mutex_t lock;
+ Dav1dTask *head, *tail;
+ } pending_tasks;
+ } task_thread;
+
+ // threading (refer to tc[] for per-thread things)
+ struct FrameTileThreadData {
+ int (*lowest_pixel_mem)[7][2];
+ int lowest_pixel_mem_sz;
+ } tile_thread;
+};
+
+struct Dav1dTileState {
+ CdfContext cdf;
+ MsacContext msac;
+
+ struct {
+ int col_start, col_end, row_start, row_end; // in 4px units
+ int col, row; // in tile units
+ } tiling;
+
+ // in sby units, TILE_ERROR after a decoding error
+ atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
+ struct {
+ uint8_t *pal_idx;
+ coef *cf;
+ } frame_thread[2 /* 0: reconstruction, 1: entropy */];
+
+ // in fullpel units, [0] = Y, [1] = UV, used for progress requirements
+ // each entry is one tile-sbrow; middle index is refidx
+ int (*lowest_pixel)[7][2];
+
+ uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+ const uint16_t (*dq)[3][2];
+ int last_qidx;
+
+ int8_t last_delta_lf[4];
+ uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+ const uint8_t (*lflvl)[4][8][2];
+
+ Av1RestorationUnit *lr_ref[3];
+};
+
+struct Dav1dTaskContext {
+ const Dav1dContext *c;
+ const Dav1dFrameContext *f;
+ Dav1dTileState *ts;
+ int bx, by;
+ BlockContext l, *a;
+ refmvs_tile rt;
+ ALIGN(union, 64) {
+ int16_t cf_8bpc [32 * 32];
+ int32_t cf_16bpc[32 * 32];
+ };
+ // FIXME types can be changed to pixel (and dynamically allocated)
+ // which would make copy/assign operations slightly faster?
+ uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
+ uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
+ uint8_t txtp_map[32 * 32]; // inter-only
+ ALIGN(union, 64) {
+ struct {
+ union {
+ uint8_t lap_8bpc [128 * 32];
+ uint16_t lap_16bpc[128 * 32];
+ struct {
+ int16_t compinter[2][128 * 128];
+ uint8_t seg_mask[128 * 128];
+ };
+ };
+ union {
+ // stride=192 for non-SVC, or 320 for SVC
+ uint8_t emu_edge_8bpc [320 * (256 + 7)];
+ uint16_t emu_edge_16bpc[320 * (256 + 7)];
+ };
+ };
+ struct {
+ union {
+ uint8_t levels[32 * 34];
+ struct {
+ uint8_t pal_order[64][8];
+ uint8_t pal_ctx[64];
+ };
+ };
+ int16_t ac[32 * 32];
+ uint8_t pal_idx[2 * 64 * 64];
+ uint16_t pal[3 /* plane */][8 /* palette_idx */];
+ ALIGN(union, 64) {
+ struct {
+ uint8_t interintra_8bpc[64 * 64];
+ uint8_t edge_8bpc[257];
+ };
+ struct {
+ uint16_t interintra_16bpc[64 * 64];
+ uint16_t edge_16bpc[257];
+ };
+ };
+ };
+ } scratch;
+
+ Dav1dWarpedMotionParams warpmv;
+ Av1Filter *lf_mask;
+ int top_pre_cdef_toggle;
+ int8_t *cur_sb_cdef_idx_ptr;
+ // for chroma sub8x8, we need to know the filter for all 4 subblocks in
+ // a 4x4 area, but the top/left one can go out of cache already, so this
+ // keeps it accessible
+ enum Filter2d tl_4x4_filter;
+
+ struct {
+ int pass;
+ } frame_thread;
+ struct {
+ struct thread_data td;
+ struct TaskThreadData *ttd;
+ struct FrameTileThreadData *fttd;
+ int flushed;
+ int die;
+ } task_thread;
+};
+
+#endif /* DAV1D_SRC_INTERNAL_H */
diff --git a/third_party/dav1d/src/intra_edge.c b/third_party/dav1d/src/intra_edge.c
new file mode 100644
index 0000000000..684d113fa9
--- /dev/null
+++ b/third_party/dav1d/src/intra_edge.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+struct ModeSelMem {
+ EdgeBranch *nwc[3 /* 64x64, 32x32, 16x16 */];
+ EdgeTip *nt;
+};
+
+static void init_edges(EdgeNode *const node,
+ const enum BlockLevel bl,
+ const enum EdgeFlags edge_flags)
+{
+ node->o = edge_flags;
+
+#define ALL_FL(t) (EDGE_I444_##t | EDGE_I422_##t | EDGE_I420_##t)
+ if (bl == BL_8X8) {
+ EdgeTip *const nt = (EdgeTip *) node;
+
+ node->h[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+ node->h[1] = edge_flags & (ALL_FL(LEFT_HAS_BOTTOM) |
+ EDGE_I420_TOP_HAS_RIGHT);
+
+ node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+ node->v[1] = edge_flags & (ALL_FL(TOP_HAS_RIGHT) |
+ EDGE_I420_LEFT_HAS_BOTTOM |
+ EDGE_I422_LEFT_HAS_BOTTOM);
+
+ nt->split[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+ nt->split[1] = (edge_flags & ALL_FL(TOP_HAS_RIGHT)) |
+ EDGE_I422_LEFT_HAS_BOTTOM;
+ nt->split[2] = edge_flags | EDGE_I444_TOP_HAS_RIGHT;
+ nt->split[3] = edge_flags & (EDGE_I420_TOP_HAS_RIGHT |
+ EDGE_I420_LEFT_HAS_BOTTOM |
+ EDGE_I422_LEFT_HAS_BOTTOM);
+ } else {
+ EdgeBranch *const nwc = (EdgeBranch *) node;
+
+ node->h[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+ node->h[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+
+ node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+ node->v[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+
+ nwc->h4[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+ nwc->h4[1] =
+ nwc->h4[2] = ALL_FL(LEFT_HAS_BOTTOM);
+ nwc->h4[3] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+ if (bl == BL_16X16)
+ nwc->h4[1] |= edge_flags & EDGE_I420_TOP_HAS_RIGHT;
+
+ nwc->v4[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+ nwc->v4[1] =
+ nwc->v4[2] = ALL_FL(TOP_HAS_RIGHT);
+ nwc->v4[3] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+ if (bl == BL_16X16)
+ nwc->v4[1] |= edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM |
+ EDGE_I422_LEFT_HAS_BOTTOM);
+
+ nwc->tls[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+ nwc->tls[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+ nwc->tls[2] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+
+ nwc->trs[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+ nwc->trs[1] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+ nwc->trs[2] = 0;
+
+ nwc->tts[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM);
+ nwc->tts[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT);
+ nwc->tts[2] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM);
+
+ nwc->tbs[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM);
+ nwc->tbs[1] = edge_flags | ALL_FL(TOP_HAS_RIGHT);
+ nwc->tbs[2] = 0;
+ }
+}
+
+static void init_mode_node(EdgeBranch *const nwc,
+ const enum BlockLevel bl,
+ struct ModeSelMem *const mem,
+ const int top_has_right,
+ const int left_has_bottom)
+{
+ init_edges(&nwc->node, bl,
+ (top_has_right ? ALL_FL(TOP_HAS_RIGHT) : 0) |
+ (left_has_bottom ? ALL_FL(LEFT_HAS_BOTTOM) : 0));
+ if (bl == BL_16X16) {
+ for (int n = 0; n < 4; n++) {
+ EdgeTip *const nt = mem->nt++;
+ nwc->split[n] = &nt->node;
+ init_edges(&nt->node, bl + 1,
+ ((n == 3 || (n == 1 && !top_has_right)) ? 0 :
+ ALL_FL(TOP_HAS_RIGHT)) |
+ (!(n == 0 || (n == 2 && left_has_bottom)) ? 0 :
+ ALL_FL(LEFT_HAS_BOTTOM)));
+ }
+ } else {
+ for (int n = 0; n < 4; n++) {
+ EdgeBranch *const nwc_child = mem->nwc[bl]++;
+ nwc->split[n] = &nwc_child->node;
+ init_mode_node(nwc_child, bl + 1, mem,
+ !(n == 3 || (n == 1 && !top_has_right)),
+ n == 0 || (n == 2 && left_has_bottom));
+ }
+ }
+}
+
+void dav1d_init_mode_tree(EdgeNode *const root_node, EdgeTip *const nt,
+ const int allow_sb128)
+{
+ EdgeBranch *const root = (EdgeBranch *) root_node;
+ struct ModeSelMem mem;
+ mem.nt = nt;
+
+ if (allow_sb128) {
+ mem.nwc[BL_128X128] = &root[1];
+ mem.nwc[BL_64X64] = &root[1 + 4];
+ mem.nwc[BL_32X32] = &root[1 + 4 + 16];
+ init_mode_node(root, BL_128X128, &mem, 1, 0);
+ assert(mem.nwc[BL_128X128] == &root[1 + 4]);
+ assert(mem.nwc[BL_64X64] == &root[1 + 4 + 16]);
+ assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16 + 64]);
+ assert(mem.nt == &nt[256]);
+ } else {
+ mem.nwc[BL_128X128] = NULL;
+ mem.nwc[BL_64X64] = &root[1];
+ mem.nwc[BL_32X32] = &root[1 + 4];
+ init_mode_node(root, BL_64X64, &mem, 1, 0);
+ assert(mem.nwc[BL_64X64] == &root[1 + 4]);
+ assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16]);
+ assert(mem.nt == &nt[64]);
+ }
+}
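The array bounds asserted above follow from the 4-way split at each block level; a small count check (illustrative only, mirroring the branch_sb128/tip_sb128 sizes declared in internal.h):

    #include <stdio.h>

    int main(void) {
        /* One EdgeBranch per node above 8x8: 128x128 (1) + 64x64 (4) + 32x32 (16)
         * + 16x16 (64) = 85 branches; each 16x16 branch then owns 4 EdgeTips. */
        int branches = 0, nodes = 1;
        for (int level = 0; level < 4; level++) { /* BL_128X128 .. BL_16X16 */
            branches += nodes;
            nodes *= 4;
        }
        printf("branches=%d tips=%d\n", branches, nodes); /* 85 (= 1+4+16+64), 256 */
        return 0;
    }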
diff --git a/third_party/dav1d/src/intra_edge.h b/third_party/dav1d/src/intra_edge.h
new file mode 100644
index 0000000000..8b4e150181
--- /dev/null
+++ b/third_party/dav1d/src/intra_edge.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_INTRA_EDGE_H
+#define DAV1D_SRC_INTRA_EDGE_H
+
+enum EdgeFlags {
+ EDGE_I444_TOP_HAS_RIGHT = 1 << 0,
+ EDGE_I422_TOP_HAS_RIGHT = 1 << 1,
+ EDGE_I420_TOP_HAS_RIGHT = 1 << 2,
+ EDGE_I444_LEFT_HAS_BOTTOM = 1 << 3,
+ EDGE_I422_LEFT_HAS_BOTTOM = 1 << 4,
+ EDGE_I420_LEFT_HAS_BOTTOM = 1 << 5,
+};
+
+typedef struct EdgeNode EdgeNode;
+struct EdgeNode {
+ enum EdgeFlags o, h[2], v[2];
+};
+typedef struct EdgeTip {
+ EdgeNode node;
+ enum EdgeFlags split[4];
+} EdgeTip;
+typedef struct EdgeBranch {
+ EdgeNode node;
+ enum EdgeFlags tts[3], tbs[3], tls[3], trs[3], h4[4], v4[4];
+ EdgeNode *split[4];
+} EdgeBranch;
+
+void dav1d_init_mode_tree(EdgeNode *const root, EdgeTip *const nt,
+ const int allow_sb128);
+
+#endif /* DAV1D_SRC_INTRA_EDGE_H */
diff --git a/third_party/dav1d/src/ipred.h b/third_party/dav1d/src/ipred.h
new file mode 100644
index 0000000000..739ef1a266
--- /dev/null
+++ b/third_party/dav1d/src/ipred.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_H
+#define DAV1D_SRC_IPRED_H
+
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+/*
+ * Intra prediction.
+ * - a is the angle (in degrees) for directional intra predictors. For other
+ * modes, it is ignored;
+ * - topleft is the same as the argument given to dav1d_prepare_intra_edges(),
+ * see ipred_prepare.h for more detailed documentation.
+ */
+#define decl_angular_ipred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+ int width, int height, int angle, int max_width, int max_height \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_angular_ipred_fn(*angular_ipred_fn);
+
+/*
+ * Create a subsampled Y plane with the DC subtracted.
+ * - w/h_pad is the edge of the width/height that extends outside the visible
+ * portion of the frame in 4px units;
+ * - ac has a stride of 16.
+ */
+#define decl_cfl_ac_fn(name) \
+void (name)(int16_t *ac, const pixel *y, ptrdiff_t stride, \
+ int w_pad, int h_pad, int cw, int ch)
+typedef decl_cfl_ac_fn(*cfl_ac_fn);
+
+/*
+ * dst[x,y] += alpha * ac[x,y]
+ * - alpha contains a q3 scalar in [-16,16] range;
+ */
+#define decl_cfl_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+ int width, int height, const int16_t *ac, int alpha \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_cfl_pred_fn(*cfl_pred_fn);
+
+/*
+ * dst[x,y] = pal[idx[x,y]]
+ * - palette indices are [0-7]
+ * - only 16-byte alignment is guaranteed for idx.
+ */
+#define decl_pal_pred_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
+ const uint8_t *idx, int w, int h)
+typedef decl_pal_pred_fn(*pal_pred_fn);
+
+typedef struct Dav1dIntraPredDSPContext {
+ angular_ipred_fn intra_pred[N_IMPL_INTRA_PRED_MODES];
+
+ // chroma-from-luma
+ cfl_ac_fn cfl_ac[3 /* 420, 422, 444 */];
+ cfl_pred_fn cfl_pred[DC_128_PRED + 1];
+
+ // palette
+ pal_pred_fn pal_pred;
+} Dav1dIntraPredDSPContext;
+
+bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
+
+#endif /* DAV1D_SRC_IPRED_H */
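As a reading aid, the pal_pred contract documented above is a per-pixel palette lookup. A minimal 8-bit sketch, under the assumption of one index byte per pixel and an idx row stride of w (names and layout here are illustrative, not the dav1d implementation):

    #include <stddef.h>
    #include <stdint.h>

    /* dst[x,y] = pal[idx[x,y]] for 8-bit pixels; idx holds one palette index
     * (0..7) per pixel and advances by w per row. */
    void pal_pred_sketch(uint8_t *dst, const ptrdiff_t stride,
                         const uint16_t *const pal, const uint8_t *idx,
                         const int w, const int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = (uint8_t) pal[idx[x]];
            idx += w;
            dst += stride;
        }
    }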
diff --git a/third_party/dav1d/src/ipred_prepare.h b/third_party/dav1d/src/ipred_prepare.h
new file mode 100644
index 0000000000..6a7efeb3d7
--- /dev/null
+++ b/third_party/dav1d/src/ipred_prepare.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_IPRED_PREPARE_H
+#define DAV1D_SRC_IPRED_PREPARE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/env.h"
+#include "src/intra_edge.h"
+#include "src/levels.h"
+
+/*
+ * Luma intra edge preparation.
+ *
+ * x/y/start/w/h are in luma block (4px) units:
+ * - x and y are the absolute block positions in the image;
+ * - start/w/h are the *dependent tile* boundary positions. In practice, start
+ * is the horizontal tile start, w is the horizontal tile end, the vertical
+ * tile start is assumed to be 0 and h is the vertical image end.
+ *
+ * edge_flags signals which edges are available for this transform-block inside
+ * the given partition, as well as for the partition inside the superblock
+ * structure.
+ *
+ * dst and stride are pointers to the top/left position of the current block,
+ * and can be used to locate the top, left, top/left, top/right and bottom/left
+ * edge pointers also.
+ *
+ * angle is the angle_delta [-3..3] on input, and the absolute angle on output.
+ *
+ * mode is the intra prediction mode as coded in the bitstream. The return value
+ * is this same mode, converted to an index in the DSP functions.
+ *
+ * tw/th are the size of the transform block in block (4px) units.
+ *
+ * topleft_out is a pointer to scratch memory that will be filled with the edge
+ * pixels. The memory array should have space to be indexed in the [-2*w,2*w]
+ * range, in the following order:
+ * - [0] will be the top/left edge pixel;
+ * - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
+ * - [w+1..2*w] will be the top/right edge pixels;
+ * - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
+ * most);
+ * - [-w-1..-2*w] will be the bottom/left edge pixels.
+ * Each edge may remain uninitialized if it is not used by the returned mode
+ * index. If edges are not available (because the edge position is outside the
+ * tile dimensions or because edge_flags indicates lack of edge availability),
+ * they will be extended from nearby edges as defined by the av1 spec.
+ */
+enum IntraPredMode
+ bytefn(dav1d_prepare_intra_edges)(int x, int have_left, int y, int have_top,
+ int w, int h, enum EdgeFlags edge_flags,
+ const pixel *dst, ptrdiff_t stride,
+ const pixel *prefilter_toplevel_sb_edge,
+ enum IntraPredMode mode, int *angle,
+ int tw, int th, int filter_edge,
+ pixel *topleft_out HIGHBD_DECL_SUFFIX);
+
+// These flags are OR'd into the angle argument passed to the intra predictors.
+// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
+// with a filter before using them to predict values in a block.
+// ANGLE_SMOOTH_EDGE_FLAG means that edges are smooth and should use
+// reduced filter strength.
+#define ANGLE_USE_EDGE_FILTER_FLAG 1024
+#define ANGLE_SMOOTH_EDGE_FLAG 512
+
+static inline int sm_flag(const BlockContext *const b, const int idx) {
+ if (!b->intra[idx]) return 0;
+ const enum IntraPredMode m = b->mode[idx];
+ return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+ m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+static inline int sm_uv_flag(const BlockContext *const b, const int idx) {
+ const enum IntraPredMode m = b->uvmode[idx];
+ return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
+ m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
+}
+
+#endif /* DAV1D_SRC_IPRED_PREPARE_H */
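The topleft_out contract above implies a scratch buffer that is symmetric around the top-left sample. A sketch of how a caller could size it for blocks up to 64 pixels per side (8 bpc shown; illustrative, though the resulting 257-entry figure matches the edge_8bpc[257] scratch in internal.h):

    #include <stdint.h>

    /* Indices [-2*64 .. 2*64] around "topleft" stay inside the buffer. */
    enum { MAX_EDGE_PX = 64 };
    static uint8_t edge_buf[4 * MAX_EDGE_PX + 1];               /* 257 entries */
    static uint8_t *const topleft = &edge_buf[2 * MAX_EDGE_PX]; /* index 0 of the layout */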
diff --git a/third_party/dav1d/src/ipred_prepare_tmpl.c b/third_party/dav1d/src/ipred_prepare_tmpl.c
new file mode 100644
index 0000000000..0bf9de9418
--- /dev/null
+++ b/third_party/dav1d/src/ipred_prepare_tmpl.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ipred_prepare.h"
+
+static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
+ [2 /* have_left */][2 /* have_top */] =
+{
+ [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
+ { LEFT_DC_PRED, DC_PRED } },
+ [PAETH_PRED] = { { DC_128_PRED, VERT_PRED },
+ { HOR_PRED, PAETH_PRED } },
+};
+
+static const uint8_t av1_mode_to_angle_map[8] = {
+ 90, 180, 45, 135, 113, 157, 203, 67
+};
+
+static const struct {
+ uint8_t needs_left:1;
+ uint8_t needs_top:1;
+ uint8_t needs_topleft:1;
+ uint8_t needs_topright:1;
+ uint8_t needs_bottomleft:1;
+} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
+ [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
+ [VERT_PRED] = { .needs_top = 1 },
+ [HOR_PRED] = { .needs_left = 1 },
+ [LEFT_DC_PRED] = { .needs_left = 1 },
+ [TOP_DC_PRED] = { .needs_top = 1 },
+ [DC_128_PRED] = { 0 },
+ [Z1_PRED] = { .needs_top = 1, .needs_topright = 1,
+ .needs_topleft = 1 },
+ [Z2_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+ [Z3_PRED] = { .needs_left = 1, .needs_bottomleft = 1,
+ .needs_topleft = 1 },
+ [SMOOTH_PRED] = { .needs_left = 1, .needs_top = 1 },
+ [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
+ [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
+ [PAETH_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+ [FILTER_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+};
+
+enum IntraPredMode
+bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
+ const int y, const int have_top,
+ const int w, const int h,
+ const enum EdgeFlags edge_flags,
+ const pixel *const dst,
+ const ptrdiff_t stride,
+ const pixel *prefilter_toplevel_sb_edge,
+ enum IntraPredMode mode, int *const angle,
+ const int tw, const int th, const int filter_edge,
+ pixel *const topleft_out HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth = bitdepth_from_max(bitdepth_max);
+ assert(y < h && x < w);
+
+ switch (mode) {
+ case VERT_PRED:
+ case HOR_PRED:
+ case DIAG_DOWN_LEFT_PRED:
+ case DIAG_DOWN_RIGHT_PRED:
+ case VERT_RIGHT_PRED:
+ case HOR_DOWN_PRED:
+ case HOR_UP_PRED:
+ case VERT_LEFT_PRED: {
+ *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
+
+ if (*angle <= 90)
+ mode = *angle < 90 && have_top ? Z1_PRED : VERT_PRED;
+ else if (*angle < 180)
+ mode = Z2_PRED;
+ else
+ mode = *angle > 180 && have_left ? Z3_PRED : HOR_PRED;
+ break;
+ }
+ case DC_PRED:
+ case PAETH_PRED:
+ mode = av1_mode_conv[mode][have_left][have_top];
+ break;
+ default:
+ break;
+ }
+
+ const pixel *dst_top;
+ if (have_top &&
+ (av1_intra_prediction_edges[mode].needs_top ||
+ av1_intra_prediction_edges[mode].needs_topleft ||
+ (av1_intra_prediction_edges[mode].needs_left && !have_left)))
+ {
+ if (prefilter_toplevel_sb_edge) {
+ dst_top = &prefilter_toplevel_sb_edge[x * 4];
+ } else {
+ dst_top = &dst[-PXSTRIDE(stride)];
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_left) {
+ const int sz = th << 2;
+ pixel *const left = &topleft_out[-sz];
+
+ if (have_left) {
+ const int px_have = imin(sz, (h - y) << 2);
+
+ for (int i = 0; i < px_have; i++)
+ left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
+ if (px_have < sz)
+ pixel_set(left, left[sz - px_have], sz - px_have);
+ } else {
+ pixel_set(left, have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1, sz);
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_bottomleft) {
+ const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
+ (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
+
+ if (have_bottomleft) {
+ const int px_have = imin(sz, (h - y - th) << 2);
+
+ for (int i = 0; i < px_have; i++)
+ left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
+ if (px_have < sz)
+ pixel_set(left - sz, left[-px_have], sz - px_have);
+ } else {
+ pixel_set(left - sz, left[0], sz);
+ }
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_top) {
+ const int sz = tw << 2;
+ pixel *const top = &topleft_out[1];
+
+ if (have_top) {
+ const int px_have = imin(sz, (w - x) << 2);
+ pixel_copy(top, dst_top, px_have);
+ if (px_have < sz)
+ pixel_set(top + px_have, top[px_have - 1], sz - px_have);
+ } else {
+ pixel_set(top, have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1, sz);
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_topright) {
+ const int have_topright = (!have_top || x + tw >= w) ? 0 :
+ (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+ if (have_topright) {
+ const int px_have = imin(sz, (w - x - tw) << 2);
+
+ pixel_copy(top + sz, &dst_top[sz], px_have);
+ if (px_have < sz)
+ pixel_set(top + sz + px_have, top[sz + px_have - 1],
+ sz - px_have);
+ } else {
+ pixel_set(top + sz, top[sz - 1], sz);
+ }
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_topleft) {
+ if (have_left)
+ *topleft_out = have_top ? dst_top[-1] : dst[-1];
+ else
+ *topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
+
+ if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
+ *topleft_out = ((topleft_out[-1] + topleft_out[1]) * 5 +
+ topleft_out[0] * 6 + 8) >> 4;
+ }
+
+ return mode;
+}
diff --git a/third_party/dav1d/src/ipred_tmpl.c b/third_party/dav1d/src/ipred_tmpl.c
new file mode 100644
index 0000000000..151d4842a0
--- /dev/null
+++ b/third_party/dav1d/src/ipred_tmpl.c
@@ -0,0 +1,771 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/ipred.h"
+#include "src/tables.h"
+
+static NOINLINE void
+splat_dc(pixel *dst, const ptrdiff_t stride,
+ const int width, const int height, const int dc HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 8
+ assert(dc <= 0xff);
+ if (width > 4) {
+ const uint64_t dcN = dc * 0x0101010101010101ULL;
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x += sizeof(dcN))
+ *((uint64_t *) &dst[x]) = dcN;
+ dst += PXSTRIDE(stride);
+ }
+ } else {
+ const unsigned dcN = dc * 0x01010101U;
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x += sizeof(dcN))
+ *((unsigned *) &dst[x]) = dcN;
+ dst += PXSTRIDE(stride);
+ }
+ }
+#else
+ assert(dc <= bitdepth_max);
+ const uint64_t dcN = dc * 0x0001000100010001ULL;
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x += sizeof(dcN) >> 1)
+ *((uint64_t *) &dst[x]) = dcN;
+ dst += PXSTRIDE(stride);
+ }
+#endif
+}
+
+static NOINLINE void
+cfl_pred(pixel *dst, const ptrdiff_t stride,
+ const int width, const int height, const int dc,
+ const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX)
+{
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int diff = alpha * ac[x];
+ dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
+ }
+ ac += width;
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static unsigned dc_gen_top(const pixel *const topleft, const int width) {
+ unsigned dc = width >> 1;
+ for (int i = 0; i < width; i++)
+ dc += topleft[1 + i];
+ return dc >> ctz(width);
+}
+
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ splat_dc(dst, stride, width, height, dc_gen_top(topleft, width)
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+ cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static unsigned dc_gen_left(const pixel *const topleft, const int height) {
+ unsigned dc = height >> 1;
+ for (int i = 0; i < height; i++)
+ dc += topleft[-(1 + i)];
+ return dc >> ctz(height);
+}
+
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ splat_dc(dst, stride, width, height, dc_gen_left(topleft, height)
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned dc = dc_gen_left(topleft, height);
+ cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
+
+static unsigned dc_gen(const pixel *const topleft,
+ const int width, const int height)
+{
+ unsigned dc = (width + height) >> 1;
+ for (int i = 0; i < width; i++)
+ dc += topleft[i + 1];
+ for (int i = 0; i < height; i++)
+ dc += topleft[-(i + 1)];
+ dc >>= ctz(width + height);
+
+ if (width != height) {
+ dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
+ MULTIPLIER_1x2;
+ dc >>= BASE_SHIFT;
+ }
+ return dc;
+}
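+
+/*
+ * Illustration (not extra decoder code): for rectangular blocks the sample
+ * count width + height is not a power of two, so dc_gen() first divides by
+ * the power-of-two factor via ctz() and then multiplies by a fixed-point
+ * reciprocal of the remaining factor of 3 or 5:
+ *
+ *     4x8 block, all 12 edge pixels equal to 200 (8 bpc):
+ *     dc = 6 + 12 * 200 = 2406;  dc >>= ctz(12)  ->  601    (/4)
+ *     dc = (601 * 0x5556) >> 16  ->  200                    (~/3)
+ *
+ * 0x5556 / 2^16 ~= 1/3 and 0x3334 / 2^16 ~= 1/5; the 16 bpc build uses
+ * 0xAAAB / 2^17 and 0x6667 / 2^17 for the same ratios.
+ */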
+
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ splat_dc(dst, stride, width, height, dc_gen(topleft, width, height)
+ HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+ unsigned dc = dc_gen(topleft, width, height);
+ cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+#undef MULTIPLIER_1x2
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT
+
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+ const int dc = (bitdepth_max + 1) >> 1;
+#else
+ const int dc = 128;
+#endif
+ splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height,
+ const int16_t *ac, const int alpha
+ HIGHBD_DECL_SUFFIX)
+{
+#if BITDEPTH == 16
+ const int dc = (bitdepth_max + 1) >> 1;
+#else
+ const int dc = 128;
+#endif
+ cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
+}
+
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ for (int y = 0; y < height; y++) {
+ pixel_copy(dst, topleft + 1, width);
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ for (int y = 0; y < height; y++) {
+ pixel_set(dst, topleft[-(1 + y)], width);
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const tl_ptr,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int topleft = tl_ptr[0];
+ for (int y = 0; y < height; y++) {
+ const int left = tl_ptr[-(y + 1)];
+ for (int x = 0; x < width; x++) {
+ const int top = tl_ptr[1 + x];
+ const int base = left + top - topleft;
+ const int ldiff = abs(left - base);
+ const int tdiff = abs(top - base);
+ const int tldiff = abs(topleft - base);
+
+ dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
+ tdiff <= tldiff ? top : topleft;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
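+
+/*
+ * Illustration (not extra decoder code): the Paeth predictor picks whichever
+ * of left, top and topleft is closest to base = left + top - topleft, e.g.
+ *
+ *     topleft = 90, left = 60, top = 100  ->  base = 70
+ *     |left - base| = 10, |top - base| = 30, |topleft - base| = 20
+ *     ->  dst[x] = left = 60
+ */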
+
+static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+ const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+ const int right = topleft[width], bottom = topleft[-height];
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int pred = weights_ver[y] * topleft[1 + x] +
+ (256 - weights_ver[y]) * bottom +
+ weights_hor[x] * topleft[-(1 + y)] +
+ (256 - weights_hor[x]) * right;
+ dst[x] = (pred + 256) >> 9;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
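+
+/*
+ * Illustration (not extra decoder code): the four SMOOTH weights always sum
+ * to 512 (wv + (256 - wv) + wh + (256 - wh)), so (pred + 256) >> 9 is a
+ * rounded weighted average of top, bottom, left and right. With wv = wh = 128:
+ *
+ *     top = 100, bottom = 60, left = 80, right = 40
+ *     pred = 128 * (100 + 60 + 80 + 40) = 35840
+ *     (35840 + 256) >> 9 = 70        (the plain average of the four)
+ */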
+
+static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+ const int bottom = topleft[-height];
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int pred = weights_ver[y] * topleft[1 + x] +
+ (256 - weights_ver[y]) * bottom;
+ dst[x] = (pred + 128) >> 8;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft,
+ const int width, const int height, const int a,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+ const int right = topleft[width];
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ const int pred = weights_hor[x] * topleft[-(y + 1)] +
+ (256 - weights_hor[x]) * right;
+ dst[x] = (pred + 128) >> 8;
+ }
+ dst += PXSTRIDE(stride);
+ }
+}
+
+static NOINLINE int get_filter_strength(const int wh, const int angle,
+ const int is_sm)
+{
+ if (is_sm) {
+ if (wh <= 8) {
+ if (angle >= 64) return 2;
+ if (angle >= 40) return 1;
+ } else if (wh <= 16) {
+ if (angle >= 48) return 2;
+ if (angle >= 20) return 1;
+ } else if (wh <= 24) {
+ if (angle >= 4) return 3;
+ } else {
+ return 3;
+ }
+ } else {
+ if (wh <= 8) {
+ if (angle >= 56) return 1;
+ } else if (wh <= 16) {
+ if (angle >= 40) return 1;
+ } else if (wh <= 24) {
+ if (angle >= 32) return 3;
+ if (angle >= 16) return 2;
+ if (angle >= 8) return 1;
+ } else if (wh <= 32) {
+ if (angle >= 32) return 3;
+ if (angle >= 4) return 2;
+ return 1;
+ } else {
+ return 3;
+ }
+ }
+ return 0;
+}
+
+static NOINLINE void filter_edge(pixel *const out, const int sz,
+ const int lim_from, const int lim_to,
+ const pixel *const in, const int from,
+ const int to, const int strength)
+{
+ static const uint8_t kernel[3][5] = {
+ { 0, 4, 8, 4, 0 },
+ { 0, 5, 6, 5, 0 },
+ { 2, 4, 4, 4, 2 }
+ };
+
+ assert(strength > 0);
+ int i = 0;
+ for (; i < imin(sz, lim_from); i++)
+ out[i] = in[iclip(i, from, to - 1)];
+ for (; i < imin(lim_to, sz); i++) {
+ int s = 0;
+ for (int j = 0; j < 5; j++)
+ s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
+ out[i] = (s + 8) >> 4;
+ }
+ for (; i < sz; i++)
+ out[i] = in[iclip(i, from, to - 1)];
+}
+
+static inline int get_upsample(const int wh, const int angle, const int is_sm) {
+ return angle < 40 && wh <= 16 >> is_sm;
+}
+
+static NOINLINE void upsample_edge(pixel *const out, const int hsz,
+ const pixel *const in, const int from,
+ const int to HIGHBD_DECL_SUFFIX)
+{
+ static const int8_t kernel[4] = { -1, 9, 9, -1 };
+ int i;
+ for (i = 0; i < hsz - 1; i++) {
+ out[i * 2] = in[iclip(i, from, to - 1)];
+
+ int s = 0;
+ for (int j = 0; j < 4; j++)
+ s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
+ out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
+ }
+ out[i * 2] = in[iclip(i, from, to - 1)];
+}
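+
+/*
+ * Illustration (not extra decoder code): the { -1, 9, 9, -1 } kernel sums to
+ * 16, so out[2 * i + 1] is a rounded 4-tap interpolation of the half-sample
+ * position, e.g. for neighbouring edge samples 10, 20, 30, 40:
+ *
+ *     (-1 * 10 + 9 * 20 + 9 * 30 - 1 * 40 + 8) >> 4 = 408 >> 4 = 25
+ *
+ * while out[2 * i] copies the original samples unchanged.
+ */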
+
+static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle < 90);
+ int dx = dav1d_dr_intra_derivative[angle >> 1];
+ pixel top_out[64 + 64];
+ const pixel *top;
+ int max_base_x;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, 90 - angle, is_sm) : 0;
+ if (upsample_above) {
+ upsample_edge(top_out, width + height, &topleft_in[1], -1,
+ width + imin(width, height) HIGHBD_TAIL_SUFFIX);
+ top = top_out;
+ max_base_x = 2 * (width + height) - 2;
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 90 - angle, is_sm) : 0;
+ if (filter_strength) {
+ filter_edge(top_out, width + height, 0, width + height,
+ &topleft_in[1], -1, width + imin(width, height),
+ filter_strength);
+ top = top_out;
+ max_base_x = width + height - 1;
+ } else {
+ top = &topleft_in[1];
+ max_base_x = width + imin(width, height) - 1;
+ }
+ }
+ const int base_inc = 1 + upsample_above;
+ for (int y = 0, xpos = dx; y < height;
+ y++, dst += PXSTRIDE(stride), xpos += dx)
+ {
+ const int frac = xpos & 0x3E;
+
+ for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
+ if (base < max_base_x) {
+ const int v = top[base] * (64 - frac) + top[base + 1] * frac;
+ dst[x] = (v + 32) >> 6;
+ } else {
+ pixel_set(&dst[x], top[max_base_x], width - x);
+ break;
+ }
+ }
+ }
+}
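+
+/*
+ * Illustration (not extra decoder code): xpos is a 6-bit fixed-point column
+ * position, so base = xpos >> 6 selects the integer sample and
+ * frac = xpos & 0x3E its interpolation weight. With a hypothetical dx = 96
+ * and no upsampling:
+ *
+ *     y = 0: xpos =  96, base = 1, frac = 32
+ *            dst[0] = (top[1] * 32 + top[2] * 32 + 32) >> 6   (the midpoint)
+ *     y = 1: xpos = 192, base = 3, frac = 0  ->  dst[0] = top[3]
+ *
+ * The real dx comes from dav1d_dr_intra_derivative[] and depends on the angle.
+ */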
+
+static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 90 && angle < 180);
+ int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
+ int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, 180 - angle, is_sm) : 0;
+ const int upsample_above = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 90, is_sm) : 0;
+ pixel edge[64 + 64 + 1];
+ pixel *const topleft = &edge[64];
+
+ if (upsample_above) {
+ upsample_edge(topleft, width + 1, topleft_in, 0, width + 1
+ HIGHBD_TAIL_SUFFIX);
+ dx <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 90, is_sm) : 0;
+
+ if (filter_strength) {
+ filter_edge(&topleft[1], width, 0, max_width,
+ &topleft_in[1], -1, width,
+ filter_strength);
+ } else {
+ pixel_copy(&topleft[1], &topleft_in[1], width);
+ }
+ }
+ if (upsample_left) {
+ upsample_edge(&topleft[-height * 2], height + 1, &topleft_in[-height],
+ 0, height + 1 HIGHBD_TAIL_SUFFIX);
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, 180 - angle, is_sm) : 0;
+
+ if (filter_strength) {
+ filter_edge(&topleft[-height], height, height - max_height, height,
+ &topleft_in[-height],
+ 0, height + 1, filter_strength);
+ } else {
+ pixel_copy(&topleft[-height], &topleft_in[-height], height);
+ }
+ }
+ *topleft = *topleft_in;
+
+ const int base_inc_x = 1 + upsample_above;
+ const pixel *const left = &topleft[-(1 + upsample_left)];
+ for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height;
+ y++, xpos -= dx, dst += PXSTRIDE(stride))
+ {
+ int base_x = xpos >> 6;
+ const int frac_x = xpos & 0x3E;
+
+ for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
+ x++, base_x += base_inc_x, ypos -= dy)
+ {
+ int v;
+ if (base_x >= 0) {
+ v = topleft[base_x] * (64 - frac_x) +
+ topleft[base_x + 1] * frac_x;
+ } else {
+ const int base_y = ypos >> 6;
+ assert(base_y >= -(1 + upsample_left));
+ const int frac_y = ypos & 0x3E;
+ v = left[-base_y] * (64 - frac_y) +
+ left[-(base_y + 1)] * frac_y;
+ }
+ dst[x] = (v + 32) >> 6;
+ }
+ }
+}
+
+static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ const int is_sm = (angle >> 9) & 0x1;
+ const int enable_intra_edge_filter = angle >> 10;
+ angle &= 511;
+ assert(angle > 180);
+ int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
+ pixel left_out[64 + 64];
+ const pixel *left;
+ int max_base_y;
+ const int upsample_left = enable_intra_edge_filter ?
+ get_upsample(width + height, angle - 180, is_sm) : 0;
+ if (upsample_left) {
+ upsample_edge(left_out, width + height,
+ &topleft_in[-(width + height)],
+ imax(width - height, 0), width + height + 1
+ HIGHBD_TAIL_SUFFIX);
+ left = &left_out[2 * (width + height) - 2];
+ max_base_y = 2 * (width + height) - 2;
+ dy <<= 1;
+ } else {
+ const int filter_strength = enable_intra_edge_filter ?
+ get_filter_strength(width + height, angle - 180, is_sm) : 0;
+
+ if (filter_strength) {
+ filter_edge(left_out, width + height, 0, width + height,
+ &topleft_in[-(width + height)],
+ imax(width - height, 0), width + height + 1,
+ filter_strength);
+ left = &left_out[width + height - 1];
+ max_base_y = width + height - 1;
+ } else {
+ left = &topleft_in[-1];
+ max_base_y = height + imin(width, height) - 1;
+ }
+ }
+ const int base_inc = 1 + upsample_left;
+ for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
+ const int frac = ypos & 0x3E;
+
+ for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
+ if (base < max_base_y) {
+ const int v = left[-base] * (64 - frac) +
+ left[-(base + 1)] * frac;
+ dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6;
+ } else {
+ do {
+ dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
+ } while (++y < height);
+ break;
+ }
+ }
+ }
+}
+
+#if ARCH_X86
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
+ flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 + \
+ flt_ptr[16] * p2 + flt_ptr[17] * p3 + \
+ flt_ptr[32] * p4 + flt_ptr[33] * p5 + \
+ flt_ptr[48] * p6
+#define FLT_INCR 2
+#else
+#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
+ flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 + \
+ flt_ptr[16] * p2 + flt_ptr[24] * p3 + \
+ flt_ptr[32] * p4 + flt_ptr[40] * p5 + \
+ flt_ptr[48] * p6
+#define FLT_INCR 1
+#endif
+
+/* Up to 32x32 only */
+static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int filt_idx,
+ const int max_width, const int max_height
+ HIGHBD_DECL_SUFFIX)
+{
+ filt_idx &= 511;
+ assert(filt_idx < 5);
+
+ const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
+ const pixel *top = &topleft_in[1];
+ for (int y = 0; y < height; y += 2) {
+ const pixel *topleft = &topleft_in[-y];
+ const pixel *left = &topleft[-1];
+ ptrdiff_t left_stride = -1;
+ for (int x = 0; x < width; x += 4) {
+ const int p0 = *topleft;
+ const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
+ const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
+ pixel *ptr = &dst[x];
+ const int8_t *flt_ptr = filter;
+
+ for (int yy = 0; yy < 2; yy++) {
+ for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
+ const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
+ ptr[xx] = iclip_pixel((acc + 8) >> 4);
+ }
+ ptr += PXSTRIDE(stride);
+ }
+ left = &dst[x + 4 - 1];
+ left_stride = PXSTRIDE(stride);
+ top += 4;
+ topleft = &top[-1];
+ }
+ top = &dst[PXSTRIDE(stride)];
+ dst = &dst[PXSTRIDE(stride) * 2];
+ }
+}
+
+static NOINLINE void
+cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
+ const int w_pad, const int h_pad, const int width, const int height,
+ const int ss_hor, const int ss_ver)
+{
+ int y, x;
+ int16_t *const ac_orig = ac;
+
+ assert(w_pad >= 0 && w_pad * 4 < width);
+ assert(h_pad >= 0 && h_pad * 4 < height);
+
+ for (y = 0; y < height - 4 * h_pad; y++) {
+ for (x = 0; x < width - 4 * w_pad; x++) {
+ int ac_sum = ypx[x << ss_hor];
+ if (ss_hor) ac_sum += ypx[x * 2 + 1];
+ if (ss_ver) {
+ ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
+ if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
+ }
+ ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
+ }
+ for (; x < width; x++)
+ ac[x] = ac[x - 1];
+ ac += width;
+ ypx += PXSTRIDE(stride) << ss_ver;
+ }
+ for (; y < height; y++) {
+ memcpy(ac, &ac[-width], width * sizeof(*ac));
+ ac += width;
+ }
+
+ const int log2sz = ctz(width) + ctz(height);
+ int sum = (1 << log2sz) >> 1;
+ for (ac = ac_orig, y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sum += ac[x];
+ ac += width;
+ }
+ sum >>= log2sz;
+
+ // subtract DC
+ for (ac = ac_orig, y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ ac[x] -= sum;
+ ac += width;
+ }
+}
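+
+/*
+ * Illustration (not extra decoder code): the shift 1 + !ss_ver + !ss_hor puts
+ * all three layouts on the same scale, so ac[] always holds 8x the (average)
+ * co-located luma before the block mean is subtracted:
+ *
+ *     4:2:0  sum of 4 samples << 1  ==  8 * average
+ *     4:2:2  sum of 2 samples << 2  ==  8 * average
+ *     4:4:4  1 sample         << 3  ==  8 * sample
+ */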
+
+#define cfl_ac_fn(fmt, ss_hor, ss_ver) \
+static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
+ const ptrdiff_t stride, const int w_pad, \
+ const int h_pad, const int cw, const int ch) \
+{ \
+ cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
+}
+
+cfl_ac_fn(420, 1, 1)
+cfl_ac_fn(422, 1, 0)
+cfl_ac_fn(444, 0, 0)
+
+static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
+ const uint16_t *const pal, const uint8_t *idx,
+ const int w, const int h)
+{
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ dst[x] = (pixel) pal[idx[x]];
+ idx += w;
+ dst += PXSTRIDE(stride);
+ }
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/ipred.h"
+#elif ARCH_X86
+#include "src/x86/ipred.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
+ c->intra_pred[DC_PRED ] = ipred_dc_c;
+ c->intra_pred[DC_128_PRED ] = ipred_dc_128_c;
+ c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c;
+ c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
+ c->intra_pred[HOR_PRED ] = ipred_h_c;
+ c->intra_pred[VERT_PRED ] = ipred_v_c;
+ c->intra_pred[PAETH_PRED ] = ipred_paeth_c;
+ c->intra_pred[SMOOTH_PRED ] = ipred_smooth_c;
+ c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
+ c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
+ c->intra_pred[Z1_PRED ] = ipred_z1_c;
+ c->intra_pred[Z2_PRED ] = ipred_z2_c;
+ c->intra_pred[Z3_PRED ] = ipred_z3_c;
+ c->intra_pred[FILTER_PRED ] = ipred_filter_c;
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;
+
+ c->cfl_pred[DC_PRED ] = ipred_cfl_c;
+ c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
+ c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
+ c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
+
+ c->pal_pred = pal_pred_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ intra_pred_dsp_init_arm(c);
+#elif ARCH_X86
+ intra_pred_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/itx.h b/third_party/dav1d/src/itx.h
new file mode 100644
index 0000000000..d522079907
--- /dev/null
+++ b/third_party/dav1d/src/itx.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_ITX_H
+#define DAV1D_SRC_ITX_H
+
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define decl_itx_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_itx_fn(*itxfm_fn);
+
+typedef struct Dav1dInvTxfmDSPContext {
+ itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL];
+} Dav1dInvTxfmDSPContext;
+
+bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
+
+#endif /* DAV1D_SRC_ITX_H */
diff --git a/third_party/dav1d/src/itx_1d.c b/third_party/dav1d/src/itx_1d.c
new file mode 100644
index 0000000000..ca14fc8c41
--- /dev/null
+++ b/third_party/dav1d/src/itx_1d.c
@@ -0,0 +1,1034 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "common/intops.h"
+
+#include "src/itx_1d.h"
+
+#define CLIP(a) iclip(a, min, max)
+
+/*
+ * In some places, we use a pattern like this:
+ * t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+ * even though the reference code might use something like:
+ * t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
+ *
+ * The reason for this is that for 12 bits/component bitstreams (corrupt/
+ * invalid ones, but they are codable nonetheless), each coefficient or
+ * input can be 19(+sign) bits, and therefore if the combination of the
+ * two multipliers (each 12 bits) is >= 4096, the result of the add/sub
+ * after the pair of multiplies will exceed the 31+sign bit range. Signed
+ * integer overflows are UB in C, and we'd like to prevent that.
+ *
+ * To work around this, we invert one of the two coefficients (or, if both
+ * are multiples of 2, we reduce their magnitude by one bit). Note that SIMD
+ * implementations do not have to follow this exact behaviour. The AV1 spec
+ * clearly states that the result of the multiply/add pairs should fit in
+ * 31+sign bit intermediates, and that streams violating this convention are
+ * not AV1-compliant. So, as long as we don't trigger UB (which some people
+ * would consider a security vulnerability), we're fine, and SIMD can simply
+ * use the faster implementation, even if that might in some cases result in
+ * integer overflows: such streams are not valid AV1 anyway, and in e.g. x86
+ * assembly integer overflows are not UB, they merely wrap around.
+ */
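+
+/*
+ * Concretely (an illustration, not extra decoder code), the rewrite relies on
+ *
+ *     in3 * 3784 == in3 * (3784 - 4096) + (in3 << 12)
+ *
+ * together with the fact that subtracting a multiple of 4096 before an
+ * arithmetic >> 12 equals subtracting it after the shift, so
+ *
+ *     (in1 * 1567 - in3 * 3784 + 2048) >> 12
+ *  == ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3
+ *
+ * while the pre-shift intermediate now only sees the small factor 312
+ * (= 4096 - 3784) instead of 3784, which keeps it within 31+sign bits.
+ */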
+
+static NOINLINE void
+inv_dct4_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, const int tx64)
+{
+ assert(stride > 0);
+ const int in0 = c[0 * stride], in1 = c[1 * stride];
+
+ int t0, t1, t2, t3;
+ if (tx64) {
+ t0 = t1 = (in0 * 181 + 128) >> 8;
+ t2 = (in1 * 1567 + 2048) >> 12;
+ t3 = (in1 * 3784 + 2048) >> 12;
+ } else {
+ const int in2 = c[2 * stride], in3 = c[3 * stride];
+
+ t0 = ((in0 + in2) * 181 + 128) >> 8;
+ t1 = ((in0 - in2) * 181 + 128) >> 8;
+ t2 = ((in1 * 1567 - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+ t3 = ((in1 * (3784 - 4096) + in3 * 1567 + 2048) >> 12) + in1;
+ }
+
+ c[0 * stride] = CLIP(t0 + t3);
+ c[1 * stride] = CLIP(t1 + t2);
+ c[2 * stride] = CLIP(t1 - t2);
+ c[3 * stride] = CLIP(t0 - t3);
+}
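+
+/*
+ * Illustration (not extra decoder code): for a DC-only input column
+ * (in1 == in2 == in3 == 0) the rotation terms vanish and all four outputs
+ * collapse to (in0 * 181 + 128) >> 8, i.e. roughly in0 / sqrt(2), since
+ * 181 / 256 ~= 0.7070. For example, in0 = 1024 gives 724
+ * (1024 / sqrt(2) ~= 724.1).
+ */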
+
+void dav1d_inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct4_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, const int tx64)
+{
+ assert(stride > 0);
+ inv_dct4_1d_internal_c(c, stride << 1, min, max, tx64);
+
+ const int in1 = c[1 * stride], in3 = c[3 * stride];
+
+ int t4a, t5a, t6a, t7a;
+ if (tx64) {
+ t4a = (in1 * 799 + 2048) >> 12;
+ t5a = (in3 * -2276 + 2048) >> 12;
+ t6a = (in3 * 3406 + 2048) >> 12;
+ t7a = (in1 * 4017 + 2048) >> 12;
+ } else {
+ const int in5 = c[5 * stride], in7 = c[7 * stride];
+
+ t4a = ((in1 * 799 - in7 * (4017 - 4096) + 2048) >> 12) - in7;
+ t5a = (in5 * 1703 - in3 * 1138 + 1024) >> 11;
+ t6a = (in5 * 1138 + in3 * 1703 + 1024) >> 11;
+ t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1;
+ }
+
+ const int t4 = CLIP(t4a + t5a);
+ t5a = CLIP(t4a - t5a);
+ const int t7 = CLIP(t7a + t6a);
+ t6a = CLIP(t7a - t6a);
+
+ const int t5 = ((t6a - t5a) * 181 + 128) >> 8;
+ const int t6 = ((t6a + t5a) * 181 + 128) >> 8;
+
+ const int t0 = c[0 * stride];
+ const int t1 = c[2 * stride];
+ const int t2 = c[4 * stride];
+ const int t3 = c[6 * stride];
+
+ c[0 * stride] = CLIP(t0 + t7);
+ c[1 * stride] = CLIP(t1 + t6);
+ c[2 * stride] = CLIP(t2 + t5);
+ c[3 * stride] = CLIP(t3 + t4);
+ c[4 * stride] = CLIP(t3 - t4);
+ c[5 * stride] = CLIP(t2 - t5);
+ c[6 * stride] = CLIP(t1 - t6);
+ c[7 * stride] = CLIP(t0 - t7);
+}
+
+void dav1d_inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct8_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct16_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, int tx64)
+{
+ assert(stride > 0);
+ inv_dct8_1d_internal_c(c, stride << 1, min, max, tx64);
+
+ const int in1 = c[1 * stride], in3 = c[3 * stride];
+ const int in5 = c[5 * stride], in7 = c[7 * stride];
+
+ int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+ if (tx64) {
+ t8a = (in1 * 401 + 2048) >> 12;
+ t9a = (in7 * -2598 + 2048) >> 12;
+ t10a = (in5 * 1931 + 2048) >> 12;
+ t11a = (in3 * -1189 + 2048) >> 12;
+ t12a = (in3 * 3920 + 2048) >> 12;
+ t13a = (in5 * 3612 + 2048) >> 12;
+ t14a = (in7 * 3166 + 2048) >> 12;
+ t15a = (in1 * 4076 + 2048) >> 12;
+ } else {
+ const int in9 = c[ 9 * stride], in11 = c[11 * stride];
+ const int in13 = c[13 * stride], in15 = c[15 * stride];
+
+ t8a = ((in1 * 401 - in15 * (4076 - 4096) + 2048) >> 12) - in15;
+ t9a = (in9 * 1583 - in7 * 1299 + 1024) >> 11;
+ t10a = ((in5 * 1931 - in11 * (3612 - 4096) + 2048) >> 12) - in11;
+ t11a = ((in13 * (3920 - 4096) - in3 * 1189 + 2048) >> 12) + in13;
+ t12a = ((in13 * 1189 + in3 * (3920 - 4096) + 2048) >> 12) + in3;
+ t13a = ((in5 * (3612 - 4096) + in11 * 1931 + 2048) >> 12) + in5;
+ t14a = (in9 * 1299 + in7 * 1583 + 1024) >> 11;
+ t15a = ((in1 * (4076 - 4096) + in15 * 401 + 2048) >> 12) + in1;
+ }
+
+ int t8 = CLIP(t8a + t9a);
+ int t9 = CLIP(t8a - t9a);
+ int t10 = CLIP(t11a - t10a);
+ int t11 = CLIP(t11a + t10a);
+ int t12 = CLIP(t12a + t13a);
+ int t13 = CLIP(t12a - t13a);
+ int t14 = CLIP(t15a - t14a);
+ int t15 = CLIP(t15a + t14a);
+
+ t9a = (( t14 * 1567 - t9 * (3784 - 4096) + 2048) >> 12) - t9;
+ t14a = (( t14 * (3784 - 4096) + t9 * 1567 + 2048) >> 12) + t14;
+ t10a = ((-(t13 * (3784 - 4096) + t10 * 1567) + 2048) >> 12) - t13;
+ t13a = (( t13 * 1567 - t10 * (3784 - 4096) + 2048) >> 12) - t10;
+
+ t8a = CLIP(t8 + t11);
+ t9 = CLIP(t9a + t10a);
+ t10 = CLIP(t9a - t10a);
+ t11a = CLIP(t8 - t11);
+ t12a = CLIP(t15 - t12);
+ t13 = CLIP(t14a - t13a);
+ t14 = CLIP(t14a + t13a);
+ t15a = CLIP(t15 + t12);
+
+ t10a = ((t13 - t10) * 181 + 128) >> 8;
+ t13a = ((t13 + t10) * 181 + 128) >> 8;
+ t11 = ((t12a - t11a) * 181 + 128) >> 8;
+ t12 = ((t12a + t11a) * 181 + 128) >> 8;
+
+ const int t0 = c[ 0 * stride];
+ const int t1 = c[ 2 * stride];
+ const int t2 = c[ 4 * stride];
+ const int t3 = c[ 6 * stride];
+ const int t4 = c[ 8 * stride];
+ const int t5 = c[10 * stride];
+ const int t6 = c[12 * stride];
+ const int t7 = c[14 * stride];
+
+ c[ 0 * stride] = CLIP(t0 + t15a);
+ c[ 1 * stride] = CLIP(t1 + t14);
+ c[ 2 * stride] = CLIP(t2 + t13a);
+ c[ 3 * stride] = CLIP(t3 + t12);
+ c[ 4 * stride] = CLIP(t4 + t11);
+ c[ 5 * stride] = CLIP(t5 + t10a);
+ c[ 6 * stride] = CLIP(t6 + t9);
+ c[ 7 * stride] = CLIP(t7 + t8a);
+ c[ 8 * stride] = CLIP(t7 - t8a);
+ c[ 9 * stride] = CLIP(t6 - t9);
+ c[10 * stride] = CLIP(t5 - t10a);
+ c[11 * stride] = CLIP(t4 - t11);
+ c[12 * stride] = CLIP(t3 - t12);
+ c[13 * stride] = CLIP(t2 - t13a);
+ c[14 * stride] = CLIP(t1 - t14);
+ c[15 * stride] = CLIP(t0 - t15a);
+}
+
+void dav1d_inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct16_1d_internal_c(c, stride, min, max, 0);
+}
+
+static NOINLINE void
+inv_dct32_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max, const int tx64)
+{
+ assert(stride > 0);
+ inv_dct16_1d_internal_c(c, stride << 1, min, max, tx64);
+
+ const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
+ const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
+ const int in9 = c[ 9 * stride], in11 = c[11 * stride];
+ const int in13 = c[13 * stride], in15 = c[15 * stride];
+
+ int t16a, t17a, t18a, t19a, t20a, t21a, t22a, t23a;
+ int t24a, t25a, t26a, t27a, t28a, t29a, t30a, t31a;
+ if (tx64) {
+ t16a = (in1 * 201 + 2048) >> 12;
+ t17a = (in15 * -2751 + 2048) >> 12;
+ t18a = (in9 * 1751 + 2048) >> 12;
+ t19a = (in7 * -1380 + 2048) >> 12;
+ t20a = (in5 * 995 + 2048) >> 12;
+ t21a = (in11 * -2106 + 2048) >> 12;
+ t22a = (in13 * 2440 + 2048) >> 12;
+ t23a = (in3 * -601 + 2048) >> 12;
+ t24a = (in3 * 4052 + 2048) >> 12;
+ t25a = (in13 * 3290 + 2048) >> 12;
+ t26a = (in11 * 3513 + 2048) >> 12;
+ t27a = (in5 * 3973 + 2048) >> 12;
+ t28a = (in7 * 3857 + 2048) >> 12;
+ t29a = (in9 * 3703 + 2048) >> 12;
+ t30a = (in15 * 3035 + 2048) >> 12;
+ t31a = (in1 * 4091 + 2048) >> 12;
+ } else {
+ const int in17 = c[17 * stride], in19 = c[19 * stride];
+ const int in21 = c[21 * stride], in23 = c[23 * stride];
+ const int in25 = c[25 * stride], in27 = c[27 * stride];
+ const int in29 = c[29 * stride], in31 = c[31 * stride];
+
+ t16a = ((in1 * 201 - in31 * (4091 - 4096) + 2048) >> 12) - in31;
+ t17a = ((in17 * (3035 - 4096) - in15 * 2751 + 2048) >> 12) + in17;
+ t18a = ((in9 * 1751 - in23 * (3703 - 4096) + 2048) >> 12) - in23;
+ t19a = ((in25 * (3857 - 4096) - in7 * 1380 + 2048) >> 12) + in25;
+ t20a = ((in5 * 995 - in27 * (3973 - 4096) + 2048) >> 12) - in27;
+ t21a = ((in21 * (3513 - 4096) - in11 * 2106 + 2048) >> 12) + in21;
+ t22a = (in13 * 1220 - in19 * 1645 + 1024) >> 11;
+ t23a = ((in29 * (4052 - 4096) - in3 * 601 + 2048) >> 12) + in29;
+ t24a = ((in29 * 601 + in3 * (4052 - 4096) + 2048) >> 12) + in3;
+ t25a = (in13 * 1645 + in19 * 1220 + 1024) >> 11;
+ t26a = ((in21 * 2106 + in11 * (3513 - 4096) + 2048) >> 12) + in11;
+ t27a = ((in5 * (3973 - 4096) + in27 * 995 + 2048) >> 12) + in5;
+ t28a = ((in25 * 1380 + in7 * (3857 - 4096) + 2048) >> 12) + in7;
+ t29a = ((in9 * (3703 - 4096) + in23 * 1751 + 2048) >> 12) + in9;
+ t30a = ((in17 * 2751 + in15 * (3035 - 4096) + 2048) >> 12) + in15;
+ t31a = ((in1 * (4091 - 4096) + in31 * 201 + 2048) >> 12) + in1;
+ }
+
+ int t16 = CLIP(t16a + t17a);
+ int t17 = CLIP(t16a - t17a);
+ int t18 = CLIP(t19a - t18a);
+ int t19 = CLIP(t19a + t18a);
+ int t20 = CLIP(t20a + t21a);
+ int t21 = CLIP(t20a - t21a);
+ int t22 = CLIP(t23a - t22a);
+ int t23 = CLIP(t23a + t22a);
+ int t24 = CLIP(t24a + t25a);
+ int t25 = CLIP(t24a - t25a);
+ int t26 = CLIP(t27a - t26a);
+ int t27 = CLIP(t27a + t26a);
+ int t28 = CLIP(t28a + t29a);
+ int t29 = CLIP(t28a - t29a);
+ int t30 = CLIP(t31a - t30a);
+ int t31 = CLIP(t31a + t30a);
+
+ t17a = (( t30 * 799 - t17 * (4017 - 4096) + 2048) >> 12) - t17;
+ t30a = (( t30 * (4017 - 4096) + t17 * 799 + 2048) >> 12) + t30;
+ t18a = ((-(t29 * (4017 - 4096) + t18 * 799) + 2048) >> 12) - t29;
+ t29a = (( t29 * 799 - t18 * (4017 - 4096) + 2048) >> 12) - t18;
+ t21a = ( t26 * 1703 - t21 * 1138 + 1024) >> 11;
+ t26a = ( t26 * 1138 + t21 * 1703 + 1024) >> 11;
+ t22a = (-(t25 * 1138 + t22 * 1703 ) + 1024) >> 11;
+ t25a = ( t25 * 1703 - t22 * 1138 + 1024) >> 11;
+
+ t16a = CLIP(t16 + t19);
+ t17 = CLIP(t17a + t18a);
+ t18 = CLIP(t17a - t18a);
+ t19a = CLIP(t16 - t19);
+ t20a = CLIP(t23 - t20);
+ t21 = CLIP(t22a - t21a);
+ t22 = CLIP(t22a + t21a);
+ t23a = CLIP(t23 + t20);
+ t24a = CLIP(t24 + t27);
+ t25 = CLIP(t25a + t26a);
+ t26 = CLIP(t25a - t26a);
+ t27a = CLIP(t24 - t27);
+ t28a = CLIP(t31 - t28);
+ t29 = CLIP(t30a - t29a);
+ t30 = CLIP(t30a + t29a);
+ t31a = CLIP(t31 + t28);
+
+ t18a = (( t29 * 1567 - t18 * (3784 - 4096) + 2048) >> 12) - t18;
+ t29a = (( t29 * (3784 - 4096) + t18 * 1567 + 2048) >> 12) + t29;
+ t19 = (( t28a * 1567 - t19a * (3784 - 4096) + 2048) >> 12) - t19a;
+ t28 = (( t28a * (3784 - 4096) + t19a * 1567 + 2048) >> 12) + t28a;
+ t20 = ((-(t27a * (3784 - 4096) + t20a * 1567) + 2048) >> 12) - t27a;
+ t27 = (( t27a * 1567 - t20a * (3784 - 4096) + 2048) >> 12) - t20a;
+ t21a = ((-(t26 * (3784 - 4096) + t21 * 1567) + 2048) >> 12) - t26;
+ t26a = (( t26 * 1567 - t21 * (3784 - 4096) + 2048) >> 12) - t21;
+
+ t16 = CLIP(t16a + t23a);
+ t17a = CLIP(t17 + t22);
+ t18 = CLIP(t18a + t21a);
+ t19a = CLIP(t19 + t20);
+ t20a = CLIP(t19 - t20);
+ t21 = CLIP(t18a - t21a);
+ t22a = CLIP(t17 - t22);
+ t23 = CLIP(t16a - t23a);
+ t24 = CLIP(t31a - t24a);
+ t25a = CLIP(t30 - t25);
+ t26 = CLIP(t29a - t26a);
+ t27a = CLIP(t28 - t27);
+ t28a = CLIP(t28 + t27);
+ t29 = CLIP(t29a + t26a);
+ t30a = CLIP(t30 + t25);
+ t31 = CLIP(t31a + t24a);
+
+ t20 = ((t27a - t20a) * 181 + 128) >> 8;
+ t27 = ((t27a + t20a) * 181 + 128) >> 8;
+ t21a = ((t26 - t21 ) * 181 + 128) >> 8;
+ t26a = ((t26 + t21 ) * 181 + 128) >> 8;
+ t22 = ((t25a - t22a) * 181 + 128) >> 8;
+ t25 = ((t25a + t22a) * 181 + 128) >> 8;
+ t23a = ((t24 - t23 ) * 181 + 128) >> 8;
+ t24a = ((t24 + t23 ) * 181 + 128) >> 8;
+
+ const int t0 = c[ 0 * stride];
+ const int t1 = c[ 2 * stride];
+ const int t2 = c[ 4 * stride];
+ const int t3 = c[ 6 * stride];
+ const int t4 = c[ 8 * stride];
+ const int t5 = c[10 * stride];
+ const int t6 = c[12 * stride];
+ const int t7 = c[14 * stride];
+ const int t8 = c[16 * stride];
+ const int t9 = c[18 * stride];
+ const int t10 = c[20 * stride];
+ const int t11 = c[22 * stride];
+ const int t12 = c[24 * stride];
+ const int t13 = c[26 * stride];
+ const int t14 = c[28 * stride];
+ const int t15 = c[30 * stride];
+
+ c[ 0 * stride] = CLIP(t0 + t31);
+ c[ 1 * stride] = CLIP(t1 + t30a);
+ c[ 2 * stride] = CLIP(t2 + t29);
+ c[ 3 * stride] = CLIP(t3 + t28a);
+ c[ 4 * stride] = CLIP(t4 + t27);
+ c[ 5 * stride] = CLIP(t5 + t26a);
+ c[ 6 * stride] = CLIP(t6 + t25);
+ c[ 7 * stride] = CLIP(t7 + t24a);
+ c[ 8 * stride] = CLIP(t8 + t23a);
+ c[ 9 * stride] = CLIP(t9 + t22);
+ c[10 * stride] = CLIP(t10 + t21a);
+ c[11 * stride] = CLIP(t11 + t20);
+ c[12 * stride] = CLIP(t12 + t19a);
+ c[13 * stride] = CLIP(t13 + t18);
+ c[14 * stride] = CLIP(t14 + t17a);
+ c[15 * stride] = CLIP(t15 + t16);
+ c[16 * stride] = CLIP(t15 - t16);
+ c[17 * stride] = CLIP(t14 - t17a);
+ c[18 * stride] = CLIP(t13 - t18);
+ c[19 * stride] = CLIP(t12 - t19a);
+ c[20 * stride] = CLIP(t11 - t20);
+ c[21 * stride] = CLIP(t10 - t21a);
+ c[22 * stride] = CLIP(t9 - t22);
+ c[23 * stride] = CLIP(t8 - t23a);
+ c[24 * stride] = CLIP(t7 - t24a);
+ c[25 * stride] = CLIP(t6 - t25);
+ c[26 * stride] = CLIP(t5 - t26a);
+ c[27 * stride] = CLIP(t4 - t27);
+ c[28 * stride] = CLIP(t3 - t28a);
+ c[29 * stride] = CLIP(t2 - t29);
+ c[30 * stride] = CLIP(t1 - t30a);
+ c[31 * stride] = CLIP(t0 - t31);
+}
+
+void dav1d_inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ inv_dct32_1d_internal_c(c, stride, min, max, 0);
+}
+
+void dav1d_inv_dct64_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ inv_dct32_1d_internal_c(c, stride << 1, min, max, 1);
+
+ const int in1 = c[ 1 * stride], in3 = c[ 3 * stride];
+ const int in5 = c[ 5 * stride], in7 = c[ 7 * stride];
+ const int in9 = c[ 9 * stride], in11 = c[11 * stride];
+ const int in13 = c[13 * stride], in15 = c[15 * stride];
+ const int in17 = c[17 * stride], in19 = c[19 * stride];
+ const int in21 = c[21 * stride], in23 = c[23 * stride];
+ const int in25 = c[25 * stride], in27 = c[27 * stride];
+ const int in29 = c[29 * stride], in31 = c[31 * stride];
+
+ int t32a = (in1 * 101 + 2048) >> 12;
+ int t33a = (in31 * -2824 + 2048) >> 12;
+ int t34a = (in17 * 1660 + 2048) >> 12;
+ int t35a = (in15 * -1474 + 2048) >> 12;
+ int t36a = (in9 * 897 + 2048) >> 12;
+ int t37a = (in23 * -2191 + 2048) >> 12;
+ int t38a = (in25 * 2359 + 2048) >> 12;
+ int t39a = (in7 * -700 + 2048) >> 12;
+ int t40a = (in5 * 501 + 2048) >> 12;
+ int t41a = (in27 * -2520 + 2048) >> 12;
+ int t42a = (in21 * 2019 + 2048) >> 12;
+ int t43a = (in11 * -1092 + 2048) >> 12;
+ int t44a = (in13 * 1285 + 2048) >> 12;
+ int t45a = (in19 * -1842 + 2048) >> 12;
+ int t46a = (in29 * 2675 + 2048) >> 12;
+ int t47a = (in3 * -301 + 2048) >> 12;
+ int t48a = (in3 * 4085 + 2048) >> 12;
+ int t49a = (in29 * 3102 + 2048) >> 12;
+ int t50a = (in19 * 3659 + 2048) >> 12;
+ int t51a = (in13 * 3889 + 2048) >> 12;
+ int t52a = (in11 * 3948 + 2048) >> 12;
+ int t53a = (in21 * 3564 + 2048) >> 12;
+ int t54a = (in27 * 3229 + 2048) >> 12;
+ int t55a = (in5 * 4065 + 2048) >> 12;
+ int t56a = (in7 * 4036 + 2048) >> 12;
+ int t57a = (in25 * 3349 + 2048) >> 12;
+ int t58a = (in23 * 3461 + 2048) >> 12;
+ int t59a = (in9 * 3996 + 2048) >> 12;
+ int t60a = (in15 * 3822 + 2048) >> 12;
+ int t61a = (in17 * 3745 + 2048) >> 12;
+ int t62a = (in31 * 2967 + 2048) >> 12;
+ int t63a = (in1 * 4095 + 2048) >> 12;
+
+ int t32 = CLIP(t32a + t33a);
+ int t33 = CLIP(t32a - t33a);
+ int t34 = CLIP(t35a - t34a);
+ int t35 = CLIP(t35a + t34a);
+ int t36 = CLIP(t36a + t37a);
+ int t37 = CLIP(t36a - t37a);
+ int t38 = CLIP(t39a - t38a);
+ int t39 = CLIP(t39a + t38a);
+ int t40 = CLIP(t40a + t41a);
+ int t41 = CLIP(t40a - t41a);
+ int t42 = CLIP(t43a - t42a);
+ int t43 = CLIP(t43a + t42a);
+ int t44 = CLIP(t44a + t45a);
+ int t45 = CLIP(t44a - t45a);
+ int t46 = CLIP(t47a - t46a);
+ int t47 = CLIP(t47a + t46a);
+ int t48 = CLIP(t48a + t49a);
+ int t49 = CLIP(t48a - t49a);
+ int t50 = CLIP(t51a - t50a);
+ int t51 = CLIP(t51a + t50a);
+ int t52 = CLIP(t52a + t53a);
+ int t53 = CLIP(t52a - t53a);
+ int t54 = CLIP(t55a - t54a);
+ int t55 = CLIP(t55a + t54a);
+ int t56 = CLIP(t56a + t57a);
+ int t57 = CLIP(t56a - t57a);
+ int t58 = CLIP(t59a - t58a);
+ int t59 = CLIP(t59a + t58a);
+ int t60 = CLIP(t60a + t61a);
+ int t61 = CLIP(t60a - t61a);
+ int t62 = CLIP(t63a - t62a);
+ int t63 = CLIP(t63a + t62a);
+
+ t33a = ((t33 * (4096 - 4076) + t62 * 401 + 2048) >> 12) - t33;
+ t34a = ((t34 * -401 + t61 * (4096 - 4076) + 2048) >> 12) - t61;
+ t37a = (t37 * -1299 + t58 * 1583 + 1024) >> 11;
+ t38a = (t38 * -1583 + t57 * -1299 + 1024) >> 11;
+ t41a = ((t41 * (4096 - 3612) + t54 * 1931 + 2048) >> 12) - t41;
+ t42a = ((t42 * -1931 + t53 * (4096 - 3612) + 2048) >> 12) - t53;
+ t45a = ((t45 * -1189 + t50 * (3920 - 4096) + 2048) >> 12) + t50;
+ t46a = ((t46 * (4096 - 3920) + t49 * -1189 + 2048) >> 12) - t46;
+ t49a = ((t46 * -1189 + t49 * (3920 - 4096) + 2048) >> 12) + t49;
+ t50a = ((t45 * (3920 - 4096) + t50 * 1189 + 2048) >> 12) + t45;
+ t53a = ((t42 * (4096 - 3612) + t53 * 1931 + 2048) >> 12) - t42;
+ t54a = ((t41 * 1931 + t54 * (3612 - 4096) + 2048) >> 12) + t54;
+ t57a = (t38 * -1299 + t57 * 1583 + 1024) >> 11;
+ t58a = (t37 * 1583 + t58 * 1299 + 1024) >> 11;
+ t61a = ((t34 * (4096 - 4076) + t61 * 401 + 2048) >> 12) - t34;
+ t62a = ((t33 * 401 + t62 * (4076 - 4096) + 2048) >> 12) + t62;
+
+ t32a = CLIP(t32 + t35);
+ t33 = CLIP(t33a + t34a);
+ t34 = CLIP(t33a - t34a);
+ t35a = CLIP(t32 - t35);
+ t36a = CLIP(t39 - t36);
+ t37 = CLIP(t38a - t37a);
+ t38 = CLIP(t38a + t37a);
+ t39a = CLIP(t39 + t36);
+ t40a = CLIP(t40 + t43);
+ t41 = CLIP(t41a + t42a);
+ t42 = CLIP(t41a - t42a);
+ t43a = CLIP(t40 - t43);
+ t44a = CLIP(t47 - t44);
+ t45 = CLIP(t46a - t45a);
+ t46 = CLIP(t46a + t45a);
+ t47a = CLIP(t47 + t44);
+ t48a = CLIP(t48 + t51);
+ t49 = CLIP(t49a + t50a);
+ t50 = CLIP(t49a - t50a);
+ t51a = CLIP(t48 - t51);
+ t52a = CLIP(t55 - t52);
+ t53 = CLIP(t54a - t53a);
+ t54 = CLIP(t54a + t53a);
+ t55a = CLIP(t55 + t52);
+ t56a = CLIP(t56 + t59);
+ t57 = CLIP(t57a + t58a);
+ t58 = CLIP(t57a - t58a);
+ t59a = CLIP(t56 - t59);
+ t60a = CLIP(t63 - t60);
+ t61 = CLIP(t62a - t61a);
+ t62 = CLIP(t62a + t61a);
+ t63a = CLIP(t63 + t60);
+
+ t34a = ((t34 * (4096 - 4017) + t61 * 799 + 2048) >> 12) - t34;
+ t35 = ((t35a * (4096 - 4017) + t60a * 799 + 2048) >> 12) - t35a;
+ t36 = ((t36a * -799 + t59a * (4096 - 4017) + 2048) >> 12) - t59a;
+ t37a = ((t37 * -799 + t58 * (4096 - 4017) + 2048) >> 12) - t58;
+ t42a = (t42 * -1138 + t53 * 1703 + 1024) >> 11;
+ t43 = (t43a * -1138 + t52a * 1703 + 1024) >> 11;
+ t44 = (t44a * -1703 + t51a * -1138 + 1024) >> 11;
+ t45a = (t45 * -1703 + t50 * -1138 + 1024) >> 11;
+ t50a = (t45 * -1138 + t50 * 1703 + 1024) >> 11;
+ t51 = (t44a * -1138 + t51a * 1703 + 1024) >> 11;
+ t52 = (t43a * 1703 + t52a * 1138 + 1024) >> 11;
+ t53a = (t42 * 1703 + t53 * 1138 + 1024) >> 11;
+ t58a = ((t37 * (4096 - 4017) + t58 * 799 + 2048) >> 12) - t37;
+ t59 = ((t36a * (4096 - 4017) + t59a * 799 + 2048) >> 12) - t36a;
+ t60 = ((t35a * 799 + t60a * (4017 - 4096) + 2048) >> 12) + t60a;
+ t61a = ((t34 * 799 + t61 * (4017 - 4096) + 2048) >> 12) + t61;
+
+ t32 = CLIP(t32a + t39a);
+ t33a = CLIP(t33 + t38);
+ t34 = CLIP(t34a + t37a);
+ t35a = CLIP(t35 + t36);
+ t36a = CLIP(t35 - t36);
+ t37 = CLIP(t34a - t37a);
+ t38a = CLIP(t33 - t38);
+ t39 = CLIP(t32a - t39a);
+ t40 = CLIP(t47a - t40a);
+ t41a = CLIP(t46 - t41);
+ t42 = CLIP(t45a - t42a);
+ t43a = CLIP(t44 - t43);
+ t44a = CLIP(t44 + t43);
+ t45 = CLIP(t45a + t42a);
+ t46a = CLIP(t46 + t41);
+ t47 = CLIP(t47a + t40a);
+ t48 = CLIP(t48a + t55a);
+ t49a = CLIP(t49 + t54);
+ t50 = CLIP(t50a + t53a);
+ t51a = CLIP(t51 + t52);
+ t52a = CLIP(t51 - t52);
+ t53 = CLIP(t50a - t53a);
+ t54a = CLIP(t49 - t54);
+ t55 = CLIP(t48a - t55a);
+ t56 = CLIP(t63a - t56a);
+ t57a = CLIP(t62 - t57);
+ t58 = CLIP(t61a - t58a);
+ t59a = CLIP(t60 - t59);
+ t60a = CLIP(t60 + t59);
+ t61 = CLIP(t61a + t58a);
+ t62a = CLIP(t62 + t57);
+ t63 = CLIP(t63a + t56a);
+
+ t36 = ((t36a * (4096 - 3784) + t59a * 1567 + 2048) >> 12) - t36a;
+ t37a = ((t37 * (4096 - 3784) + t58 * 1567 + 2048) >> 12) - t37;
+ t38 = ((t38a * (4096 - 3784) + t57a * 1567 + 2048) >> 12) - t38a;
+ t39a = ((t39 * (4096 - 3784) + t56 * 1567 + 2048) >> 12) - t39;
+ t40a = ((t40 * -1567 + t55 * (4096 - 3784) + 2048) >> 12) - t55;
+ t41 = ((t41a * -1567 + t54a * (4096 - 3784) + 2048) >> 12) - t54a;
+ t42a = ((t42 * -1567 + t53 * (4096 - 3784) + 2048) >> 12) - t53;
+ t43 = ((t43a * -1567 + t52a * (4096 - 3784) + 2048) >> 12) - t52a;
+ t52 = ((t43a * (4096 - 3784) + t52a * 1567 + 2048) >> 12) - t43a;
+ t53a = ((t42 * (4096 - 3784) + t53 * 1567 + 2048) >> 12) - t42;
+ t54 = ((t41a * (4096 - 3784) + t54a * 1567 + 2048) >> 12) - t41a;
+ t55a = ((t40 * (4096 - 3784) + t55 * 1567 + 2048) >> 12) - t40;
+ t56a = ((t39 * 1567 + t56 * (3784 - 4096) + 2048) >> 12) + t56;
+ t57 = ((t38a * 1567 + t57a * (3784 - 4096) + 2048) >> 12) + t57a;
+ t58a = ((t37 * 1567 + t58 * (3784 - 4096) + 2048) >> 12) + t58;
+ t59 = ((t36a * 1567 + t59a * (3784 - 4096) + 2048) >> 12) + t59a;
+
+ t32a = CLIP(t32 + t47);
+ t33 = CLIP(t33a + t46a);
+ t34a = CLIP(t34 + t45);
+ t35 = CLIP(t35a + t44a);
+ t36a = CLIP(t36 + t43);
+ t37 = CLIP(t37a + t42a);
+ t38a = CLIP(t38 + t41);
+ t39 = CLIP(t39a + t40a);
+ t40 = CLIP(t39a - t40a);
+ t41a = CLIP(t38 - t41);
+ t42 = CLIP(t37a - t42a);
+ t43a = CLIP(t36 - t43);
+ t44 = CLIP(t35a - t44a);
+ t45a = CLIP(t34 - t45);
+ t46 = CLIP(t33a - t46a);
+ t47a = CLIP(t32 - t47);
+ t48a = CLIP(t63 - t48);
+ t49 = CLIP(t62a - t49a);
+ t50a = CLIP(t61 - t50);
+ t51 = CLIP(t60a - t51a);
+ t52a = CLIP(t59 - t52);
+ t53 = CLIP(t58a - t53a);
+ t54a = CLIP(t57 - t54);
+ t55 = CLIP(t56a - t55a);
+ t56 = CLIP(t56a + t55a);
+ t57a = CLIP(t57 + t54);
+ t58 = CLIP(t58a + t53a);
+ t59a = CLIP(t59 + t52);
+ t60 = CLIP(t60a + t51a);
+ t61a = CLIP(t61 + t50);
+ t62 = CLIP(t62a + t49a);
+ t63a = CLIP(t63 + t48);
+
+ t40a = ((t55 - t40 ) * 181 + 128) >> 8;
+ t41 = ((t54a - t41a) * 181 + 128) >> 8;
+ t42a = ((t53 - t42 ) * 181 + 128) >> 8;
+ t43 = ((t52a - t43a) * 181 + 128) >> 8;
+ t44a = ((t51 - t44 ) * 181 + 128) >> 8;
+ t45 = ((t50a - t45a) * 181 + 128) >> 8;
+ t46a = ((t49 - t46 ) * 181 + 128) >> 8;
+ t47 = ((t48a - t47a) * 181 + 128) >> 8;
+ t48 = ((t47a + t48a) * 181 + 128) >> 8;
+ t49a = ((t46 + t49 ) * 181 + 128) >> 8;
+ t50 = ((t45a + t50a) * 181 + 128) >> 8;
+ t51a = ((t44 + t51 ) * 181 + 128) >> 8;
+ t52 = ((t43a + t52a) * 181 + 128) >> 8;
+ t53a = ((t42 + t53 ) * 181 + 128) >> 8;
+ t54 = ((t41a + t54a) * 181 + 128) >> 8;
+ t55a = ((t40 + t55 ) * 181 + 128) >> 8;
+
+ const int t0 = c[ 0 * stride];
+ const int t1 = c[ 2 * stride];
+ const int t2 = c[ 4 * stride];
+ const int t3 = c[ 6 * stride];
+ const int t4 = c[ 8 * stride];
+ const int t5 = c[10 * stride];
+ const int t6 = c[12 * stride];
+ const int t7 = c[14 * stride];
+ const int t8 = c[16 * stride];
+ const int t9 = c[18 * stride];
+ const int t10 = c[20 * stride];
+ const int t11 = c[22 * stride];
+ const int t12 = c[24 * stride];
+ const int t13 = c[26 * stride];
+ const int t14 = c[28 * stride];
+ const int t15 = c[30 * stride];
+ const int t16 = c[32 * stride];
+ const int t17 = c[34 * stride];
+ const int t18 = c[36 * stride];
+ const int t19 = c[38 * stride];
+ const int t20 = c[40 * stride];
+ const int t21 = c[42 * stride];
+ const int t22 = c[44 * stride];
+ const int t23 = c[46 * stride];
+ const int t24 = c[48 * stride];
+ const int t25 = c[50 * stride];
+ const int t26 = c[52 * stride];
+ const int t27 = c[54 * stride];
+ const int t28 = c[56 * stride];
+ const int t29 = c[58 * stride];
+ const int t30 = c[60 * stride];
+ const int t31 = c[62 * stride];
+
+ c[ 0 * stride] = CLIP(t0 + t63a);
+ c[ 1 * stride] = CLIP(t1 + t62);
+ c[ 2 * stride] = CLIP(t2 + t61a);
+ c[ 3 * stride] = CLIP(t3 + t60);
+ c[ 4 * stride] = CLIP(t4 + t59a);
+ c[ 5 * stride] = CLIP(t5 + t58);
+ c[ 6 * stride] = CLIP(t6 + t57a);
+ c[ 7 * stride] = CLIP(t7 + t56);
+ c[ 8 * stride] = CLIP(t8 + t55a);
+ c[ 9 * stride] = CLIP(t9 + t54);
+ c[10 * stride] = CLIP(t10 + t53a);
+ c[11 * stride] = CLIP(t11 + t52);
+ c[12 * stride] = CLIP(t12 + t51a);
+ c[13 * stride] = CLIP(t13 + t50);
+ c[14 * stride] = CLIP(t14 + t49a);
+ c[15 * stride] = CLIP(t15 + t48);
+ c[16 * stride] = CLIP(t16 + t47);
+ c[17 * stride] = CLIP(t17 + t46a);
+ c[18 * stride] = CLIP(t18 + t45);
+ c[19 * stride] = CLIP(t19 + t44a);
+ c[20 * stride] = CLIP(t20 + t43);
+ c[21 * stride] = CLIP(t21 + t42a);
+ c[22 * stride] = CLIP(t22 + t41);
+ c[23 * stride] = CLIP(t23 + t40a);
+ c[24 * stride] = CLIP(t24 + t39);
+ c[25 * stride] = CLIP(t25 + t38a);
+ c[26 * stride] = CLIP(t26 + t37);
+ c[27 * stride] = CLIP(t27 + t36a);
+ c[28 * stride] = CLIP(t28 + t35);
+ c[29 * stride] = CLIP(t29 + t34a);
+ c[30 * stride] = CLIP(t30 + t33);
+ c[31 * stride] = CLIP(t31 + t32a);
+ c[32 * stride] = CLIP(t31 - t32a);
+ c[33 * stride] = CLIP(t30 - t33);
+ c[34 * stride] = CLIP(t29 - t34a);
+ c[35 * stride] = CLIP(t28 - t35);
+ c[36 * stride] = CLIP(t27 - t36a);
+ c[37 * stride] = CLIP(t26 - t37);
+ c[38 * stride] = CLIP(t25 - t38a);
+ c[39 * stride] = CLIP(t24 - t39);
+ c[40 * stride] = CLIP(t23 - t40a);
+ c[41 * stride] = CLIP(t22 - t41);
+ c[42 * stride] = CLIP(t21 - t42a);
+ c[43 * stride] = CLIP(t20 - t43);
+ c[44 * stride] = CLIP(t19 - t44a);
+ c[45 * stride] = CLIP(t18 - t45);
+ c[46 * stride] = CLIP(t17 - t46a);
+ c[47 * stride] = CLIP(t16 - t47);
+ c[48 * stride] = CLIP(t15 - t48);
+ c[49 * stride] = CLIP(t14 - t49a);
+ c[50 * stride] = CLIP(t13 - t50);
+ c[51 * stride] = CLIP(t12 - t51a);
+ c[52 * stride] = CLIP(t11 - t52);
+ c[53 * stride] = CLIP(t10 - t53a);
+ c[54 * stride] = CLIP(t9 - t54);
+ c[55 * stride] = CLIP(t8 - t55a);
+ c[56 * stride] = CLIP(t7 - t56);
+ c[57 * stride] = CLIP(t6 - t57a);
+ c[58 * stride] = CLIP(t5 - t58);
+ c[59 * stride] = CLIP(t4 - t59a);
+ c[60 * stride] = CLIP(t3 - t60);
+ c[61 * stride] = CLIP(t2 - t61a);
+ c[62 * stride] = CLIP(t1 - t62);
+ c[63 * stride] = CLIP(t0 - t63a);
+}
+
+static NOINLINE void
+inv_adst4_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+ const int min, const int max,
+ int32_t *const out, const ptrdiff_t out_s)
+{
+ assert(in_s > 0 && out_s != 0);
+ const int in0 = in[0 * in_s], in1 = in[1 * in_s];
+ const int in2 = in[2 * in_s], in3 = in[3 * in_s];
+
+ out[0 * out_s] = (( 1321 * in0 + (3803 - 4096) * in2 +
+ (2482 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+ in2 + in3 + in1;
+ out[1 * out_s] = (((2482 - 4096) * in0 - 1321 * in2 -
+ (3803 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+ in0 - in3 + in1;
+ out[2 * out_s] = (209 * (in0 - in2 + in3) + 128) >> 8;
+ out[3 * out_s] = (((3803 - 4096) * in0 + (2482 - 4096) * in2 -
+ 1321 * in3 - (3344 - 4096) * in1 + 2048) >> 12) +
+ in0 + in2 - in1;
+}
+
+static NOINLINE void
+inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+ const int min, const int max,
+ int32_t *const out, const ptrdiff_t out_s)
+{
+ assert(in_s > 0 && out_s != 0);
+ const int in0 = in[0 * in_s], in1 = in[1 * in_s];
+ const int in2 = in[2 * in_s], in3 = in[3 * in_s];
+ const int in4 = in[4 * in_s], in5 = in[5 * in_s];
+ const int in6 = in[6 * in_s], in7 = in[7 * in_s];
+
+ const int t0a = (((4076 - 4096) * in7 + 401 * in0 + 2048) >> 12) + in7;
+ const int t1a = (( 401 * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0;
+ const int t2a = (((3612 - 4096) * in5 + 1931 * in2 + 2048) >> 12) + in5;
+ const int t3a = (( 1931 * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2;
+ int t4a = ( 1299 * in3 + 1583 * in4 + 1024) >> 11;
+ int t5a = ( 1583 * in3 - 1299 * in4 + 1024) >> 11;
+ int t6a = (( 1189 * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6;
+ int t7a = (((3920 - 4096) * in1 - 1189 * in6 + 2048) >> 12) + in1;
+
+ const int t0 = CLIP(t0a + t4a);
+ const int t1 = CLIP(t1a + t5a);
+ int t2 = CLIP(t2a + t6a);
+ int t3 = CLIP(t3a + t7a);
+ const int t4 = CLIP(t0a - t4a);
+ const int t5 = CLIP(t1a - t5a);
+ int t6 = CLIP(t2a - t6a);
+ int t7 = CLIP(t3a - t7a);
+
+ t4a = (((3784 - 4096) * t4 + 1567 * t5 + 2048) >> 12) + t4;
+ t5a = (( 1567 * t4 - (3784 - 4096) * t5 + 2048) >> 12) - t5;
+ t6a = (((3784 - 4096) * t7 - 1567 * t6 + 2048) >> 12) + t7;
+ t7a = (( 1567 * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6;
+
+ out[0 * out_s] = CLIP(t0 + t2 );
+ out[7 * out_s] = -CLIP(t1 + t3 );
+ t2 = CLIP(t0 - t2 );
+ t3 = CLIP(t1 - t3 );
+ out[1 * out_s] = -CLIP(t4a + t6a);
+ out[6 * out_s] = CLIP(t5a + t7a);
+ t6 = CLIP(t4a - t6a);
+ t7 = CLIP(t5a - t7a);
+
+ out[3 * out_s] = -(((t2 + t3) * 181 + 128) >> 8);
+ out[4 * out_s] = ((t2 - t3) * 181 + 128) >> 8;
+ out[2 * out_s] = ((t6 + t7) * 181 + 128) >> 8;
+ out[5 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
+}
+
+static NOINLINE void
+inv_adst16_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
+ const int min, const int max,
+ int32_t *const out, const ptrdiff_t out_s)
+{
+ assert(in_s > 0 && out_s != 0);
+ const int in0 = in[ 0 * in_s], in1 = in[ 1 * in_s];
+ const int in2 = in[ 2 * in_s], in3 = in[ 3 * in_s];
+ const int in4 = in[ 4 * in_s], in5 = in[ 5 * in_s];
+ const int in6 = in[ 6 * in_s], in7 = in[ 7 * in_s];
+ const int in8 = in[ 8 * in_s], in9 = in[ 9 * in_s];
+ const int in10 = in[10 * in_s], in11 = in[11 * in_s];
+ const int in12 = in[12 * in_s], in13 = in[13 * in_s];
+ const int in14 = in[14 * in_s], in15 = in[15 * in_s];
+
+ int t0 = ((in15 * (4091 - 4096) + in0 * 201 + 2048) >> 12) + in15;
+ int t1 = ((in15 * 201 - in0 * (4091 - 4096) + 2048) >> 12) - in0;
+ int t2 = ((in13 * (3973 - 4096) + in2 * 995 + 2048) >> 12) + in13;
+ int t3 = ((in13 * 995 - in2 * (3973 - 4096) + 2048) >> 12) - in2;
+ int t4 = ((in11 * (3703 - 4096) + in4 * 1751 + 2048) >> 12) + in11;
+ int t5 = ((in11 * 1751 - in4 * (3703 - 4096) + 2048) >> 12) - in4;
+ int t6 = (in9 * 1645 + in6 * 1220 + 1024) >> 11;
+ int t7 = (in9 * 1220 - in6 * 1645 + 1024) >> 11;
+ int t8 = ((in7 * 2751 + in8 * (3035 - 4096) + 2048) >> 12) + in8;
+ int t9 = ((in7 * (3035 - 4096) - in8 * 2751 + 2048) >> 12) + in7;
+ int t10 = ((in5 * 2106 + in10 * (3513 - 4096) + 2048) >> 12) + in10;
+ int t11 = ((in5 * (3513 - 4096) - in10 * 2106 + 2048) >> 12) + in5;
+ int t12 = ((in3 * 1380 + in12 * (3857 - 4096) + 2048) >> 12) + in12;
+ int t13 = ((in3 * (3857 - 4096) - in12 * 1380 + 2048) >> 12) + in3;
+ int t14 = ((in1 * 601 + in14 * (4052 - 4096) + 2048) >> 12) + in14;
+ int t15 = ((in1 * (4052 - 4096) - in14 * 601 + 2048) >> 12) + in1;
+
+ int t0a = CLIP(t0 + t8 );
+ int t1a = CLIP(t1 + t9 );
+ int t2a = CLIP(t2 + t10);
+ int t3a = CLIP(t3 + t11);
+ int t4a = CLIP(t4 + t12);
+ int t5a = CLIP(t5 + t13);
+ int t6a = CLIP(t6 + t14);
+ int t7a = CLIP(t7 + t15);
+ int t8a = CLIP(t0 - t8 );
+ int t9a = CLIP(t1 - t9 );
+ int t10a = CLIP(t2 - t10);
+ int t11a = CLIP(t3 - t11);
+ int t12a = CLIP(t4 - t12);
+ int t13a = CLIP(t5 - t13);
+ int t14a = CLIP(t6 - t14);
+ int t15a = CLIP(t7 - t15);
+
+ t8 = ((t8a * (4017 - 4096) + t9a * 799 + 2048) >> 12) + t8a;
+ t9 = ((t8a * 799 - t9a * (4017 - 4096) + 2048) >> 12) - t9a;
+ t10 = ((t10a * 2276 + t11a * (3406 - 4096) + 2048) >> 12) + t11a;
+ t11 = ((t10a * (3406 - 4096) - t11a * 2276 + 2048) >> 12) + t10a;
+ t12 = ((t13a * (4017 - 4096) - t12a * 799 + 2048) >> 12) + t13a;
+ t13 = ((t13a * 799 + t12a * (4017 - 4096) + 2048) >> 12) + t12a;
+ t14 = ((t15a * 2276 - t14a * (3406 - 4096) + 2048) >> 12) - t14a;
+ t15 = ((t15a * (3406 - 4096) + t14a * 2276 + 2048) >> 12) + t15a;
+
+ t0 = CLIP(t0a + t4a);
+ t1 = CLIP(t1a + t5a);
+ t2 = CLIP(t2a + t6a);
+ t3 = CLIP(t3a + t7a);
+ t4 = CLIP(t0a - t4a);
+ t5 = CLIP(t1a - t5a);
+ t6 = CLIP(t2a - t6a);
+ t7 = CLIP(t3a - t7a);
+ t8a = CLIP(t8 + t12);
+ t9a = CLIP(t9 + t13);
+ t10a = CLIP(t10 + t14);
+ t11a = CLIP(t11 + t15);
+ t12a = CLIP(t8 - t12);
+ t13a = CLIP(t9 - t13);
+ t14a = CLIP(t10 - t14);
+ t15a = CLIP(t11 - t15);
+
+ t4a = ((t4 * (3784 - 4096) + t5 * 1567 + 2048) >> 12) + t4;
+ t5a = ((t4 * 1567 - t5 * (3784 - 4096) + 2048) >> 12) - t5;
+ t6a = ((t7 * (3784 - 4096) - t6 * 1567 + 2048) >> 12) + t7;
+ t7a = ((t7 * 1567 + t6 * (3784 - 4096) + 2048) >> 12) + t6;
+ t12 = ((t12a * (3784 - 4096) + t13a * 1567 + 2048) >> 12) + t12a;
+ t13 = ((t12a * 1567 - t13a * (3784 - 4096) + 2048) >> 12) - t13a;
+ t14 = ((t15a * (3784 - 4096) - t14a * 1567 + 2048) >> 12) + t15a;
+ t15 = ((t15a * 1567 + t14a * (3784 - 4096) + 2048) >> 12) + t14a;
+
+ out[ 0 * out_s] = CLIP(t0 + t2 );
+ out[15 * out_s] = -CLIP(t1 + t3 );
+ t2a = CLIP(t0 - t2 );
+ t3a = CLIP(t1 - t3 );
+ out[ 3 * out_s] = -CLIP(t4a + t6a );
+ out[12 * out_s] = CLIP(t5a + t7a );
+ t6 = CLIP(t4a - t6a );
+ t7 = CLIP(t5a - t7a );
+ out[ 1 * out_s] = -CLIP(t8a + t10a);
+ out[14 * out_s] = CLIP(t9a + t11a);
+ t10 = CLIP(t8a - t10a);
+ t11 = CLIP(t9a - t11a);
+ out[ 2 * out_s] = CLIP(t12 + t14 );
+ out[13 * out_s] = -CLIP(t13 + t15 );
+ t14a = CLIP(t12 - t14 );
+ t15a = CLIP(t13 - t15 );
+
+ out[ 7 * out_s] = -(((t2a + t3a) * 181 + 128) >> 8);
+ out[ 8 * out_s] = ((t2a - t3a) * 181 + 128) >> 8;
+ out[ 4 * out_s] = ((t6 + t7) * 181 + 128) >> 8;
+ out[11 * out_s] = -(((t6 - t7) * 181 + 128) >> 8);
+ out[ 6 * out_s] = ((t10 + t11) * 181 + 128) >> 8;
+ out[ 9 * out_s] = -(((t10 - t11) * 181 + 128) >> 8);
+ out[ 5 * out_s] = -(((t14a + t15a) * 181 + 128) >> 8);
+ out[10 * out_s] = ((t14a - t15a) * 181 + 128) >> 8;
+}
+
+#define inv_adst_1d(sz) \
+void dav1d_inv_adst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+ const int min, const int max) \
+{ \
+ inv_adst##sz##_1d_internal_c(c, stride, min, max, c, stride); \
+} \
+void dav1d_inv_flipadst##sz##_1d_c(int32_t *const c, const ptrdiff_t stride, \
+ const int min, const int max) \
+{ \
+ inv_adst##sz##_1d_internal_c(c, stride, min, max, \
+ &c[(sz - 1) * stride], -stride); \
+}
+
+inv_adst_1d( 4)
+inv_adst_1d( 8)
+inv_adst_1d(16)
+
+#undef inv_adst_1d
+
+void dav1d_inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 4; i++) {
+ const int in = c[stride * i];
+ c[stride * i] = in + ((in * 1697 + 2048) >> 12);
+ }
+}
+
+void dav1d_inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 8; i++)
+ c[stride * i] *= 2;
+}
+
+void dav1d_inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 16; i++) {
+ const int in = c[stride * i];
+ c[stride * i] = 2 * in + ((in * 1697 + 1024) >> 11);
+ }
+}
+
+void dav1d_inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride,
+ const int min, const int max)
+{
+ assert(stride > 0);
+ for (int i = 0; i < 32; i++)
+ c[stride * i] *= 4;
+}
+
+void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
+ assert(stride > 0);
+ const int in0 = c[0 * stride], in1 = c[1 * stride];
+ const int in2 = c[2 * stride], in3 = c[3 * stride];
+
+ const int t0 = in0 + in1;
+ const int t2 = in2 - in3;
+ const int t4 = (t0 - t2) >> 1;
+ const int t3 = t4 - in3;
+ const int t1 = t4 - in1;
+
+ c[0 * stride] = t0 - t3;
+ c[1 * stride] = t3;
+ c[2 * stride] = t1;
+ c[3 * stride] = t2 + t1;
+}
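For reference, the identity transforms above scale by sqrt(2), 2, 2*sqrt(2) and 4 for sizes 4, 8, 16 and 32 respectively; the constant 1697 is the fractional part of sqrt(2) in Q12 fixed point (4096*sqrt(2) ~= 5793 = 4096 + 1697). A minimal standalone C check of the rounding, not taken from the patch itself:

    #include <stdio.h>

    int main(void) {
        const int in = 1000;
        /* identity4: in * (4096 + 1697) / 4096 ~= in * sqrt(2) */
        const int id4  = in + ((in * 1697 + 2048) >> 12);
        /* identity16: in * (4096 + 1697) / 2048 ~= in * 2*sqrt(2) */
        const int id16 = 2 * in + ((in * 1697 + 1024) >> 11);
        printf("%d %d\n", id4, id16);  /* prints 1414 2829 (exact: 1414.2, 2828.4) */
        return 0;
    }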
diff --git a/third_party/dav1d/src/itx_1d.h b/third_party/dav1d/src/itx_1d.h
new file mode 100644
index 0000000000..b63d71b020
--- /dev/null
+++ b/third_party/dav1d/src/itx_1d.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef DAV1D_SRC_ITX_1D_H
+#define DAV1D_SRC_ITX_1D_H
+
+#define decl_itx_1d_fn(name) \
+void (name)(int32_t *c, ptrdiff_t stride, int min, int max)
+typedef decl_itx_1d_fn(*itx_1d_fn);
+
+decl_itx_1d_fn(dav1d_inv_dct4_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct8_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct16_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct32_1d_c);
+decl_itx_1d_fn(dav1d_inv_dct64_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_adst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_adst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_flipadst4_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst8_1d_c);
+decl_itx_1d_fn(dav1d_inv_flipadst16_1d_c);
+
+decl_itx_1d_fn(dav1d_inv_identity4_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity8_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity16_1d_c);
+decl_itx_1d_fn(dav1d_inv_identity32_1d_c);
+
+void dav1d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride);
+
+#endif /* DAV1D_SRC_ITX_1D_H */
diff --git a/third_party/dav1d/src/itx_tmpl.c b/third_party/dav1d/src/itx_tmpl.c
new file mode 100644
index 0000000000..d3859892d8
--- /dev/null
+++ b/third_party/dav1d/src/itx_tmpl.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright © 2018-2019, VideoLAN and dav1d authors
+ * Copyright © 2018-2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/itx.h"
+#include "src/itx_1d.h"
+
+static NOINLINE void
+inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
+ const int eob, const int w, const int h, const int shift,
+ const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
+ const int has_dconly HIGHBD_DECL_SUFFIX)
+{
+ assert(w >= 4 && w <= 64);
+ assert(h >= 4 && h <= 64);
+ assert(eob >= 0);
+
+ const int is_rect2 = w * 2 == h || h * 2 == w;
+ const int rnd = (1 << shift) >> 1;
+
+ if (eob < has_dconly) {
+ int dc = coeff[0];
+ coeff[0] = 0;
+ if (is_rect2)
+ dc = (dc * 181 + 128) >> 8;
+ dc = (dc * 181 + 128) >> 8;
+ dc = (dc + rnd) >> shift;
+ dc = (dc * 181 + 128 + 2048) >> 12;
+ for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel(dst[x] + dc);
+ return;
+ }
+
+ const int sh = imin(h, 32), sw = imin(w, 32);
+#if BITDEPTH == 8
+ const int row_clip_min = INT16_MIN;
+ const int col_clip_min = INT16_MIN;
+#else
+ const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
+ const int col_clip_min = (int) ((unsigned) ~bitdepth_max << 5);
+#endif
+ const int row_clip_max = ~row_clip_min;
+ const int col_clip_max = ~col_clip_min;
+
+ int32_t tmp[64 * 64], *c = tmp;
+ for (int y = 0; y < sh; y++, c += w) {
+ if (is_rect2)
+ for (int x = 0; x < sw; x++)
+ c[x] = (coeff[y + x * sh] * 181 + 128) >> 8;
+ else
+ for (int x = 0; x < sw; x++)
+ c[x] = coeff[y + x * sh];
+ first_1d_fn(c, 1, row_clip_min, row_clip_max);
+ }
+
+ memset(coeff, 0, sizeof(*coeff) * sw * sh);
+ for (int i = 0; i < w * sh; i++)
+ tmp[i] = iclip((tmp[i] + rnd) >> shift, col_clip_min, col_clip_max);
+
+ for (int x = 0; x < w; x++)
+ second_1d_fn(&tmp[x], w, col_clip_min, col_clip_max);
+
+ c = tmp;
+ for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel(dst[x] + ((*c++ + 8) >> 4));
+}
+
+#define inv_txfm_fn(type1, type2, w, h, shift, has_dconly) \
+static void \
+inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
+ const ptrdiff_t stride, \
+ coef *const coeff, \
+ const int eob \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift, \
+ dav1d_inv_##type1##w##_1d_c, dav1d_inv_##type2##h##_1d_c, \
+ has_dconly HIGHBD_TAIL_SUFFIX); \
+}
+
+#define inv_txfm_fn64(w, h, shift) \
+inv_txfm_fn(dct, dct, w, h, shift, 1)
+
+#define inv_txfm_fn32(w, h, shift) \
+inv_txfm_fn64(w, h, shift) \
+inv_txfm_fn(identity, identity, w, h, shift, 0)
+
+#define inv_txfm_fn16(w, h, shift) \
+inv_txfm_fn32(w, h, shift) \
+inv_txfm_fn(adst, dct, w, h, shift, 0) \
+inv_txfm_fn(dct, adst, w, h, shift, 0) \
+inv_txfm_fn(adst, adst, w, h, shift, 0) \
+inv_txfm_fn(dct, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, dct, w, h, shift, 0) \
+inv_txfm_fn(adst, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, adst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift, 0) \
+inv_txfm_fn(identity, dct, w, h, shift, 0) \
+inv_txfm_fn(dct, identity, w, h, shift, 0) \
+
+#define inv_txfm_fn84(w, h, shift) \
+inv_txfm_fn16(w, h, shift) \
+inv_txfm_fn(identity, flipadst, w, h, shift, 0) \
+inv_txfm_fn(flipadst, identity, w, h, shift, 0) \
+inv_txfm_fn(identity, adst, w, h, shift, 0) \
+inv_txfm_fn(adst, identity, w, h, shift, 0) \
+
+inv_txfm_fn84( 4, 4, 0)
+inv_txfm_fn84( 4, 8, 0)
+inv_txfm_fn84( 4, 16, 1)
+inv_txfm_fn84( 8, 4, 0)
+inv_txfm_fn84( 8, 8, 1)
+inv_txfm_fn84( 8, 16, 1)
+inv_txfm_fn32( 8, 32, 2)
+inv_txfm_fn84(16, 4, 1)
+inv_txfm_fn84(16, 8, 1)
+inv_txfm_fn16(16, 16, 2)
+inv_txfm_fn32(16, 32, 1)
+inv_txfm_fn64(16, 64, 2)
+inv_txfm_fn32(32, 8, 2)
+inv_txfm_fn32(32, 16, 1)
+inv_txfm_fn32(32, 32, 2)
+inv_txfm_fn64(32, 64, 1)
+inv_txfm_fn64(64, 16, 2)
+inv_txfm_fn64(64, 32, 1)
+inv_txfm_fn64(64, 64, 2)
+
+static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+ coef *const coeff, const int eob
+ HIGHBD_DECL_SUFFIX)
+{
+ int32_t tmp[4 * 4], *c = tmp;
+ for (int y = 0; y < 4; y++, c += 4) {
+ for (int x = 0; x < 4; x++)
+ c[x] = coeff[y + x * 4] >> 2;
+ dav1d_inv_wht4_1d_c(c, 1);
+ }
+ memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+
+ for (int x = 0; x < 4; x++)
+ dav1d_inv_wht4_1d_c(&tmp[x], 4);
+
+ c = tmp;
+ for (int y = 0; y < 4; y++, dst += PXSTRIDE(stride))
+ for (int x = 0; x < 4; x++)
+ dst[x] = iclip_pixel(dst[x] + *c++);
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/itx.h"
+#elif ARCH_X86
+#include "src/x86/itx.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
+#define assign_itx_all_fn64(w, h, pfx) \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
+ inv_txfm_add_dct_dct_##w##x##h##_c
+
+#define assign_itx_all_fn32(w, h, pfx) \
+ assign_itx_all_fn64(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
+ inv_txfm_add_identity_identity_##w##x##h##_c
+
+#define assign_itx_all_fn16(w, h, pfx) \
+ assign_itx_all_fn32(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
+ inv_txfm_add_adst_dct_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
+ inv_txfm_add_dct_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
+ inv_txfm_add_adst_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
+ inv_txfm_add_flipadst_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
+ inv_txfm_add_adst_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
+ inv_txfm_add_flipadst_dct_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
+ inv_txfm_add_dct_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
+ inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
+ inv_txfm_add_dct_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
+ inv_txfm_add_identity_dct_##w##x##h##_c
+
+#define assign_itx_all_fn84(w, h, pfx) \
+ assign_itx_all_fn16(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
+ inv_txfm_add_flipadst_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
+ inv_txfm_add_identity_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
+ inv_txfm_add_adst_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
+ inv_txfm_add_identity_adst_##w##x##h##_c; \
+
+ c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+ assign_itx_all_fn84( 4, 4, );
+ assign_itx_all_fn84( 4, 8, R);
+ assign_itx_all_fn84( 4, 16, R);
+ assign_itx_all_fn84( 8, 4, R);
+ assign_itx_all_fn84( 8, 8, );
+ assign_itx_all_fn84( 8, 16, R);
+ assign_itx_all_fn32( 8, 32, R);
+ assign_itx_all_fn84(16, 4, R);
+ assign_itx_all_fn84(16, 8, R);
+ assign_itx_all_fn16(16, 16, );
+ assign_itx_all_fn32(16, 32, R);
+ assign_itx_all_fn64(16, 64, R);
+ assign_itx_all_fn32(32, 8, R);
+ assign_itx_all_fn32(32, 16, R);
+ assign_itx_all_fn32(32, 32, );
+ assign_itx_all_fn64(32, 64, R);
+ assign_itx_all_fn64(64, 16, R);
+ assign_itx_all_fn64(64, 32, R);
+ assign_itx_all_fn64(64, 64, );
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ itx_dsp_init_arm(c, bpc);
+#endif
+#if ARCH_X86
+ itx_dsp_init_x86(c, bpc);
+#endif
+#endif
+}
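The per-size wrappers generated above all funnel into inv_txfm_add_c(), which runs the first 1-D transform along each row of the (at most 32x32) retained coefficients, rounds and clips the intermediate values, runs the second 1-D transform down each column, and adds the result, scaled down by 4 bits, to the destination block. The following standalone sketch shows that row/column structure under simplifying assumptions (8-bit pixels, a square transform, no rectangular scaling, no DC-only fast path); itx_add_sketch() and clip8() are illustrative names, not dav1d API:

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*itx_1d_fn)(int32_t *c, ptrdiff_t stride, int min, int max);

    static uint8_t clip8(const int v) {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
    }

    void itx_add_sketch(uint8_t *dst, const ptrdiff_t stride, const int16_t *coeff,
                        const int n, const int shift,
                        const itx_1d_fn row_fn, const itx_1d_fn col_fn)
    {
        int32_t tmp[64 * 64];
        const int rnd = (1 << shift) >> 1;

        /* first pass: 1-D transform along each row (coefficients are stored
         * column-major, hence the y + x * n indexing) */
        for (int y = 0; y < n; y++) {
            for (int x = 0; x < n; x++)
                tmp[y * n + x] = coeff[y + x * n];
            row_fn(&tmp[y * n], 1, INT16_MIN, INT16_MAX);
        }

        /* intermediate rounding; the real code also clips to a
         * bitdepth-dependent range here */
        for (int i = 0; i < n * n; i++)
            tmp[i] = (tmp[i] + rnd) >> shift;

        /* second pass: 1-D transform down each column */
        for (int x = 0; x < n; x++)
            col_fn(&tmp[x], n, INT16_MIN, INT16_MAX);

        /* final 4-bit downshift and add to the prediction */
        for (int y = 0; y < n; y++, dst += stride)
            for (int x = 0; x < n; x++)
                dst[x] = clip8(dst[x] + ((tmp[y * n + x] + 8) >> 4));
    }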
diff --git a/third_party/dav1d/src/levels.h b/third_party/dav1d/src/levels.h
new file mode 100644
index 0000000000..0f510e9f30
--- /dev/null
+++ b/third_party/dav1d/src/levels.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LEVELS_H
+#define DAV1D_SRC_LEVELS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+#include "common/attributes.h"
+
+enum ObuMetaType {
+ OBU_META_HDR_CLL = 1,
+ OBU_META_HDR_MDCV = 2,
+ OBU_META_SCALABILITY = 3,
+ OBU_META_ITUT_T35 = 4,
+ OBU_META_TIMECODE = 5,
+};
+
+enum TxfmSize {
+ TX_4X4,
+ TX_8X8,
+ TX_16X16,
+ TX_32X32,
+ TX_64X64,
+ N_TX_SIZES,
+};
+
+enum BlockLevel {
+ BL_128X128,
+ BL_64X64,
+ BL_32X32,
+ BL_16X16,
+ BL_8X8,
+ N_BL_LEVELS,
+};
+
+enum RectTxfmSize {
+ RTX_4X8 = N_TX_SIZES,
+ RTX_8X4,
+ RTX_8X16,
+ RTX_16X8,
+ RTX_16X32,
+ RTX_32X16,
+ RTX_32X64,
+ RTX_64X32,
+ RTX_4X16,
+ RTX_16X4,
+ RTX_8X32,
+ RTX_32X8,
+ RTX_16X64,
+ RTX_64X16,
+ N_RECT_TX_SIZES
+};
+
+enum TxfmType {
+ DCT_DCT, // DCT in both horizontal and vertical
+ ADST_DCT, // ADST in vertical, DCT in horizontal
+ DCT_ADST, // DCT in vertical, ADST in horizontal
+ ADST_ADST, // ADST in both directions
+ FLIPADST_DCT,
+ DCT_FLIPADST,
+ FLIPADST_FLIPADST,
+ ADST_FLIPADST,
+ FLIPADST_ADST,
+ IDTX,
+ V_DCT,
+ H_DCT,
+ V_ADST,
+ H_ADST,
+ V_FLIPADST,
+ H_FLIPADST,
+ N_TX_TYPES,
+ WHT_WHT = N_TX_TYPES,
+ N_TX_TYPES_PLUS_LL,
+};
+
+enum TxClass {
+ TX_CLASS_2D,
+ TX_CLASS_H,
+ TX_CLASS_V,
+};
+
+enum IntraPredMode {
+ DC_PRED,
+ VERT_PRED,
+ HOR_PRED,
+ DIAG_DOWN_LEFT_PRED,
+ DIAG_DOWN_RIGHT_PRED,
+ VERT_RIGHT_PRED,
+ HOR_DOWN_PRED,
+ HOR_UP_PRED,
+ VERT_LEFT_PRED,
+ SMOOTH_PRED,
+ SMOOTH_V_PRED,
+ SMOOTH_H_PRED,
+ PAETH_PRED,
+ N_INTRA_PRED_MODES,
+ CFL_PRED = N_INTRA_PRED_MODES,
+ N_UV_INTRA_PRED_MODES,
+ N_IMPL_INTRA_PRED_MODES = N_UV_INTRA_PRED_MODES,
+ LEFT_DC_PRED = DIAG_DOWN_LEFT_PRED,
+ TOP_DC_PRED,
+ DC_128_PRED,
+ Z1_PRED,
+ Z2_PRED,
+ Z3_PRED,
+ FILTER_PRED = N_INTRA_PRED_MODES,
+};
+
+enum InterIntraPredMode {
+ II_DC_PRED,
+ II_VERT_PRED,
+ II_HOR_PRED,
+ II_SMOOTH_PRED,
+ N_INTER_INTRA_PRED_MODES,
+};
+
+enum BlockPartition {
+ PARTITION_NONE, // [ ] <-.
+ PARTITION_H, // [-] |
+ PARTITION_V, // [|] |
+ PARTITION_SPLIT, // [+] --'
+ PARTITION_T_TOP_SPLIT, // [⊥] i.e. split top, H bottom
+ PARTITION_T_BOTTOM_SPLIT, // [т] i.e. H top, split bottom
+ PARTITION_T_LEFT_SPLIT, // [-|] i.e. split left, V right
+ PARTITION_T_RIGHT_SPLIT, // [|-] i.e. V left, split right
+ PARTITION_H4, // [Ⲷ]
+ PARTITION_V4, // [Ⲽ]
+ N_PARTITIONS,
+ N_SUB8X8_PARTITIONS = PARTITION_T_TOP_SPLIT,
+};
+
+enum BlockSize {
+ BS_128x128,
+ BS_128x64,
+ BS_64x128,
+ BS_64x64,
+ BS_64x32,
+ BS_64x16,
+ BS_32x64,
+ BS_32x32,
+ BS_32x16,
+ BS_32x8,
+ BS_16x64,
+ BS_16x32,
+ BS_16x16,
+ BS_16x8,
+ BS_16x4,
+ BS_8x32,
+ BS_8x16,
+ BS_8x8,
+ BS_8x4,
+ BS_4x16,
+ BS_4x8,
+ BS_4x4,
+ N_BS_SIZES,
+};
+
+enum Filter2d { // order is horizontal, vertical
+ FILTER_2D_8TAP_REGULAR,
+ FILTER_2D_8TAP_REGULAR_SMOOTH,
+ FILTER_2D_8TAP_REGULAR_SHARP,
+ FILTER_2D_8TAP_SHARP_REGULAR,
+ FILTER_2D_8TAP_SHARP_SMOOTH,
+ FILTER_2D_8TAP_SHARP,
+ FILTER_2D_8TAP_SMOOTH_REGULAR,
+ FILTER_2D_8TAP_SMOOTH,
+ FILTER_2D_8TAP_SMOOTH_SHARP,
+ FILTER_2D_BILINEAR,
+ N_2D_FILTERS,
+};
+
+enum MVJoint {
+ MV_JOINT_ZERO,
+ MV_JOINT_H,
+ MV_JOINT_V,
+ MV_JOINT_HV,
+ N_MV_JOINTS,
+};
+
+enum InterPredMode {
+ NEARESTMV,
+ NEARMV,
+ GLOBALMV,
+ NEWMV,
+ N_INTER_PRED_MODES,
+};
+
+enum DRL_PROXIMITY {
+ NEAREST_DRL,
+ NEARER_DRL,
+ NEAR_DRL,
+ NEARISH_DRL
+};
+
+enum CompInterPredMode {
+ NEARESTMV_NEARESTMV,
+ NEARMV_NEARMV,
+ NEARESTMV_NEWMV,
+ NEWMV_NEARESTMV,
+ NEARMV_NEWMV,
+ NEWMV_NEARMV,
+ GLOBALMV_GLOBALMV,
+ NEWMV_NEWMV,
+ N_COMP_INTER_PRED_MODES,
+};
+
+enum CompInterType {
+ COMP_INTER_NONE,
+ COMP_INTER_WEIGHTED_AVG,
+ COMP_INTER_AVG,
+ COMP_INTER_SEG,
+ COMP_INTER_WEDGE,
+};
+
+enum InterIntraType {
+ INTER_INTRA_NONE,
+ INTER_INTRA_BLEND,
+ INTER_INTRA_WEDGE,
+};
+
+typedef union mv {
+ struct {
+ int16_t y, x;
+ };
+ uint32_t n;
+} mv;
+
+enum MotionMode {
+ MM_TRANSLATION,
+ MM_OBMC,
+ MM_WARP,
+};
+
+#define QINDEX_RANGE 256
+
+typedef struct Av1Block {
+ uint8_t bl, bs, bp;
+ uint8_t intra, seg_id, skip_mode, skip, uvtx;
+ union {
+ struct {
+ uint8_t y_mode, uv_mode, tx, pal_sz[2];
+ int8_t y_angle, uv_angle, cfl_alpha[2];
+ }; // intra
+ struct {
+ union {
+ struct {
+ union mv mv[2];
+ uint8_t wedge_idx, mask_sign, interintra_mode;
+ };
+ struct {
+ union mv mv2d;
+ int16_t matrix[4];
+ };
+ };
+ uint8_t comp_type, inter_mode, motion_mode, drl_idx;
+ int8_t ref[2];
+ uint8_t max_ytx, filter2d, interintra_type, tx_split0;
+ uint16_t tx_split1;
+ }; // inter
+ };
+} Av1Block;
+
+#endif /* DAV1D_SRC_LEVELS_H */
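The mv union above overlays the two int16_t components with a single uint32_t view, so a motion vector can be copied, compared or zero-tested in one 32-bit operation. A small standalone illustration (the mv type is reproduced from the header; main() exists only for demonstration):

    #include <stdint.h>
    #include <stdio.h>

    typedef union mv {
        struct {
            int16_t y, x;
        };
        uint32_t n;
    } mv;

    int main(void) {
        const mv a = { .y = -3, .x = 7 }, b = { .y = -3, .x = 7 };
        /* one 32-bit compare instead of two 16-bit ones */
        printf("equal=%d zero=%d\n", a.n == b.n, a.n == 0);
        return 0;
    }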
diff --git a/third_party/dav1d/src/lf_apply.h b/third_party/dav1d/src/lf_apply.h
new file mode 100644
index 0000000000..cf4c898550
--- /dev/null
+++ b/third_party/dav1d/src/lf_apply.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LF_APPLY_H
+#define DAV1D_SRC_LF_APPLY_H
+
+#include <stdint.h>
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+#include "src/levels.h"
+
+void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *f,
+ pixel *const p[3], Av1Filter *lflvl,
+ int sby, int start_of_tile_row);
+void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *f,
+ pixel *const p[3], Av1Filter *lflvl,
+ int sby);
+
+void bytefn(dav1d_copy_lpf)(Dav1dFrameContext *const f,
+ /*const*/ pixel *const src[3], int sby);
+
+#endif /* DAV1D_SRC_LF_APPLY_H */
diff --git a/third_party/dav1d/src/lf_apply_tmpl.c b/third_party/dav1d/src/lf_apply_tmpl.c
new file mode 100644
index 0000000000..4ef3becd82
--- /dev/null
+++ b/third_party/dav1d/src/lf_apply_tmpl.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+
+// The loop filter buffer stores 12 rows of pixels. A superblock will
+// contain at most 2 stripes. Each stripe requires 4 rows of pixels (2 above
+// and 2 below); the final 4 rows are used to swap the bottom of the last
+// stripe with the top of the next superblock row.
+static void backup_lpf(const Dav1dFrameContext *const f,
+ pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int ss_ver, const int sb128,
+ int row, const int row_h, const int src_w,
+ const int h, const int ss_hor, const int lr_backup)
+{
+ const int cdef_backup = !lr_backup;
+ const int dst_w = f->frame_hdr->super_res.enabled ?
+ (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
+
+ // The first stripe of the frame is shorter by 8 luma pixel rows.
+ int stripe_h = ((64 << (cdef_backup & sb128)) - 8 * !row) >> ss_ver;
+ src += (stripe_h - 2) * PXSTRIDE(src_stride);
+
+ if (f->c->n_tc == 1) {
+ if (row) {
+ const int top = 4 << sb128;
+ // Copy the top part of the stored loop filtered pixels from the
+ // previous sb row needed above the first stripe of this sb row.
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
+ &dst[PXSTRIDE(dst_stride) * top], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
+ &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
+ &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
+ &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
+ }
+ dst += 4 * PXSTRIDE(dst_stride);
+ }
+
+ if (lr_backup && (f->frame_hdr->width[0] != f->frame_hdr->width[1])) {
+ while (row + stripe_h <= row_h) {
+ const int n_lines = 4 - (row + stripe_h + 1 == h);
+ f->dsp->mc.resize(dst, dst_stride, src, src_stride,
+ dst_w, n_lines, src_w, f->resize_step[ss_hor],
+ f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
+ row += stripe_h; // unmodified stripe_h for the 1st stripe
+ stripe_h = 64 >> ss_ver;
+ src += stripe_h * PXSTRIDE(src_stride);
+ dst += n_lines * PXSTRIDE(dst_stride);
+ if (n_lines == 3) {
+ pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
+ dst += PXSTRIDE(dst_stride);
+ }
+ }
+ } else {
+ while (row + stripe_h <= row_h) {
+ const int n_lines = 4 - (row + stripe_h + 1 == h);
+ for (int i = 0; i < 4; i++) {
+ pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
+ src, src_w);
+ dst += PXSTRIDE(dst_stride);
+ src += PXSTRIDE(src_stride);
+ }
+ row += stripe_h; // unmodified stripe_h for the 1st stripe
+ stripe_h = 64 >> ss_ver;
+ src += (stripe_h - 4) * PXSTRIDE(src_stride);
+ }
+ }
+}
+
+void bytefn(dav1d_copy_lpf)(Dav1dFrameContext *const f,
+ /*const*/ pixel *const src[3], const int sby)
+{
+ const int have_tt = f->c->n_tc > 1;
+ const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const int offset = 8 * !!sby;
+ const ptrdiff_t *const src_stride = f->cur.stride;
+ const ptrdiff_t *const lr_stride = f->sr_cur.p.stride;
+ const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128);
+ pixel *const dst[3] = {
+ f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]),
+ f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]),
+ f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1])
+ };
+
+ // TODO Also check block level restore type to reduce copying.
+ const int restore_planes = f->lf.restore_planes;
+
+ if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) {
+ const int h = f->cur.p.h;
+ const int w = f->bw << 2;
+ const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
+ const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
+ if (restore_planes & LR_RESTORE_Y || !resize)
+ backup_lpf(f, dst[0], lr_stride[0],
+ src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+ 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1);
+ if (have_tt && resize) {
+ const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]);
+ backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0],
+ src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+ 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0);
+ }
+ }
+ if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) &&
+ f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400)
+ {
+ const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h = (f->cur.p.h + ss_ver) >> ss_ver;
+ const int w = f->bw << (2 - ss_hor);
+ const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
+ const int offset_uv = offset >> ss_ver;
+ const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
+ const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]);
+ if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) {
+ if (restore_planes & LR_RESTORE_U || !resize)
+ backup_lpf(f, dst[1], lr_stride[1],
+ src[1] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 1);
+ if (have_tt && resize)
+ backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1],
+ src[1] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 0);
+ }
+ if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) {
+ if (restore_planes & LR_RESTORE_V || !resize)
+ backup_lpf(f, dst[2], lr_stride[1],
+ src[2] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 1);
+ if (have_tt && resize)
+ backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1],
+ src[2] - offset_uv * PXSTRIDE(src_stride[1]),
+ src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe,
+ row_h, w, h, ss_hor, 0);
+ }
+ }
+}
+
+static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
+ const int have_left,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[3][2],
+ pixel *dst, const ptrdiff_t ls,
+ const int w,
+ const int starty4, const int endy4)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+
+ // filter edges between columns (e.g. block1 | block2)
+ for (int x = 0; x < w; x++) {
+ if (!have_left && !x) continue;
+ uint32_t hmask[4];
+ if (!starty4) {
+ hmask[0] = mask[x][0][0];
+ hmask[1] = mask[x][1][0];
+ hmask[2] = mask[x][2][0];
+ if (endy4 > 16) {
+ hmask[0] |= (unsigned) mask[x][0][1] << 16;
+ hmask[1] |= (unsigned) mask[x][1][1] << 16;
+ hmask[2] |= (unsigned) mask[x][2][1] << 16;
+ }
+ } else {
+ hmask[0] = mask[x][0][1];
+ hmask[1] = mask[x][1][1];
+ hmask[2] = mask[x][2][1];
+ }
+ hmask[3] = 0;
+ dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
+ (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+ }
+}
+
+static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
+ const int have_top,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[3][2],
+ pixel *dst, const ptrdiff_t ls,
+ const int w,
+ const int starty4, const int endy4)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+
+ // block1
+ // filter edges between rows (e.g. ------)
+ // block2
+ for (int y = starty4; y < endy4;
+ y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
+ {
+ if (!have_top && !y) continue;
+ const uint32_t vmask[4] = {
+ mask[y][0][0] | ((unsigned) mask[y][0][1] << 16),
+ mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
+ mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
+ 0,
+ };
+ dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
+ (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+ &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+ }
+}
+
+static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
+ const int have_left,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[2][2],
+ pixel *const u, pixel *const v,
+ const ptrdiff_t ls, const int w,
+ const int starty4, const int endy4,
+ const int ss_ver)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+
+ // filter edges between columns (e.g. block1 | block2)
+ for (int x = 0; x < w; x++) {
+ if (!have_left && !x) continue;
+ uint32_t hmask[3];
+ if (!starty4) {
+ hmask[0] = mask[x][0][0];
+ hmask[1] = mask[x][1][0];
+ if (endy4 > (16 >> ss_ver)) {
+ hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver);
+ hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver);
+ }
+ } else {
+ hmask[0] = mask[x][0][1];
+ hmask[1] = mask[x][1][1];
+ }
+ hmask[2] = 0;
+ dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
+ (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+ dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
+ (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
+ &f->lf.lim_lut, endy4 - starty4 HIGHBD_CALL_SUFFIX);
+ }
+}
+
+static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
+ const int have_top,
+ const uint8_t (*lvl)[4],
+ const ptrdiff_t b4_stride,
+ const uint16_t (*const mask)[2][2],
+ pixel *const u, pixel *const v,
+ const ptrdiff_t ls, const int w,
+ const int starty4, const int endy4,
+ const int ss_hor)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+ ptrdiff_t off_l = 0;
+
+ // block1
+ // filter edges between rows (e.g. ------)
+ // block2
+ for (int y = starty4; y < endy4;
+ y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
+ {
+ if (!have_top && !y) continue;
+ const uint32_t vmask[3] = {
+ mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)),
+ mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
+ 0,
+ };
+ dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
+ (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+ &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+ dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
+ (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+ &f->lf.lim_lut, w HIGHBD_CALL_SUFFIX);
+ }
+}
+
+void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *const f,
+ pixel *const p[3], Av1Filter *const lflvl,
+ int sby, const int start_of_tile_row)
+{
+ int x, have_left;
+ // Don't filter outside the frame
+ const int is_sb64 = !f->seq_hdr->sb128;
+ const int starty4 = (sby & is_sb64) << 4;
+ const int sbsz = 32 >> is_sb64;
+ const int sbl2 = 5 - is_sb64;
+ const int halign = (f->bh + 31) & ~31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+ const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
+ const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
+ const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
+
+ // fix lpf strength at tile col boundaries
+ const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
+ const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
+ for (int tile_col = 1;; tile_col++) {
+ x = f->frame_hdr->tiling.col_start_sb[tile_col];
+ if ((x << sbl2) >= f->bw) break;
+ const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
+ x >>= is_sb64;
+
+ uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
+ for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
+ !!(y_hmask[1][sidx] & smask);
+ y_hmask[2][sidx] &= ~smask;
+ y_hmask[1][sidx] &= ~smask;
+ y_hmask[0][sidx] &= ~smask;
+ y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
+ }
+
+ if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+ uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
+ for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+ y++, uv_mask <<= 1)
+ {
+ const int sidx = uv_mask >= vmax;
+ const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
+ const int idx = !!(uv_hmask[1][sidx] & smask);
+ uv_hmask[1][sidx] &= ~smask;
+ uv_hmask[0][sidx] &= ~smask;
+ uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
+ }
+ }
+ lpf_y += halign;
+ lpf_uv += halign >> ss_ver;
+ }
+
+ // fix lpf strength at tile row boundaries
+ if (start_of_tile_row) {
+ const BlockContext *a;
+ for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
+ x < f->sb128w; x++, a++)
+ {
+ uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
+ const unsigned w = imin(32, f->w4 - (x << 5));
+ for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
+ !!(y_vmask[1][sidx] & smask);
+ y_vmask[2][sidx] &= ~smask;
+ y_vmask[1][sidx] &= ~smask;
+ y_vmask[0][sidx] &= ~smask;
+ y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
+ }
+
+ if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+ const unsigned cw = (w + ss_hor) >> ss_hor;
+ uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+ for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
+ const int sidx = uv_mask >= hmax;
+ const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
+ const int idx = !!(uv_vmask[1][sidx] & smask);
+ uv_vmask[1][sidx] &= ~smask;
+ uv_vmask[0][sidx] &= ~smask;
+ uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
+ }
+ }
+ }
+ }
+
+ pixel *ptr;
+ uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+ for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
+ x++, have_left = 1, ptr += 128, level_ptr += 32)
+ {
+ filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
+ lflvl[x].filter_y[0], ptr, f->cur.stride[0],
+ imin(32, f->w4 - x * 32), starty4, endy4);
+ }
+
+ if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
+ return;
+
+ ptrdiff_t uv_off;
+ level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+ for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
+ x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+ {
+ filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
+ lflvl[x].filter_uv[0],
+ &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+ (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+ starty4 >> ss_ver, uv_endy4, ss_ver);
+ }
+}
+
+void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *const f,
+ pixel *const p[3], Av1Filter *const lflvl,
+ int sby)
+{
+ int x;
+ // Don't filter outside the frame
+ const int have_top = sby > 0;
+ const int is_sb64 = !f->seq_hdr->sb128;
+ const int starty4 = (sby & is_sb64) << 4;
+ const int sbsz = 32 >> is_sb64;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
+ const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
+
+ pixel *ptr;
+ uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+ for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
+ filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
+ lflvl[x].filter_y[1], ptr, f->cur.stride[0],
+ imin(32, f->w4 - x * 32), starty4, endy4);
+ }
+
+ if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v)
+ return;
+
+ ptrdiff_t uv_off;
+ level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+ for (uv_off = 0, x = 0; x < f->sb128w;
+ x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+ {
+ filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
+ lflvl[x].filter_uv[1],
+ &p[1][uv_off], &p[2][uv_off], f->cur.stride[1],
+ (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
+ starty4 >> ss_ver, uv_endy4, ss_hor);
+ }
+}
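In the column/row filter helpers above, each edge-strength mask is stored as two uint16_t halves (the trailing [2] dimension of the filter_y/filter_uv arrays), one per 16 4-pixel units, because a 128-pixel superblock spans 32 such units; when a full 128-pixel column is filtered the halves are recombined into one 32-bit mask, as in filter_plane_cols_y(). A trivial standalone illustration of that recombination (the values are arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* lower half: rows 4-7 set; upper half: rows 16 and 31 set */
        const uint16_t half[2] = { 0x00f0, 0x8001 };
        const uint32_t mask = half[0] | ((uint32_t) half[1] << 16);
        printf("0x%08x\n", mask);   /* prints 0x800100f0 */
        return 0;
    }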
diff --git a/third_party/dav1d/src/lf_mask.c b/third_party/dav1d/src/lf_mask.c
new file mode 100644
index 0000000000..91fe4a02c8
--- /dev/null
+++ b/third_party/dav1d/src/lf_mask.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ctx.h"
+#include "src/levels.h"
+#include "src/lf_mask.h"
+#include "src/tables.h"
+
+static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /* x */],
+ const enum RectTxfmSize from,
+ const int depth,
+ const int y_off, const int x_off,
+ const uint16_t *const tx_masks)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
+ const int is_split = (from == (int) TX_4X4 || depth > 1) ? 0 :
+ (tx_masks[depth] >> (y_off * 4 + x_off)) & 1;
+
+ if (is_split) {
+ const enum RectTxfmSize sub = t_dim->sub;
+ const int htw4 = t_dim->w >> 1, hth4 = t_dim->h >> 1;
+
+ decomp_tx(txa, sub, depth + 1, y_off * 2 + 0, x_off * 2 + 0, tx_masks);
+ if (t_dim->w >= t_dim->h)
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][0][htw4],
+ sub, depth + 1, y_off * 2 + 0, x_off * 2 + 1, tx_masks);
+ if (t_dim->h >= t_dim->w) {
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][0],
+ sub, depth + 1, y_off * 2 + 1, x_off * 2 + 0, tx_masks);
+ if (t_dim->w >= t_dim->h)
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][htw4],
+ sub, depth + 1, y_off * 2 + 1, x_off * 2 + 1, tx_masks);
+ }
+ } else {
+ const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < t_dim->h; y++) { \
+ rep_macro(type, txa[0][0][y], off, mul * lw); \
+ rep_macro(type, txa[1][0][y], off, mul * lh); \
+ txa[0][1][y][0] = t_dim->w; \
+ }
+ case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
+ case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+ }
+}
+
+static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
+ const int by4, const int bx4,
+ const int w4, const int h4, const int skip,
+ const enum RectTxfmSize max_tx,
+ const uint16_t *const tx_masks,
+ uint8_t *const a, uint8_t *const l)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx];
+ int y, x;
+
+ ALIGN_STK_16(uint8_t, txa, 2 /* edge */, [2 /* txsz, step */][32 /* y */][32 /* x */]);
+ for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++)
+ for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++)
+ decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x],
+ max_tx, 0, y_off, x_off, tx_masks);
+
+ // left block edge
+ unsigned mask = 1U << by4;
+ for (y = 0; y < h4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[0][bx4][imin(txa[0][0][y][0], l[y])][sidx] |= smask;
+ }
+
+ // top block edge
+ for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[1][by4][imin(txa[1][0][0][x], a[x])][sidx] |= smask;
+ }
+
+ if (!skip) {
+ // inner (tx) left|right edges
+ for (y = 0, mask = 1U << by4; y < h4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ int ltx = txa[0][0][y][0];
+ int step = txa[0][1][y][0];
+ for (x = step; x < w4; x += step) {
+ const int rtx = txa[0][0][y][x];
+ masks[0][bx4 + x][imin(rtx, ltx)][sidx] |= smask;
+ ltx = rtx;
+ step = txa[0][1][y][x];
+ }
+ }
+
+ // top
+ // inner (tx) --- edges
+ // bottom
+ for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+ const int sidx = mask >= 0x10000U;
+ const unsigned smask = mask >> (sidx << 4);
+ int ttx = txa[1][0][0][x];
+ int step = txa[1][1][0][x];
+ for (y = step; y < h4; y += step) {
+ const int btx = txa[1][0][y][x];
+ masks[1][by4 + y][imin(ttx, btx)][sidx] |= smask;
+ ttx = btx;
+ step = txa[1][1][y][x];
+ }
+ }
+ }
+
+ for (y = 0; y < h4; y++)
+ l[y] = txa[0][0][y][w4 - 1];
+ memcpy(a, txa[1][0][h4 - 1], w4);
+}
+
+static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
+ const int by4, const int bx4,
+ const int w4, const int h4,
+ const enum RectTxfmSize tx,
+ uint8_t *const a, uint8_t *const l)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+ const int twl4 = t_dim->lw, thl4 = t_dim->lh;
+ const int twl4c = imin(2, twl4), thl4c = imin(2, thl4);
+ int y, x;
+
+ // left block edge
+ unsigned mask = 1U << by4;
+ for (y = 0; y < h4; y++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[0][bx4][imin(twl4c, l[y])][sidx] |= smask;
+ }
+
+ // top block edge
+ for (x = 0, mask = 1U << bx4; x < w4; x++, mask <<= 1) {
+ const int sidx = mask >= 0x10000;
+ const unsigned smask = mask >> (sidx << 4);
+ masks[1][by4][imin(thl4c, a[x])][sidx] |= smask;
+ }
+
+ // inner (tx) left|right edges
+ const int hstep = t_dim->w;
+ unsigned t = 1U << by4;
+ unsigned inner = (unsigned) ((((uint64_t) t) << h4) - t);
+ unsigned inner1 = inner & 0xffff, inner2 = inner >> 16;
+ for (x = hstep; x < w4; x += hstep) {
+ if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1;
+ if (inner2) masks[0][bx4 + x][twl4c][1] |= inner2;
+ }
+
+ // top
+ // inner (tx) --- edges
+ // bottom
+ const int vstep = t_dim->h;
+ t = 1U << bx4;
+ inner = (unsigned) ((((uint64_t) t) << w4) - t);
+ inner1 = inner & 0xffff;
+ inner2 = inner >> 16;
+ for (y = vstep; y < h4; y += vstep) {
+ if (inner1) masks[1][by4 + y][thl4c][0] |= inner1;
+ if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
+ }
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(a, thl4c, var)
+ case_set_upto32_with_default(w4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(l, twl4c, var)
+ case_set_upto32_with_default(h4,,, 0);
+#undef default_memset
+#undef set_ctx
+}
+
+static void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
+ const int cby4, const int cbx4,
+ const int cw4, const int ch4,
+ const int skip_inter,
+ const enum RectTxfmSize tx,
+ uint8_t *const a, uint8_t *const l,
+ const int ss_hor, const int ss_ver)
+{
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+ const int twl4 = t_dim->lw, thl4 = t_dim->lh;
+ const int twl4c = !!twl4, thl4c = !!thl4;
+ int y, x;
+ const int vbits = 4 - ss_ver, hbits = 4 - ss_hor;
+ const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
+ const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
+
+ // left block edge
+ unsigned mask = 1U << cby4;
+ for (y = 0; y < ch4; y++, mask <<= 1) {
+ const int sidx = mask >= vmax;
+ const unsigned smask = mask >> (sidx << vbits);
+ masks[0][cbx4][imin(twl4c, l[y])][sidx] |= smask;
+ }
+
+ // top block edge
+ for (x = 0, mask = 1U << cbx4; x < cw4; x++, mask <<= 1) {
+ const int sidx = mask >= hmax;
+ const unsigned smask = mask >> (sidx << hbits);
+ masks[1][cby4][imin(thl4c, a[x])][sidx] |= smask;
+ }
+
+ if (!skip_inter) {
+ // inner (tx) left|right edges
+ const int hstep = t_dim->w;
+ unsigned t = 1U << cby4;
+ unsigned inner = (unsigned) ((((uint64_t) t) << ch4) - t);
+ unsigned inner1 = inner & ((1 << vmask) - 1), inner2 = inner >> vmask;
+ for (x = hstep; x < cw4; x += hstep) {
+ if (inner1) masks[0][cbx4 + x][twl4c][0] |= inner1;
+ if (inner2) masks[0][cbx4 + x][twl4c][1] |= inner2;
+ }
+
+ // top
+ // inner (tx) --- edges
+ // bottom
+ const int vstep = t_dim->h;
+ t = 1U << cbx4;
+ inner = (unsigned) ((((uint64_t) t) << cw4) - t);
+ inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask;
+ for (y = vstep; y < ch4; y += vstep) {
+ if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1;
+ if (inner2) masks[1][cby4 + y][thl4c][1] |= inner2;
+ }
+ }
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(a, thl4c, var)
+ case_set_upto32_with_default(cw4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+ memset(l, twl4c, var)
+ case_set_upto32_with_default(ch4,,, 0);
+#undef default_memset
+#undef set_ctx
+}
+
+void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
+ uint8_t (*const level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*filter_level)[8][2],
+ const int bx, const int by,
+ const int iw, const int ih,
+ const enum BlockSize bs,
+ const enum RectTxfmSize ytx,
+ const enum RectTxfmSize uvtx,
+ const enum Dav1dPixelLayout layout,
+ uint8_t *const ay, uint8_t *const ly,
+ uint8_t *const auv, uint8_t *const luv)
+{
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = imin(iw - bx, b_dim[0]);
+ const int bh4 = imin(ih - by, b_dim[1]);
+ const int bx4 = bx & 31;
+ const int by4 = by & 31;
+
+ if (bw4 && bh4) {
+ uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
+ for (int y = 0; y < bh4; y++) {
+ for (int x = 0; x < bw4; x++) {
+ level_cache_ptr[x][0] = filter_level[0][0][0];
+ level_cache_ptr[x][1] = filter_level[1][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_intra(lflvl->filter_y, by4, bx4, bw4, bh4, ytx, ay, ly);
+ }
+
+ if (!auv) return;
+
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
+ (b_dim[0] + ss_hor) >> ss_hor);
+ const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
+ (b_dim[1] + ss_ver) >> ss_ver);
+
+ if (!cbw4 || !cbh4) return;
+
+ const int cbx4 = bx4 >> ss_hor;
+ const int cby4 = by4 >> ss_ver;
+
+ uint8_t (*level_cache_ptr)[4] =
+ level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+ for (int y = 0; y < cbh4; y++) {
+ for (int x = 0; x < cbw4; x++) {
+ level_cache_ptr[x][2] = filter_level[2][0][0];
+ level_cache_ptr[x][3] = filter_level[3][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, 0, uvtx,
+ auv, luv, ss_hor, ss_ver);
+}
+
+void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
+ uint8_t (*const level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*filter_level)[8][2],
+ const int bx, const int by,
+ const int iw, const int ih,
+ const int skip, const enum BlockSize bs,
+ const enum RectTxfmSize max_ytx,
+ const uint16_t *const tx_masks,
+ const enum RectTxfmSize uvtx,
+ const enum Dav1dPixelLayout layout,
+ uint8_t *const ay, uint8_t *const ly,
+ uint8_t *const auv, uint8_t *const luv)
+{
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = imin(iw - bx, b_dim[0]);
+ const int bh4 = imin(ih - by, b_dim[1]);
+ const int bx4 = bx & 31;
+ const int by4 = by & 31;
+
+ if (bw4 && bh4) {
+ uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx;
+ for (int y = 0; y < bh4; y++) {
+ for (int x = 0; x < bw4; x++) {
+ level_cache_ptr[x][0] = filter_level[0][0][0];
+ level_cache_ptr[x][1] = filter_level[1][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_inter(lflvl->filter_y, by4, bx4, bw4, bh4, skip,
+ max_ytx, tx_masks, ay, ly);
+ }
+
+ if (!auv) return;
+
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbw4 = imin(((iw + ss_hor) >> ss_hor) - (bx >> ss_hor),
+ (b_dim[0] + ss_hor) >> ss_hor);
+ const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver),
+ (b_dim[1] + ss_ver) >> ss_ver);
+
+ if (!cbw4 || !cbh4) return;
+
+ const int cbx4 = bx4 >> ss_hor;
+ const int cby4 = by4 >> ss_ver;
+
+ uint8_t (*level_cache_ptr)[4] =
+ level_cache + (by >> ss_ver) * b4_stride + (bx >> ss_hor);
+ for (int y = 0; y < cbh4; y++) {
+ for (int x = 0; x < cbw4; x++) {
+ level_cache_ptr[x][2] = filter_level[2][0][0];
+ level_cache_ptr[x][3] = filter_level[3][0][0];
+ }
+ level_cache_ptr += b4_stride;
+ }
+
+ mask_edges_chroma(lflvl->filter_uv, cby4, cbx4, cbw4, cbh4, skip, uvtx,
+ auv, luv, ss_hor, ss_ver);
+}
+
+void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
+ // set E/I/H values from loopfilter level
+ const int sharp = filter_sharpness;
+ for (int level = 0; level < 64; level++) {
+ int limit = level;
+
+ if (sharp > 0) {
+ limit >>= (sharp + 3) >> 2;
+ limit = imin(limit, 9 - sharp);
+ }
+ limit = imax(limit, 1);
+
+ lim_lut->i[level] = limit;
+ lim_lut->e[level] = 2 * (level + 2) + limit;
+ }
+ lim_lut->sharp[0] = (sharp + 3) >> 2;
+ lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff;
+}
+
+static void calc_lf_value(uint8_t (*const lflvl_values)[2],
+ const int base_lvl, const int lf_delta,
+ const int seg_delta,
+ const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+{
+ const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
+
+ if (!mr_delta) {
+ memset(lflvl_values, base, 8 * 2);
+ } else {
+ const int sh = base >= 32;
+ lflvl_values[0][0] = lflvl_values[0][1] =
+ iclip(base + (mr_delta->ref_delta[0] * (1 << sh)), 0, 63);
+ for (int r = 1; r < 8; r++) {
+ for (int m = 0; m < 2; m++) {
+ const int delta =
+ mr_delta->mode_delta[m] + mr_delta->ref_delta[r];
+ lflvl_values[r][m] = iclip(base + (delta * (1 << sh)), 0, 63);
+ }
+ }
+ }
+}
+
+static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2],
+ const int base_lvl, const int lf_delta,
+ const int seg_delta,
+ const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+{
+ if (!base_lvl)
+ memset(lflvl_values, 0, 8 * 2);
+ else
+ calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta);
+}
+
+void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
+ const Dav1dFrameHeader *const hdr,
+ const int8_t lf_delta[4])
+{
+ const int n_seg = hdr->segmentation.enabled ? 8 : 1;
+
+ if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) {
+ memset(lflvl_values, 0, 8 * 4 * 2 * n_seg);
+ return;
+ }
+
+ const Dav1dLoopfilterModeRefDeltas *const mr_deltas =
+ hdr->loopfilter.mode_ref_delta_enabled ?
+ &hdr->loopfilter.mode_ref_deltas : NULL;
+ for (int s = 0; s < n_seg; s++) {
+ const Dav1dSegmentationData *const segd =
+ hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL;
+
+ calc_lf_value(lflvl_values[s][0], hdr->loopfilter.level_y[0],
+ lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas);
+ calc_lf_value(lflvl_values[s][1], hdr->loopfilter.level_y[1],
+ lf_delta[hdr->delta.lf.multi ? 1 : 0],
+ segd ? segd->delta_lf_y_h : 0, mr_deltas);
+ calc_lf_value_chroma(lflvl_values[s][2], hdr->loopfilter.level_u,
+ lf_delta[hdr->delta.lf.multi ? 2 : 0],
+ segd ? segd->delta_lf_u : 0, mr_deltas);
+ calc_lf_value_chroma(lflvl_values[s][3], hdr->loopfilter.level_v,
+ lf_delta[hdr->delta.lf.multi ? 3 : 0],
+ segd ? segd->delta_lf_v : 0, mr_deltas);
+ }
+}
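dav1d_calc_eih() above derives the loop-filter edge (E) and inner (I) thresholds from the filter level and sharpness, following the AV1 loop-filter limit derivation: higher sharpness caps and shrinks the inner limit, while E grows roughly as 2*(level + 2) + I. A standalone worked example (eih() is an illustrative restatement of the same arithmetic, not dav1d API):

    #include <stdio.h>

    static void eih(const int level, const int sharp, int *const e, int *const i) {
        int limit = level;
        if (sharp > 0) {
            limit >>= (sharp + 3) >> 2;
            if (limit > 9 - sharp) limit = 9 - sharp;
        }
        if (limit < 1) limit = 1;
        *i = limit;
        *e = 2 * (level + 2) + limit;
    }

    int main(void) {
        int e, i;
        eih(32, 0, &e, &i); printf("level 32, sharp 0: E=%d I=%d\n", e, i); /* E=100 I=32 */
        eih(32, 4, &e, &i); printf("level 32, sharp 4: E=%d I=%d\n", e, i); /* E=73  I=5  */
        return 0;
    }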
diff --git a/third_party/dav1d/src/lf_mask.h b/third_party/dav1d/src/lf_mask.h
new file mode 100644
index 0000000000..5edf4a0932
--- /dev/null
+++ b/third_party/dav1d/src/lf_mask.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LF_MASK_H
+#define DAV1D_SRC_LF_MASK_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "src/levels.h"
+
+typedef struct Av1FilterLUT {
+ uint8_t e[64];
+ uint8_t i[64];
+ uint64_t sharp[2];
+} Av1FilterLUT;
+
+typedef struct Av1RestorationUnit {
+ uint8_t /* enum Dav1dRestorationType */ type;
+ int8_t filter_h[3];
+ int8_t filter_v[3];
+ uint8_t sgr_idx;
+ int8_t sgr_weights[2];
+} Av1RestorationUnit;
+
+// each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling
+typedef struct Av1Filter {
+ // each bit is 1 col
+ uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2];
+ uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2];
+ int8_t cdef_idx[4]; // -1 means "unset"
+ uint16_t noskip_mask[16][2]; // for 8x8 blocks, but stored on a 4x8 basis
+} Av1Filter;
+
+// each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling
+typedef struct Av1Restoration {
+ Av1RestorationUnit lr[3][4];
+} Av1Restoration;
+
+void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*level)[8][2], int bx, int by,
+ int iw, int ih, enum BlockSize bs,
+ enum RectTxfmSize ytx, enum RectTxfmSize uvtx,
+ enum Dav1dPixelLayout layout, uint8_t *ay,
+ uint8_t *ly, uint8_t *auv, uint8_t *luv);
+void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
+ const ptrdiff_t b4_stride,
+ const uint8_t (*level)[8][2], int bx, int by,
+ int iw, int ih, int skip_inter,
+ enum BlockSize bs, enum RectTxfmSize max_ytx,
+ const uint16_t *tx_mask, enum RectTxfmSize uvtx,
+ enum Dav1dPixelLayout layout, uint8_t *ay,
+ uint8_t *ly, uint8_t *auv, uint8_t *luv);
+void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness);
+void dav1d_calc_lf_values(uint8_t (*values)[4][8][2], const Dav1dFrameHeader *hdr,
+ const int8_t lf_delta[4]);
+
+#endif /* DAV1D_SRC_LF_MASK_H */
diff --git a/third_party/dav1d/src/lib.c b/third_party/dav1d/src/lib.c
new file mode 100644
index 0000000000..eca22ebe03
--- /dev/null
+++ b/third_party/dav1d/src/lib.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "vcs_version.h"
+
+#include <errno.h>
+#include <string.h>
+
+#if defined(__linux__) && defined(HAVE_DLSYM)
+#include <dlfcn.h>
+#endif
+
+#include "dav1d/dav1d.h"
+#include "dav1d/data.h"
+
+#include "common/validate.h"
+
+#include "src/cpu.h"
+#include "src/fg_apply.h"
+#include "src/internal.h"
+#include "src/log.h"
+#include "src/obu.h"
+#include "src/qm.h"
+#include "src/ref.h"
+#include "src/thread_task.h"
+#include "src/wedge.h"
+
+static COLD void init_internal(void) {
+ dav1d_init_cpu();
+ dav1d_init_interintra_masks();
+ dav1d_init_qm_tables();
+ dav1d_init_thread();
+ dav1d_init_wedge_masks();
+}
+
+COLD const char *dav1d_version(void) {
+ return DAV1D_VERSION;
+}
+
+COLD void dav1d_default_settings(Dav1dSettings *const s) {
+ s->n_threads = 0;
+ s->max_frame_delay = 0;
+ s->apply_grain = 1;
+ s->allocator.cookie = NULL;
+ s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
+ s->allocator.release_picture_callback = dav1d_default_picture_release;
+ s->logger.cookie = NULL;
+ s->logger.callback = dav1d_log_default_callback;
+ s->operating_point = 0;
+ s->all_layers = 1; // just until the tests are adjusted
+ s->frame_size_limit = 0;
+ s->strict_std_compliance = 0;
+ s->output_invisible_frames = 0;
+ s->inloop_filters = DAV1D_INLOOPFILTER_ALL;
+ s->decode_frame_type = DAV1D_DECODEFRAMETYPE_ALL;
+}
+
+static void close_internal(Dav1dContext **const c_out, int flush);
+
+NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
+static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
+#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__)
+ /* glibc has an issue where the size of the TLS is subtracted from the stack
+ * size instead of allocated separately. As a result the specified stack
+ * size may be insufficient when used in an application with large amounts
+ * of TLS data. The following is a workaround to compensate for that.
+ * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
+ size_t (*const get_minstack)(const pthread_attr_t*) =
+ dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
+ if (get_minstack)
+ return get_minstack(thread_attr) - PTHREAD_STACK_MIN;
+#endif
+ return 0;
+}
+
+static COLD void get_num_threads(Dav1dContext *const c, const Dav1dSettings *const s,
+ unsigned *n_tc, unsigned *n_fc)
+{
+ /* ceil(sqrt(n)) */
+ static const uint8_t fc_lut[49] = {
+ 1, /* 1 */
+ 2, 2, 2, /* 2- 4 */
+ 3, 3, 3, 3, 3, /* 5- 9 */
+ 4, 4, 4, 4, 4, 4, 4, /* 10-16 */
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 17-25 */
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 26-36 */
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */
+ };
+ *n_tc = s->n_threads ? s->n_threads :
+ iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS);
+ *n_fc = s->max_frame_delay ? umin(s->max_frame_delay, *n_tc) :
+ *n_tc < 50 ? fc_lut[*n_tc - 1] : 8; // min(8, ceil(sqrt(n)))
+}
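
The fc_lut table above is just ceil(sqrt(n)) for n up to 49, so the default frame-thread count works out to min(8, ceil(sqrt(n_tc))). A standalone sketch of the same mapping, purely for illustration (not used anywhere in the patch):

    #include <math.h>

    /* frame threads ~= min(8, ceil(sqrt(worker threads))), as in get_num_threads() */
    static unsigned default_frame_threads(const unsigned n_tc) {
        const unsigned n_fc = (unsigned) ceil(sqrt((double) n_tc));
        return n_fc > 8 ? 8 : n_fc;
    }
    /* e.g. 4 workers -> 2 frame threads, 12 -> 4, 64 -> 8 */
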
+
+COLD int dav1d_get_frame_delay(const Dav1dSettings *const s) {
+ unsigned n_tc, n_fc;
+ validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->n_threads >= 0 &&
+ s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->max_frame_delay >= 0 &&
+ s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
+
+ get_num_threads(NULL, s, &n_tc, &n_fc);
+ return n_fc;
+}
+
+COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+ static pthread_once_t initted = PTHREAD_ONCE_INIT;
+ pthread_once(&initted, init_internal);
+
+ validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->n_threads >= 0 &&
+ s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->max_frame_delay >= 0 &&
+ s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
+ DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->allocator.release_picture_callback != NULL,
+ DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->operating_point >= 0 &&
+ s->operating_point <= 31, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->decode_frame_type >= DAV1D_DECODEFRAMETYPE_ALL &&
+ s->decode_frame_type <= DAV1D_DECODEFRAMETYPE_KEY, DAV1D_ERR(EINVAL));
+
+ pthread_attr_t thread_attr;
+ if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
+ size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);
+
+ pthread_attr_setstacksize(&thread_attr, stack_size);
+
+ Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64);
+ if (!c) goto error;
+ memset(c, 0, sizeof(*c));
+
+ c->allocator = s->allocator;
+ c->logger = s->logger;
+ c->apply_grain = s->apply_grain;
+ c->operating_point = s->operating_point;
+ c->all_layers = s->all_layers;
+ c->frame_size_limit = s->frame_size_limit;
+ c->strict_std_compliance = s->strict_std_compliance;
+ c->output_invisible_frames = s->output_invisible_frames;
+ c->inloop_filters = s->inloop_filters;
+ c->decode_frame_type = s->decode_frame_type;
+
+ dav1d_data_props_set_defaults(&c->cached_error_props);
+
+ if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
+ dav1d_mem_pool_init(&c->frame_hdr_pool) ||
+ dav1d_mem_pool_init(&c->segmap_pool) ||
+ dav1d_mem_pool_init(&c->refmvs_pool) ||
+ dav1d_mem_pool_init(&c->cdf_pool))
+ {
+ goto error;
+ }
+
+ if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc &&
+ c->allocator.release_picture_callback == dav1d_default_picture_release)
+ {
+ if (c->allocator.cookie) goto error;
+ if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
+ c->allocator.cookie = c->picture_pool;
+ } else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc ||
+ c->allocator.release_picture_callback == dav1d_default_picture_release)
+ {
+ goto error;
+ }
+
+ /* On 32-bit systems extremely large frame sizes can cause overflows in
+ * dav1d_decode_frame() malloc size calculations. Prevent that from occurring
+ * by enforcing a maximum frame size limit, chosen to roughly correspond to
+ * the largest size possible to decode without exhausting virtual memory. */
+ if (sizeof(size_t) < 8 && s->frame_size_limit - 1 >= 8192 * 8192) {
+ c->frame_size_limit = 8192 * 8192;
+ if (s->frame_size_limit)
+ dav1d_log(c, "Frame size limit reduced from %u to %u.\n",
+ s->frame_size_limit, c->frame_size_limit);
+ }
+
+ c->flush = &c->flush_mem;
+ atomic_init(c->flush, 0);
+
+ get_num_threads(c, s, &c->n_tc, &c->n_fc);
+
+ c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32);
+ if (!c->fc) goto error;
+ memset(c->fc, 0, sizeof(*c->fc) * c->n_fc);
+
+ c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64);
+ if (!c->tc) goto error;
+ memset(c->tc, 0, sizeof(*c->tc) * c->n_tc);
+ if (c->n_tc > 1) {
+ if (pthread_mutex_init(&c->task_thread.lock, NULL)) goto error;
+ if (pthread_cond_init(&c->task_thread.cond, NULL)) {
+ pthread_mutex_destroy(&c->task_thread.lock);
+ goto error;
+ }
+ if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) {
+ pthread_cond_destroy(&c->task_thread.cond);
+ pthread_mutex_destroy(&c->task_thread.lock);
+ goto error;
+ }
+ c->task_thread.cur = c->n_fc;
+ atomic_init(&c->task_thread.reset_task_cur, UINT_MAX);
+ atomic_init(&c->task_thread.cond_signaled, 0);
+ c->task_thread.inited = 1;
+ }
+
+ if (c->n_fc > 1) {
+ c->frame_thread.out_delayed =
+ calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
+ if (!c->frame_thread.out_delayed) goto error;
+ }
+ for (unsigned n = 0; n < c->n_fc; n++) {
+ Dav1dFrameContext *const f = &c->fc[n];
+ if (c->n_tc > 1) {
+ if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error;
+ if (pthread_cond_init(&f->task_thread.cond, NULL)) {
+ pthread_mutex_destroy(&f->task_thread.lock);
+ goto error;
+ }
+ if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, NULL)) {
+ pthread_cond_destroy(&f->task_thread.cond);
+ pthread_mutex_destroy(&f->task_thread.lock);
+ goto error;
+ }
+ }
+ f->c = c;
+ f->task_thread.ttd = &c->task_thread;
+ f->lf.last_sharpness = -1;
+ dav1d_refmvs_init(&f->rf);
+ }
+
+ for (unsigned m = 0; m < c->n_tc; m++) {
+ Dav1dTaskContext *const t = &c->tc[m];
+ t->f = &c->fc[0];
+ t->task_thread.ttd = &c->task_thread;
+ t->c = c;
+ memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc));
+ if (c->n_tc > 1) {
+ if (pthread_mutex_init(&t->task_thread.td.lock, NULL)) goto error;
+ if (pthread_cond_init(&t->task_thread.td.cond, NULL)) {
+ pthread_mutex_destroy(&t->task_thread.td.lock);
+ goto error;
+ }
+ if (pthread_create(&t->task_thread.td.thread, &thread_attr, dav1d_worker_task, t)) {
+ pthread_cond_destroy(&t->task_thread.td.cond);
+ pthread_mutex_destroy(&t->task_thread.td.lock);
+ goto error;
+ }
+ t->task_thread.td.inited = 1;
+ }
+ }
+ dav1d_refmvs_dsp_init(&c->refmvs_dsp);
+
+ // intra edge tree
+ c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node;
+ dav1d_init_mode_tree(c->intra_edge.root[BL_128X128], c->intra_edge.tip_sb128, 1);
+ c->intra_edge.root[BL_64X64] = &c->intra_edge.branch_sb64[0].node;
+ dav1d_init_mode_tree(c->intra_edge.root[BL_64X64], c->intra_edge.tip_sb64, 0);
+
+ pthread_attr_destroy(&thread_attr);
+
+ return 0;
+
+error:
+ if (c) close_internal(c_out, 0);
+ pthread_attr_destroy(&thread_attr);
+ return DAV1D_ERR(ENOMEM);
+}
+
+static void dummy_free(const uint8_t *const data, void *const user_data) {
+ assert(data && !user_data);
+}
+
+int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
+ const uint8_t *const ptr, const size_t sz)
+{
+ Dav1dData buf = { 0 };
+ int res;
+
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+ Dav1dSettings s;
+ dav1d_default_settings(&s);
+ s.n_threads = 1;
+ s.logger.callback = NULL;
+
+ Dav1dContext *c;
+ res = dav1d_open(&c, &s);
+ if (res < 0) return res;
+
+ if (ptr) {
+ res = dav1d_data_wrap_internal(&buf, ptr, sz, dummy_free, NULL);
+ if (res < 0) goto error;
+ }
+
+ while (buf.sz > 0) {
+ res = dav1d_parse_obus(c, &buf, 1);
+ if (res < 0) goto error;
+
+ assert((size_t)res <= buf.sz);
+ buf.sz -= res;
+ buf.data += res;
+ }
+
+ if (!c->seq_hdr) {
+ res = DAV1D_ERR(ENOENT);
+ goto error;
+ }
+
+ memcpy(out, c->seq_hdr, sizeof(*out));
+
+ res = 0;
+error:
+ dav1d_data_unref_internal(&buf);
+ dav1d_close(&c);
+
+ return res;
+}
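
dav1d_parse_sequence_header() is the lightweight probing path: it opens a temporary single-threaded context, walks the OBUs, and copies out the sequence header if one is present. A hypothetical caller might use it like this (obu/obu_size are assumed inputs; profile, max_width and max_height are fields of the public Dav1dSequenceHeader):

    #include <stdio.h>
    #include <dav1d/dav1d.h>

    /* sketch: probe a raw OBU buffer for its sequence header */
    static int probe_seq_hdr(const uint8_t *const obu, const size_t obu_size) {
        Dav1dSequenceHeader seq;
        const int err = dav1d_parse_sequence_header(&seq, obu, obu_size);
        if (!err)
            printf("AV1 profile %d, up to %dx%d\n",
                   seq.profile, seq.max_width, seq.max_height);
        return err; /* DAV1D_ERR(ENOENT) if no sequence header OBU was found */
    }
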
+
+static int has_grain(const Dav1dPicture *const pic)
+{
+ const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data;
+ return fgdata->num_y_points || fgdata->num_uv_points[0] ||
+ fgdata->num_uv_points[1] || (fgdata->clip_to_restricted_range &&
+ fgdata->chroma_scaling_from_luma);
+}
+
+static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
+{
+ int res = 0;
+
+ Dav1dThreadPicture *const in = (c->all_layers || !c->max_spatial_id)
+ ? &c->out : &c->cache;
+ if (!c->apply_grain || !has_grain(&in->p)) {
+ dav1d_picture_move_ref(out, &in->p);
+ dav1d_thread_picture_unref(in);
+ goto end;
+ }
+
+ res = dav1d_apply_grain(c, out, &in->p);
+ dav1d_thread_picture_unref(in);
+end:
+ if (!c->all_layers && c->max_spatial_id && c->out.p.data[0]) {
+ dav1d_thread_picture_move_ref(in, &c->out);
+ }
+ return res;
+}
+
+static int output_picture_ready(Dav1dContext *const c, const int drain) {
+ if (c->cached_error) return 1;
+ if (!c->all_layers && c->max_spatial_id) {
+ if (c->out.p.data[0] && c->cache.p.data[0]) {
+ if (c->max_spatial_id == c->cache.p.frame_hdr->spatial_id ||
+ c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT)
+ return 1;
+ dav1d_thread_picture_unref(&c->cache);
+ dav1d_thread_picture_move_ref(&c->cache, &c->out);
+ return 0;
+ } else if (c->cache.p.data[0] && drain) {
+ return 1;
+ } else if (c->out.p.data[0]) {
+ dav1d_thread_picture_move_ref(&c->cache, &c->out);
+ return 0;
+ }
+ }
+
+ return !!c->out.p.data[0];
+}
+
+static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
+ unsigned drain_count = 0;
+ int drained = 0;
+ do {
+ const unsigned next = c->frame_thread.next;
+ Dav1dFrameContext *const f = &c->fc[next];
+ pthread_mutex_lock(&c->task_thread.lock);
+ while (f->n_tile_data > 0)
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
+ Dav1dThreadPicture *const out_delayed =
+ &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
+ atomic_fetch_add(&c->task_thread.first, 1U);
+ else
+ atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
+ if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
+ c->task_thread.cur--;
+ drained = 1;
+ } else if (drained) {
+ pthread_mutex_unlock(&c->task_thread.lock);
+ break;
+ }
+ if (++c->frame_thread.next == c->n_fc)
+ c->frame_thread.next = 0;
+ pthread_mutex_unlock(&c->task_thread.lock);
+ const int error = f->task_thread.retval;
+ if (error) {
+ f->task_thread.retval = 0;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ return error;
+ }
+ if (out_delayed->p.data[0]) {
+ const unsigned progress =
+ atomic_load_explicit(&out_delayed->progress[1],
+ memory_order_relaxed);
+ if ((out_delayed->visible || c->output_invisible_frames) &&
+ progress != FRAME_ERROR)
+ {
+ dav1d_thread_picture_ref(&c->out, out_delayed);
+ c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
+ }
+ dav1d_thread_picture_unref(out_delayed);
+ if (output_picture_ready(c, 0))
+ return output_image(c, out);
+ }
+ } while (++drain_count < c->n_fc);
+
+ if (output_picture_ready(c, 1))
+ return output_image(c, out);
+
+ return DAV1D_ERR(EAGAIN);
+}
+
+static int gen_picture(Dav1dContext *const c)
+{
+ int res;
+ Dav1dData *const in = &c->in;
+
+ if (output_picture_ready(c, 0))
+ return 0;
+
+ while (in->sz > 0) {
+ res = dav1d_parse_obus(c, in, 0);
+ if (res < 0) {
+ dav1d_data_unref_internal(in);
+ } else {
+ assert((size_t)res <= in->sz);
+ in->sz -= res;
+ in->data += res;
+ if (!in->sz) dav1d_data_unref_internal(in);
+ }
+ if (output_picture_ready(c, 0))
+ break;
+ if (res < 0)
+ return res;
+ }
+
+ return 0;
+}
+
+int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
+{
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(in->data == NULL || in->sz, DAV1D_ERR(EINVAL));
+
+ if (in->data)
+ c->drain = 0;
+ if (c->in.data)
+ return DAV1D_ERR(EAGAIN);
+ dav1d_data_ref(&c->in, in);
+
+ int res = gen_picture(c);
+ if (!res)
+ dav1d_data_unref_internal(in);
+
+ return res;
+}
+
+int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
+{
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+ const int drain = c->drain;
+ c->drain = 1;
+
+ int res = gen_picture(c);
+ if (res < 0)
+ return res;
+
+ if (c->cached_error) {
+ const int res = c->cached_error;
+ c->cached_error = 0;
+ return res;
+ }
+
+ if (output_picture_ready(c, c->n_fc == 1))
+ return output_image(c, out);
+
+ if (c->n_fc > 1 && drain)
+ return drain_picture(c, out);
+
+ return DAV1D_ERR(EAGAIN);
+}
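
Taken together, dav1d_send_data() and dav1d_get_picture() form the usual decode loop: EAGAIN from the former means the decoder still holds the previously sent buffer, while EAGAIN from the latter simply means more input is needed. A rough caller-side sketch under those assumptions:

    #include <errno.h>
    #include <dav1d/dav1d.h>

    /* one iteration of a typical decode loop (sketch; in holds compressed data) */
    static int decode_step(Dav1dContext *const ctx, Dav1dData *const in,
                           Dav1dPicture *const pic)
    {
        int res = dav1d_send_data(ctx, in);
        if (res < 0 && res != DAV1D_ERR(EAGAIN))
            return res;                  /* hard error */
        res = dav1d_get_picture(ctx, pic);
        if (res == 0) {
            /* ...consume *pic, then release it... */
            dav1d_picture_unref(pic);
        } else if (res == DAV1D_ERR(EAGAIN)) {
            res = 0;                     /* not an error: feed more data and retry */
        }
        return res;
    }
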
+
+int dav1d_apply_grain(Dav1dContext *const c, Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
+
+ if (!has_grain(in)) {
+ dav1d_picture_ref(out, in);
+ return 0;
+ }
+
+ int res = dav1d_picture_alloc_copy(c, out, in->p.w, in);
+ if (res < 0) goto error;
+
+ if (c->n_tc > 1) {
+ dav1d_task_delayed_fg(c, out, in);
+ } else {
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
+ break;
+#endif
+ default: abort();
+ }
+ }
+
+ return 0;
+
+error:
+ dav1d_picture_unref_internal(out);
+ return res;
+}
+
+void dav1d_flush(Dav1dContext *const c) {
+ dav1d_data_unref_internal(&c->in);
+ if (c->out.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->out);
+ if (c->cache.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->cache);
+
+ c->drain = 0;
+ c->cached_error = 0;
+
+ for (int i = 0; i < 8; i++) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_ref_dec(&c->refs[i].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ }
+ c->frame_hdr = NULL;
+ c->seq_hdr = NULL;
+ dav1d_ref_dec(&c->seq_hdr_ref);
+
+ c->mastering_display = NULL;
+ c->content_light = NULL;
+ c->itut_t35 = NULL;
+ dav1d_ref_dec(&c->mastering_display_ref);
+ dav1d_ref_dec(&c->content_light_ref);
+ dav1d_ref_dec(&c->itut_t35_ref);
+
+ dav1d_data_props_unref_internal(&c->cached_error_props);
+
+ if (c->n_fc == 1 && c->n_tc == 1) return;
+ atomic_store(c->flush, 1);
+
+ // stop running tasks in worker threads
+ if (c->n_tc > 1) {
+ pthread_mutex_lock(&c->task_thread.lock);
+ for (unsigned i = 0; i < c->n_tc; i++) {
+ Dav1dTaskContext *const tc = &c->tc[i];
+ while (!tc->task_thread.flushed) {
+ pthread_cond_wait(&tc->task_thread.td.cond, &c->task_thread.lock);
+ }
+ }
+ for (unsigned i = 0; i < c->n_fc; i++) {
+ c->fc[i].task_thread.task_head = NULL;
+ c->fc[i].task_thread.task_tail = NULL;
+ c->fc[i].task_thread.task_cur_prev = NULL;
+ c->fc[i].task_thread.pending_tasks.head = NULL;
+ c->fc[i].task_thread.pending_tasks.tail = NULL;
+ atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0);
+ }
+ atomic_init(&c->task_thread.first, 0);
+ c->task_thread.cur = c->n_fc;
+ atomic_store(&c->task_thread.reset_task_cur, UINT_MAX);
+ atomic_store(&c->task_thread.cond_signaled, 0);
+ pthread_mutex_unlock(&c->task_thread.lock);
+ }
+
+ // wait for threads to complete flushing
+ if (c->n_fc > 1) {
+ for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
+ if (next == c->n_fc) next = 0;
+ Dav1dFrameContext *const f = &c->fc[next];
+ dav1d_decode_frame_exit(f, -1);
+ f->n_tile_data = 0;
+ f->task_thread.retval = 0;
+ Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.frame_hdr) {
+ dav1d_thread_picture_unref(out_delayed);
+ }
+ }
+ c->frame_thread.next = 0;
+ }
+ atomic_store(c->flush, 0);
+}
+
+COLD void dav1d_close(Dav1dContext **const c_out) {
+ validate_input(c_out != NULL);
+ close_internal(c_out, 1);
+}
+
+static COLD void close_internal(Dav1dContext **const c_out, int flush) {
+ Dav1dContext *const c = *c_out;
+ if (!c) return;
+
+ if (flush) dav1d_flush(c);
+
+ if (c->tc) {
+ struct TaskThreadData *ttd = &c->task_thread;
+ if (ttd->inited) {
+ pthread_mutex_lock(&ttd->lock);
+ for (unsigned n = 0; n < c->n_tc && c->tc[n].task_thread.td.inited; n++)
+ c->tc[n].task_thread.die = 1;
+ pthread_cond_broadcast(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ for (unsigned n = 0; n < c->n_tc; n++) {
+ Dav1dTaskContext *const pf = &c->tc[n];
+ if (!pf->task_thread.td.inited) break;
+ pthread_join(pf->task_thread.td.thread, NULL);
+ pthread_cond_destroy(&pf->task_thread.td.cond);
+ pthread_mutex_destroy(&pf->task_thread.td.lock);
+ }
+ pthread_cond_destroy(&ttd->delayed_fg.cond);
+ pthread_cond_destroy(&ttd->cond);
+ pthread_mutex_destroy(&ttd->lock);
+ }
+ dav1d_free_aligned(c->tc);
+ }
+
+ for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
+ Dav1dFrameContext *const f = &c->fc[n];
+
+ // clean-up threading stuff
+ if (c->n_fc > 1) {
+ freep(&f->tile_thread.lowest_pixel_mem);
+ freep(&f->frame_thread.b);
+ dav1d_freep_aligned(&f->frame_thread.pal_idx);
+ dav1d_freep_aligned(&f->frame_thread.cf);
+ freep(&f->frame_thread.tile_start_off);
+ dav1d_freep_aligned(&f->frame_thread.pal);
+ freep(&f->frame_thread.cbi);
+ }
+ if (c->n_tc > 1) {
+ pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
+ pthread_cond_destroy(&f->task_thread.cond);
+ pthread_mutex_destroy(&f->task_thread.lock);
+ }
+ freep(&f->frame_thread.frame_progress);
+ freep(&f->task_thread.tasks);
+ freep(&f->task_thread.tile_tasks[0]);
+ dav1d_free_aligned(f->ts);
+ dav1d_free_aligned(f->ipred_edge[0]);
+ free(f->a);
+ free(f->tile);
+ free(f->lf.mask);
+ free(f->lf.lr_mask);
+ free(f->lf.level);
+ free(f->lf.tx_lpf_right_edge[0]);
+ free(f->lf.start_of_tile_row);
+ dav1d_refmvs_clear(&f->rf);
+ dav1d_free_aligned(f->lf.cdef_line_buf);
+ dav1d_free_aligned(f->lf.lr_line_buf);
+ }
+ dav1d_free_aligned(c->fc);
+ if (c->n_fc > 1 && c->frame_thread.out_delayed) {
+ for (unsigned n = 0; n < c->n_fc; n++)
+ if (c->frame_thread.out_delayed[n].p.frame_hdr)
+ dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
+ free(c->frame_thread.out_delayed);
+ }
+ for (int n = 0; n < c->n_tile_data; n++)
+ dav1d_data_unref_internal(&c->tile[n].data);
+ free(c->tile);
+ for (int n = 0; n < 8; n++) {
+ dav1d_cdf_thread_unref(&c->cdf[n]);
+ if (c->refs[n].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[n].p);
+ dav1d_ref_dec(&c->refs[n].refmvs);
+ dav1d_ref_dec(&c->refs[n].segmap);
+ }
+ dav1d_ref_dec(&c->seq_hdr_ref);
+ dav1d_ref_dec(&c->frame_hdr_ref);
+
+ dav1d_ref_dec(&c->mastering_display_ref);
+ dav1d_ref_dec(&c->content_light_ref);
+ dav1d_ref_dec(&c->itut_t35_ref);
+
+ dav1d_mem_pool_end(c->seq_hdr_pool);
+ dav1d_mem_pool_end(c->frame_hdr_pool);
+ dav1d_mem_pool_end(c->segmap_pool);
+ dav1d_mem_pool_end(c->refmvs_pool);
+ dav1d_mem_pool_end(c->cdf_pool);
+ dav1d_mem_pool_end(c->picture_pool);
+
+ dav1d_freep_aligned(c_out);
+}
+
+int dav1d_get_event_flags(Dav1dContext *const c, enum Dav1dEventFlags *const flags) {
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(flags != NULL, DAV1D_ERR(EINVAL));
+
+ *flags = c->event_flags;
+ c->event_flags = 0;
+ return 0;
+}
+
+int dav1d_get_decode_error_data_props(Dav1dContext *const c, Dav1dDataProps *const out) {
+ validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
+
+ dav1d_data_props_unref_internal(out);
+ *out = c->cached_error_props;
+ dav1d_data_props_set_defaults(&c->cached_error_props);
+
+ return 0;
+}
+
+void dav1d_picture_unref(Dav1dPicture *const p) {
+ dav1d_picture_unref_internal(p);
+}
+
+uint8_t *dav1d_data_create(Dav1dData *const buf, const size_t sz) {
+ return dav1d_data_create_internal(buf, sz);
+}
+
+int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr,
+ const size_t sz,
+ void (*const free_callback)(const uint8_t *data,
+ void *user_data),
+ void *const user_data)
+{
+ return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
+}
+
+int dav1d_data_wrap_user_data(Dav1dData *const buf,
+ const uint8_t *const user_data,
+ void (*const free_callback)(const uint8_t *user_data,
+ void *cookie),
+ void *const cookie)
+{
+ return dav1d_data_wrap_user_data_internal(buf,
+ user_data,
+ free_callback,
+ cookie);
+}
+
+void dav1d_data_unref(Dav1dData *const buf) {
+ dav1d_data_unref_internal(buf);
+}
+
+void dav1d_data_props_unref(Dav1dDataProps *const props) {
+ dav1d_data_props_unref_internal(props);
+}
diff --git a/third_party/dav1d/src/log.c b/third_party/dav1d/src/log.c
new file mode 100644
index 0000000000..de6776a617
--- /dev/null
+++ b/third_party/dav1d/src/log.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "dav1d/dav1d.h"
+
+#include "common/validate.h"
+
+#include "src/internal.h"
+#include "src/log.h"
+
+#if CONFIG_LOG
+COLD void dav1d_log_default_callback(void *const cookie,
+ const char *const format, va_list ap)
+{
+ vfprintf(stderr, format, ap);
+}
+
+COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
+ validate_input(c != NULL);
+
+ if (!c->logger.callback)
+ return;
+
+ va_list ap;
+ va_start(ap, format);
+ c->logger.callback(c->logger.cookie, format, ap);
+ va_end(ap);
+}
+#endif
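
Applications that want these messages somewhere other than stderr install a callback with the same signature through Dav1dSettings before dav1d_open(); the cookie is handed back untouched on every call. A small sketch (the FILE-based target is purely illustrative):

    #include <stdarg.h>
    #include <stdio.h>
    #include <dav1d/dav1d.h>

    /* route dav1d log output to a caller-owned FILE (sketch) */
    static void file_log_cb(void *const cookie, const char *const format, va_list ap) {
        vfprintf((FILE *) cookie, format, ap);
    }

    static void use_file_logger(Dav1dSettings *const s, FILE *const logfile) {
        dav1d_default_settings(s);
        s->logger.cookie = logfile;
        s->logger.callback = file_log_cb;
    }
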
diff --git a/third_party/dav1d/src/log.h b/third_party/dav1d/src/log.h
new file mode 100644
index 0000000000..df32de7f25
--- /dev/null
+++ b/third_party/dav1d/src/log.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOG_H
+#define DAV1D_SRC_LOG_H
+
+#include "config.h"
+
+#include <stdarg.h>
+
+#include "dav1d/dav1d.h"
+
+#include "common/attributes.h"
+
+#if CONFIG_LOG
+#define dav1d_log dav1d_log
+void dav1d_log_default_callback(void *cookie, const char *format, va_list ap);
+void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3);
+#else
+#define dav1d_log_default_callback NULL
+#define dav1d_log(...) do { } while(0)
+#endif
+
+#endif /* DAV1D_SRC_LOG_H */
diff --git a/third_party/dav1d/src/loopfilter.h b/third_party/dav1d/src/loopfilter.h
new file mode 100644
index 0000000000..a0f78c9657
--- /dev/null
+++ b/third_party/dav1d/src/loopfilter.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOOPFILTER_H
+#define DAV1D_SRC_LOOPFILTER_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+#include "src/lf_mask.h"
+
+#define decl_loopfilter_sb_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const uint32_t *mask, \
+ const uint8_t (*lvl)[4], ptrdiff_t lvl_stride, \
+ const Av1FilterLUT *lut, int w HIGHBD_DECL_SUFFIX)
+typedef decl_loopfilter_sb_fn(*loopfilter_sb_fn);
+
+typedef struct Dav1dLoopFilterDSPContext {
+ /*
+ * dimension 1: plane (0=luma, 1=chroma)
+ * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
+ *
+ * dst/stride are aligned by 32
+ */
+ loopfilter_sb_fn loop_filter_sb[2][2];
+} Dav1dLoopFilterDSPContext;
+
+bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
+
+#endif /* DAV1D_SRC_LOOPFILTER_H */
diff --git a/third_party/dav1d/src/loopfilter_tmpl.c b/third_party/dav1d/src/loopfilter_tmpl.c
new file mode 100644
index 0000000000..cacf258756
--- /dev/null
+++ b/third_party/dav1d/src/loopfilter_tmpl.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/loopfilter.h"
+
+static NOINLINE void
+loop_filter(pixel *dst, int E, int I, int H,
+ const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd
+ HIGHBD_DECL_SUFFIX)
+{
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const int F = 1 << bitdepth_min_8;
+ E <<= bitdepth_min_8;
+ I <<= bitdepth_min_8;
+ H <<= bitdepth_min_8;
+
+ for (int i = 0; i < 4; i++, dst += stridea) {
+ int p6, p5, p4, p3, p2;
+ int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+ int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+ int q2, q3, q4, q5, q6;
+ int fm, flat8out, flat8in;
+
+ fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
+ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
+
+ if (wd > 4) {
+ p2 = dst[strideb * -3];
+ q2 = dst[strideb * +2];
+
+ fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
+
+ if (wd > 6) {
+ p3 = dst[strideb * -4];
+ q3 = dst[strideb * +3];
+
+ fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
+ }
+ }
+ if (!fm) continue;
+
+ if (wd >= 16) {
+ p6 = dst[strideb * -7];
+ p5 = dst[strideb * -6];
+ p4 = dst[strideb * -5];
+ q4 = dst[strideb * +4];
+ q5 = dst[strideb * +5];
+ q6 = dst[strideb * +6];
+
+ flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
+ abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
+ abs(q5 - q0) <= F && abs(q6 - q0) <= F;
+ }
+
+ if (wd >= 6)
+ flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
+ abs(q1 - q0) <= F && abs(q2 - q0) <= F;
+
+ if (wd >= 8)
+ flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
+
+ if (wd >= 16 && (flat8out & flat8in)) {
+ dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
+ p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+ dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
+ p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+ dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
+ p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+ dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
+ p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+ dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+ p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+ dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+ dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+ dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
+ dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+ q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
+ dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+ q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
+ dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+ q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+ dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
+ q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+ } else if (wd >= 8 && flat8in) {
+ dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+ dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+ dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+ dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+ dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+ dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+ } else if (wd == 6 && flat8in) {
+ dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
+ dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+ dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+ dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
+ } else {
+ const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
+
+#define iclip_diff(v) iclip(v, -128 * (1 << bitdepth_min_8), \
+ 128 * (1 << bitdepth_min_8) - 1)
+
+ if (hev) {
+ int f = iclip_diff(p1 - q1), f1, f2;
+ f = iclip_diff(3 * (q0 - p0) + f);
+
+ f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
+ f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
+
+ dst[strideb * -1] = iclip_pixel(p0 + f2);
+ dst[strideb * +0] = iclip_pixel(q0 - f1);
+ } else {
+ int f = iclip_diff(3 * (q0 - p0)), f1, f2;
+
+ f1 = imin(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
+ f2 = imin(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
+
+ dst[strideb * -1] = iclip_pixel(p0 + f2);
+ dst[strideb * +0] = iclip_pixel(q0 - f1);
+
+ f = (f1 + 1) >> 1;
+ dst[strideb * -2] = iclip_pixel(p1 + f);
+ dst[strideb * +1] = iclip_pixel(q1 - f);
+ }
+#undef iclip_diff
+ }
+ }
+}
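
E and I are looked up from the sharpness LUT and H is derived from the filter level in the callers below; together they gate whether an edge is filtered at all. The narrow (wd == 4) decision can be read in isolation, sketched here for illustration:

    #include <stdlib.h>

    /* filter-mask decision for the wd == 4 case, mirroring the fm test in loop_filter() */
    static int filter_mask4(const int p1, const int p0, const int q0, const int q1,
                            const int E, const int I)
    {
        return abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
               abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
    }
    /* e.g. filter_mask4(70, 72, 80, 79, 16, 4) == 0: 2*|72-80| + (|70-79|>>1) = 20 > E,
     * so the edge is treated as a real feature and left untouched */
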
+
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int h
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+ for (unsigned y = 1; vm & ~(y - 1);
+ y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+ {
+ if (vm & y) {
+ const int L = l[0][0] ? l[0][0] : l[-1][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
+ loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
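
Each bit of the vmask words covers one 4-sample stretch of the edge, and vmask[1]/vmask[2] promote that stretch to the 8- and 16-wide filters via idx. The loop condition vm & ~(y - 1) terminates as soon as no bit at or above the current position remains: for example vm = 0b1010 filters only the second and fourth stretches and exits once y reaches 16.
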
+
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int w
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+ for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+ if (vm & x) {
+ const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
+ loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
+
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int h
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1];
+ for (unsigned y = 1; vm & ~(y - 1);
+ y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+ {
+ if (vm & y) {
+ const int L = l[0][0] ? l[0][0] : l[-1][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = !!(vmask[1] & y);
+ loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+ const uint32_t *const vmask,
+ const uint8_t (*l)[4], ptrdiff_t b4_stride,
+ const Av1FilterLUT *lut, const int w
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned vm = vmask[0] | vmask[1];
+ for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+ if (vm & x) {
+ const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+ if (!L) continue;
+ const int H = L >> 4;
+ const int E = lut->e[L], I = lut->i[L];
+ const int idx = !!(vmask[1] & x);
+ loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx
+ HIGHBD_TAIL_SUFFIX);
+ }
+ }
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/loopfilter.h"
+#elif ARCH_X86
+#include "src/x86/loopfilter.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
+ c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
+ c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
+ c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
+ c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ loop_filter_dsp_init_arm(c);
+#elif ARCH_X86
+ loop_filter_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/looprestoration.h b/third_party/dav1d/src/looprestoration.h
new file mode 100644
index 0000000000..f55dd31947
--- /dev/null
+++ b/third_party/dav1d/src/looprestoration.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LOOPRESTORATION_H
+#define DAV1D_SRC_LOOPRESTORATION_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+enum LrEdgeFlags {
+ LR_HAVE_LEFT = 1 << 0,
+ LR_HAVE_RIGHT = 1 << 1,
+ LR_HAVE_TOP = 1 << 2,
+ LR_HAVE_BOTTOM = 1 << 3,
+};
+
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row)[4];
+#else
+typedef const void *const_left_pixel_row;
+#endif
+
+typedef union LooprestorationParams {
+ ALIGN(int16_t filter[2][8], 16);
+ struct {
+ uint32_t s0, s1;
+ int16_t w0, w1;
+ } sgr;
+} LooprestorationParams;
+
+// Although the spec applies restoration filters over 4x4 blocks,
+// they can be applied to a bigger surface.
+// * w is constrained by the restoration unit size (w <= 256)
+// * h is constrained by the stripe height (h <= 64)
+// The filter functions are allowed to do aligned writes past the right
+// edge of the buffer, aligned up to the minimum loop restoration unit size
+// (which is 32 pixels for subsampled chroma and 64 pixels for luma).
+#define decl_lr_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const_left_pixel_row left, \
+ const pixel *lpf, int w, int h, \
+ const LooprestorationParams *params, \
+ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+typedef decl_lr_filter_fn(*looprestorationfilter_fn);
+
+typedef struct Dav1dLoopRestorationDSPContext {
+ looprestorationfilter_fn wiener[2]; /* 7-tap, 5-tap */
+ looprestorationfilter_fn sgr[3]; /* 5x5, 3x3, mix */
+} Dav1dLoopRestorationDSPContext;
+
+bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
+
+#endif /* DAV1D_SRC_LOOPRESTORATION_H */
diff --git a/third_party/dav1d/src/looprestoration_tmpl.c b/third_party/dav1d/src/looprestoration_tmpl.c
new file mode 100644
index 0000000000..d4d7867dba
--- /dev/null
+++ b/third_party/dav1d/src/looprestoration_tmpl.c
@@ -0,0 +1,554 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+// 256 * 1.5 + 3 + 3 = 390
+#define REST_UNIT_STRIDE (390)
+
+// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
+// TODO Chroma only requires 2 rows of padding.
+static NOINLINE void
+padding(pixel *dst, const pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf, int unit_w,
+ const int stripe_h, const enum LrEdgeFlags edges)
+{
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ pixel *dst_l = dst + 3 * !have_left;
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const pixel *const above_1 = lpf;
+ const pixel *const above_2 = above_1 + PXSTRIDE(stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const pixel *const below_1 = lpf + 6 * PXSTRIDE(stride);
+ const pixel *const below_2 = below_1 + PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+ } else {
+ // Pad with last row
+ const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(stride);
+ }
+
+ if (!have_right) {
+ pixel *pad = dst_l + unit_w;
+ pixel *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+ // (since the first and last taps are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+ // of padding above and below
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ pixel *tmp_ptr = tmp;
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+
+ // Values stored between horizontal and vertical filtering don't
+ // fit in a uint8_t.
+ uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ uint16_t *hor_ptr = hor;
+
+ const int16_t (*const filter)[8] = params->filter;
+ const int bitdepth = bitdepth_from_max(bitdepth_max);
+ const int round_bits_h = 3 + (bitdepth == 12) * 2;
+ const int rounding_off_h = 1 << (round_bits_h - 1);
+ const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
+ for (int j = 0; j < h + 6; j++) {
+ for (int i = 0; i < w; i++) {
+ int sum = (1 << (bitdepth + 6));
+#if BITDEPTH == 8
+ sum += tmp_ptr[i + 3] * 128;
+#endif
+
+ for (int k = 0; k < 7; k++) {
+ sum += tmp_ptr[i + k] * filter[0][k];
+ }
+
+ hor_ptr[i] =
+ iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
+ }
+ tmp_ptr += REST_UNIT_STRIDE;
+ hor_ptr += REST_UNIT_STRIDE;
+ }
+
+ const int round_bits_v = 11 - (bitdepth == 12) * 2;
+ const int rounding_off_v = 1 << (round_bits_v - 1);
+ const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ int sum = -round_offset;
+
+ for (int k = 0; k < 7; k++) {
+ sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];
+ }
+
+ p[j * PXSTRIDE(stride) + i] =
+ iclip_pixel((sum + rounding_off_v) >> round_bits_v);
+ }
+ }
+}
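
The two rounding stages above always shift by 14 bits in total: 3 horizontal plus 11 vertical at 8 and 10 bpc, and 5 plus 9 at 12 bpc, with clip_limit working out to 1 << 13 (8 bpc) or 1 << 15 (10/12 bpc), which is why the intermediate hor[] values need uint16_t but never more.
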
+
+// Sum over a 3x3 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+ // left of the top left corner. However, the self-guided filter only needs one
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+// x x x x x x x x x x
+// x c c c c c c c c x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x c c c c c c c c x
+// x x x x x x x x x x
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src,
+ const int w, const int h)
+{
+ // We skip the first row, as it is never used
+ src += REST_UNIT_STRIDE;
+
+ // We skip the first and last columns, as they are never used
+ for (int x = 1; x < w - 1; x++) {
+ coef *sum_v = sum + x;
+ int32_t *sumsq_v = sumsq + x;
+ const pixel *s = src + x;
+ int a = s[0], a2 = a * a;
+ int b = s[REST_UNIT_STRIDE], b2 = b * b;
+
+ // We skip the first 2 rows, as they are skipped in the next loop and
+ // we don't need the last 2 rows as they are skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int c = s[REST_UNIT_STRIDE];
+ const int c2 = c * c;
+ sum_v += REST_UNIT_STRIDE;
+ sumsq_v += REST_UNIT_STRIDE;
+ *sum_v = a + b + c;
+ *sumsq_v = a2 + b2 + c2;
+ a = b;
+ a2 = b2;
+ b = c;
+ b2 = c2;
+ }
+ }
+
+ // We skip the first row as it is never read
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+ // We skip the last 2 rows as they are never read
+ for (int y = 2; y < h - 2; y++) {
+ int a = sum[1], a2 = sumsq[1];
+ int b = sum[2], b2 = sumsq[2];
+
+ // We don't store the first column as it is never read and
+ // we don't store the last 2 columns as they are never read
+ for (int x = 2; x < w - 2; x++) {
+ const int c = sum[x + 1], c2 = sumsq[x + 1];
+ sum[x] = a + b + c;
+ sumsq[x] = a2 + b2 + c2;
+ a = b;
+ a2 = b2;
+ b = c;
+ b2 = c2;
+ }
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+ }
+}
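
Both passes above are the same 3-tap running sum, applied first down each column and then along each row, with the three taps held in locals and rotated every step. A minimal 1-D version of that pattern, for illustration only:

    /* out[i] = in[i-1] + in[i] + in[i+1] for 1 <= i < n-1, with rolling taps */
    static void rolling_sum3(int *const out, const int *const in, const int n) {
        int a = in[0], b = in[1];
        for (int i = 1; i < n - 1; i++) {
            const int c = in[i + 1];
            out[i] = a + b + c;
            a = b;
            b = c;
        }
    }
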
+
+// Sum over a 5x5 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+ // left of the top left corner. However, the self-guided filter only needs one
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+// c c c c c c c c c c
+// c c c c c c c c c c
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// c c c c c c c c c c
+// c c c c c c c c c c
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
+ const int w, const int h)
+{
+ for (int x = 0; x < w; x++) {
+ coef *sum_v = sum + x;
+ int32_t *sumsq_v = sumsq + x;
+ const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+ int a = s[-3 * REST_UNIT_STRIDE], a2 = a * a;
+ int b = s[-2 * REST_UNIT_STRIDE], b2 = b * b;
+ int c = s[-1 * REST_UNIT_STRIDE], c2 = c * c;
+ int d = s[0], d2 = d * d;
+
+ // We skip the first 2 rows, as they are skipped in the next loop and
+ // we don't need the last 2 rows as they are skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int e = *s, e2 = e * e;
+ sum_v += REST_UNIT_STRIDE;
+ sumsq_v += REST_UNIT_STRIDE;
+ *sum_v = a + b + c + d + e;
+ *sumsq_v = a2 + b2 + c2 + d2 + e2;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ a2 = b2;
+ b2 = c2;
+ c2 = d2;
+ d2 = e2;
+ }
+ }
+
+ // We skip the first row as it is never read
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+ for (int y = 2; y < h - 2; y++) {
+ int a = sum[0], a2 = sumsq[0];
+ int b = sum[1], b2 = sumsq[1];
+ int c = sum[2], c2 = sumsq[2];
+ int d = sum[3], d2 = sumsq[3];
+
+ for (int x = 2; x < w - 2; x++) {
+ const int e = sum[x + 2], e2 = sumsq[x + 2];
+ sum[x] = a + b + c + d + e;
+ sumsq[x] = a2 + b2 + c2 + d2 + e2;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ a2 = b2;
+ b2 = c2;
+ c2 = d2;
+ d2 = e2;
+ }
+ sum += REST_UNIT_STRIDE;
+ sumsq += REST_UNIT_STRIDE;
+ }
+}
+
+static NOINLINE void
+selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
+ const int w, const int h, const int n, const unsigned s
+ HIGHBD_DECL_SUFFIX)
+{
+ const unsigned sgr_one_by_x = n == 25 ? 164 : 455;
+
+ // The selfguided filter is applied to a maximum stripe height of 64, plus
+ // 3 pixels of padding above and below
+ int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+ int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3;
+ // By inverting A and B after the boxsums, B can be stored as coef instead
+ // of int32_t
+ coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+ coef *B = sum + 2 * REST_UNIT_STRIDE + 3;
+
+ const int step = (n == 25) + 1;
+ if (n == 25)
+ boxsum5(sumsq, sum, src, w + 6, h + 6);
+ else
+ boxsum3(sumsq, sum, src, w + 6, h + 6);
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+ int32_t *AA = A - REST_UNIT_STRIDE;
+ coef *BB = B - REST_UNIT_STRIDE;
+ for (int j = -1; j < h + 1; j+= step) {
+ for (int i = -1; i < w + 1; i++) {
+ const int a =
+ (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
+ const int b =
+ (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;
+
+ const unsigned p = imax(a * n - b * b, 0);
+ const unsigned z = (p * s + (1 << 19)) >> 20;
+ const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
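+ // p is roughly n^2 times the local variance (n * sumsq - sum^2, clamped
+ // at 0), z scales it by the strength parameter s, and x is the resulting
+ // filter weight looked up in dav1d_sgr_x_by_x.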
+
+ // This is where we invert A and B, so that B fits in the smaller coef type.
+ AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
+ BB[i] = x;
+ }
+ AA += step * REST_UNIT_STRIDE;
+ BB += step * REST_UNIT_STRIDE;
+ }
+
+ src += 3 * REST_UNIT_STRIDE + 3;
+ if (n == 25) {
+ int j = 0;
+#define SIX_NEIGHBORS(P, i)\
+ ((P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 6 + \
+ (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
+ P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
+ for (; j < h - 1; j+=2) {
+ for (int i = 0; i < w; i++) {
+ const int a = SIX_NEIGHBORS(B, i);
+ const int b = SIX_NEIGHBORS(A, i);
+ dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
+ }
+ dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ for (int i = 0; i < w; i++) {
+ const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
+ const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
+ dst[i] = (b - a * src[i] + (1 << 7)) >> 8;
+ }
+ dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ }
+ if (j + 1 == h) { // Last row, when number of rows is odd
+ for (int i = 0; i < w; i++) {
+ const int a = SIX_NEIGHBORS(B, i);
+ const int b = SIX_NEIGHBORS(A, i);
+ dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
+ }
+ }
+#undef SIX_NEIGHBORS
+ } else {
+#define EIGHT_NEIGHBORS(P, i)\
+ ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
+ (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
+ P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int a = EIGHT_NEIGHBORS(B, i);
+ const int b = EIGHT_NEIGHBORS(A, i);
+ dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
+ }
+ dst += 384;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ }
+ }
+#undef EIGHT_NEIGHBORS
+}
+
+static void sgr_5x5_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ // The selfguided filter is applied to a maximum stripe height of 64, plus
+ // 3 pixels of padding above and below
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+
+ // Selfguided filter outputs to a maximum stripe height of 64 and a
+ // maximum restoration width of 384 (256 * 1.5)
+ coef dst[64 * 384];
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25,
+ params->sgr.s0 HIGHBD_TAIL_SUFFIX);
+
+ const int w0 = params->sgr.w0;
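+ // w0 is a Q7 weight and dst holds the filtered-minus-source correction
+ // with 4 fractional bits, hence the rounded >> 11 when adding it back.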
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int v = w0 * dst[j * 384 + i];
+ p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
+ }
+ p += PXSTRIDE(stride);
+ }
+}
+
+static void sgr_3x3_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ coef dst[64 * 384];
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9,
+ params->sgr.s1 HIGHBD_TAIL_SUFFIX);
+
+ const int w1 = params->sgr.w1;
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int v = w1 * dst[j * 384 + i];
+ p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
+ }
+ p += PXSTRIDE(stride);
+ }
+}
+
+static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ coef dst0[64 * 384];
+ coef dst1[64 * 384];
+
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25,
+ params->sgr.s0 HIGHBD_TAIL_SUFFIX);
+ selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9,
+ params->sgr.s1 HIGHBD_TAIL_SUFFIX);
+
+ const int w0 = params->sgr.w0;
+ const int w1 = params->sgr.w1;
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int v = w0 * dst0[j * 384 + i] + w1 * dst1[j * 384 + i];
+ p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
+ }
+ p += PXSTRIDE(stride);
+ }
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/looprestoration.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/looprestoration.h"
+#elif ARCH_X86
+#include "src/x86/looprestoration.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
+ const int bpc)
+{
+ c->wiener[0] = c->wiener[1] = wiener_c;
+ c->sgr[0] = sgr_5x5_c;
+ c->sgr[1] = sgr_3x3_c;
+ c->sgr[2] = sgr_mix_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ loop_restoration_dsp_init_arm(c, bpc);
+#elif ARCH_PPC64LE
+ loop_restoration_dsp_init_ppc(c, bpc);
+#elif ARCH_X86
+ loop_restoration_dsp_init_x86(c, bpc);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/lr_apply.h b/third_party/dav1d/src/lr_apply.h
new file mode 100644
index 0000000000..2815367534
--- /dev/null
+++ b/third_party/dav1d/src/lr_apply.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_LR_APPLY_H
+#define DAV1D_SRC_LR_APPLY_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/internal.h"
+
+enum LrRestorePlanes {
+ LR_RESTORE_Y = 1 << 0,
+ LR_RESTORE_U = 1 << 1,
+ LR_RESTORE_V = 1 << 2,
+};
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+ int sby);
+
+#endif /* DAV1D_SRC_LR_APPLY_H */
diff --git a/third_party/dav1d/src/lr_apply_tmpl.c b/third_party/dav1d/src/lr_apply_tmpl.c
new file mode 100644
index 0000000000..c517f89820
--- /dev/null
+++ b/third_party/dav1d/src/lr_apply_tmpl.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+
+#include "common/intops.h"
+
+#include "src/lr_apply.h"
+
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+ const pixel (*left)[4], int x, int y,
+ const int plane, const int unit_w, const int row_h,
+ const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
+{
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int chroma = !!plane;
+ const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+ const ptrdiff_t stride = f->sr_cur.p.stride[chroma];
+ const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
+ const int have_tt = f->c->n_tc > 1;
+ const pixel *lpf = f->lf.lr_lpf_line[plane] +
+ have_tt * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(stride) + x;
+
+ // The first stripe of the frame is shorter by 8 luma pixel rows.
+ int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
+
+ looprestorationfilter_fn lr_fn;
+ LooprestorationParams params;
+ if (lr->type == DAV1D_RESTORATION_WIENER) {
+ int16_t (*const filter)[8] = params.filter;
+ filter[0][0] = filter[0][6] = lr->filter_h[0];
+ filter[0][1] = filter[0][5] = lr->filter_h[1];
+ filter[0][2] = filter[0][4] = lr->filter_h[2];
+ filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
+#if BITDEPTH != 8
+ /* For 8-bit SIMD it's beneficial to handle the +128 separately
+ * in order to avoid overflows. */
+ filter[0][3] += 128;
+#endif
+
+ filter[1][0] = filter[1][6] = lr->filter_v[0];
+ filter[1][1] = filter[1][5] = lr->filter_v[1];
+ filter[1][2] = filter[1][4] = lr->filter_v[2];
+ filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
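+        // The centre taps make each 7-tap filter sum to 128 (unit DC gain in
+        // Q7); at 8 bpc the horizontal filter's +128 is instead applied
+        // during filtering, as noted above.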
+
+ lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
+ } else {
+ assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
+ const uint16_t *const sgr_params = dav1d_sgr_params[lr->sgr_idx];
+ params.sgr.s0 = sgr_params[0];
+ params.sgr.s1 = sgr_params[1];
+ params.sgr.w0 = lr->sgr_weights[0];
+ params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]);
+
+ lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1];
+ }
+
+ while (y + stripe_h <= row_h) {
+ // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
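+        // (branchless bit assignment: x ^= (-c ^ x) & m sets the bits in m
+        // to the value of c, for c in {0, 1})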
+ edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
+ lr_fn(p, stride, left, lpf, unit_w, stripe_h, &params, edges HIGHBD_CALL_SUFFIX);
+
+ left += stripe_h;
+ y += stripe_h;
+ p += stripe_h * PXSTRIDE(stride);
+ edges |= LR_HAVE_TOP;
+ stripe_h = imin(64 >> ss_ver, row_h - y);
+ if (stripe_h == 0) break;
+ lpf += 4 * PXSTRIDE(stride);
+ }
+}
+
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
+ int u)
+{
+ for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+ pixel_copy(dst, src, 4);
+}
+
+static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
+ const int w, const int h, const int row_h, const int plane)
+{
+ const int chroma = !!plane;
+ const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+ const int ss_hor = chroma & (f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
+ const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
+
+ const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!plane];
+ const int unit_size = 1 << unit_size_log2;
+ const int half_unit_size = unit_size >> 1;
+ const int max_unit_size = unit_size + half_unit_size;
+
+ // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
+ const int row_y = y + ((8 >> ss_ver) * !!y);
+
+ // FIXME This is an ugly hack to look up the proper AV1Filter unit for
+ // chroma planes. Question: for multithreaded decoding, is it better
+ // to store the chroma LR information with the collocated luma information?
+ // In other words, for a chroma restoration unit located at (128,128)
+ // with 4:2:0 chroma subsampling, do we store the filter information at
+ // the AV1Filter unit located at (128,128) or at (256,256)?
+ // TODO Support chroma subsampling.
+ const int shift_hor = 7 - ss_hor;
+
+ /* maximum sbrow height is 128, plus an 8-row offset */
+ ALIGN_STK_16(pixel, pre_lr_border, 2, [128 + 8][4]);
+ const Av1RestorationUnit *lr[2];
+
+ enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT;
+
+ int aligned_unit_pos = row_y & ~(unit_size - 1);
+ if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
+ aligned_unit_pos -= unit_size;
+ aligned_unit_pos <<= ss_ver;
+ const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
+ const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
+ lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx];
+ int restore = lr[0]->type != DAV1D_RESTORATION_NONE;
+ int x = 0, bit = 0;
+ for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) {
+ const int next_x = x + unit_size;
+ const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1);
+ lr[!bit] =
+ &f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx];
+ const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE;
+ if (restore_next)
+ backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y);
+ if (restore)
+ lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h,
+ lr[bit], edges);
+ x = next_x;
+ restore = restore_next;
+ }
+ if (restore) {
+ edges &= ~LR_HAVE_RIGHT;
+ const int unit_w = w - x;
+ lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges);
+ }
+}
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+ const int sby)
+{
+ const int offset_y = 8 * !!sby;
+ const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
+ const int restore_planes = f->lf.restore_planes;
+ const int not_last = sby + 1 < f->sbh;
+
+ if (restore_planes & LR_RESTORE_Y) {
+ const int h = f->sr_cur.p.p.h;
+ const int w = f->sr_cur.p.p.w;
+ const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
+ const int row_h = imin(next_row_y - 8 * not_last, h);
+ const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
+ lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
+ h, row_h, 0);
+ }
+ if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+ const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
+ const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
+ const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
+ const int offset_uv = offset_y >> ss_ver;
+ const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
+ if (restore_planes & LR_RESTORE_U)
+ lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+ w, h, row_h, 1);
+
+ if (restore_planes & LR_RESTORE_V)
+ lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+ w, h, row_h, 2);
+ }
+}
diff --git a/third_party/dav1d/src/mc.h b/third_party/dav1d/src/mc.h
new file mode 100644
index 0000000000..59ba2d9a5a
--- /dev/null
+++ b/third_party/dav1d/src/mc.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MC_H
+#define DAV1D_SRC_MC_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include "common/bitdepth.h"
+
+#include "src/levels.h"
+
+#define decl_mc_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_mc_fn(*mc_fn);
+
+#define decl_mc_scaled_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX)
+typedef decl_mc_scaled_fn(*mc_scaled_fn);
+
+#define decl_warp8x8_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_warp8x8_fn(*warp8x8_fn);
+
+#define decl_mct_fn(name) \
+void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_mct_fn(*mct_fn);
+
+#define decl_mct_scaled_fn(name) \
+void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, \
+ int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX)
+typedef decl_mct_scaled_fn(*mct_scaled_fn);
+
+#define decl_warp8x8t_fn(name) \
+void (name)(int16_t *tmp, const ptrdiff_t tmp_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX)
+typedef decl_warp8x8t_fn(*warp8x8t_fn);
+
+#define decl_avg_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_avg_fn(*avg_fn);
+
+#define decl_w_avg_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h, int weight \
+ HIGHBD_DECL_SUFFIX)
+typedef decl_w_avg_fn(*w_avg_fn);
+
+#define decl_mask_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h, \
+ const uint8_t *mask HIGHBD_DECL_SUFFIX)
+typedef decl_mask_fn(*mask_fn);
+
+#define decl_w_mask_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const int16_t *tmp1, const int16_t *tmp2, int w, int h, \
+ uint8_t *mask, int sign HIGHBD_DECL_SUFFIX)
+typedef decl_w_mask_fn(*w_mask_fn);
+
+#define decl_blend_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \
+ int w, int h, const uint8_t *mask)
+typedef decl_blend_fn(*blend_fn);
+
+#define decl_blend_dir_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h)
+typedef decl_blend_dir_fn(*blend_dir_fn);
+
+#define decl_emu_edge_fn(name) \
+void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \
+ pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
+typedef decl_emu_edge_fn(*emu_edge_fn);
+
+#define decl_resize_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const pixel *src, ptrdiff_t src_stride, \
+ int dst_w, int h, int src_w, int dx, int mx HIGHBD_DECL_SUFFIX)
+typedef decl_resize_fn(*resize_fn);
+
+typedef struct Dav1dMCDSPContext {
+ mc_fn mc[N_2D_FILTERS];
+ mc_scaled_fn mc_scaled[N_2D_FILTERS];
+ mct_fn mct[N_2D_FILTERS];
+ mct_scaled_fn mct_scaled[N_2D_FILTERS];
+ avg_fn avg;
+ w_avg_fn w_avg;
+ mask_fn mask;
+ w_mask_fn w_mask[3 /* 444, 422, 420 */];
+ blend_fn blend;
+ blend_dir_fn blend_v;
+ blend_dir_fn blend_h;
+ warp8x8_fn warp8x8;
+ warp8x8t_fn warp8x8t;
+ emu_edge_fn emu_edge;
+ resize_fn resize;
+} Dav1dMCDSPContext;
+
+bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c);
+
+#endif /* DAV1D_SRC_MC_H */
diff --git a/third_party/dav1d/src/mc_tmpl.c b/third_party/dav1d/src/mc_tmpl.c
new file mode 100644
index 0000000000..20226d8a39
--- /dev/null
+++ b/third_party/dav1d/src/mc_tmpl.c
@@ -0,0 +1,953 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/mc.h"
+#include "src/tables.h"
+
+#if BITDEPTH == 8
+#define get_intermediate_bits(bitdepth_max) 4
+// Output in interval [-5132, 9212], fits in int16_t as is
+#define PREP_BIAS 0
+#else
+// 4 for 10 bits/component, 2 for 12 bits/component
+#define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max))
+// Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
+// Subtract a bias to ensure the output fits in int16_t
+#define PREP_BIAS 8192
+#endif
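+// For example, at 10 bpc intermediate_bits is 14 - 10 = 4, so prep_c()
+// stores (pixel << 4) - 8192, which keeps the prep output within int16_t
+// range as described above.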
+
+static NOINLINE void
+put_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride, const int w, int h)
+{
+ do {
+ pixel_copy(dst, src, w);
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+}
+
+static NOINLINE void
+prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
+ const int w, int h HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+}
+
+#define FILTER_8TAP(src, x, F, stride) \
+ (F[0] * src[x + -3 * stride] + \
+ F[1] * src[x + -2 * stride] + \
+ F[2] * src[x + -1 * stride] + \
+ F[3] * src[x + +0 * stride] + \
+ F[4] * src[x + +1 * stride] + \
+ F[5] * src[x + +2 * stride] + \
+ F[6] * src[x + +3 * stride] + \
+ F[7] * src[x + +4 * stride])
+
+#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
+ ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
+
+#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
+ ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))
+
+#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
+ iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
+
+#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
+ iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))
+
+#define GET_H_FILTER(mx) \
+ const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
+ dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
+ dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]
+
+#define GET_V_FILTER(my) \
+ const int8_t *const fv = !(my) ? NULL : h > 4 ? \
+ dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \
+ dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1]
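+// Blocks with w (resp. h) of 4 or less select rows 3 and 4 of the table,
+// i.e. the 4-tap filter variants; the sharp filter falls back to the
+// regular 4-tap filter for such blocks.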
+
+#define GET_FILTERS() \
+ GET_H_FILTER(mx); \
+ GET_V_FILTER(my)
+
+static NOINLINE void
+put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my,
+ const int filter_type HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
+
+ GET_FILTERS();
+ dst_stride = PXSTRIDE(dst_stride);
+ src_stride = PXSTRIDE(src_stride);
+
+ if (fh) {
+ if (fv) {
+ int tmp_h = h + 7;
+ int16_t mid[128 * 135], *mid_ptr = mid;
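+            // 2-D case: filter horizontally into a 16-bit scratch buffer
+            // (128 pixels wide), keeping intermediate_bits of extra
+            // precision, then filter vertically out of it; h + 7 rows are
+            // needed because the vertical 8-tap filter reads 3 rows above
+            // and 4 rows below each output row.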
+
+ src -= src_stride * 3;
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+ 6 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
+ 6 + intermediate_bits);
+
+ mid_ptr += 128;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++) {
+ dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
+ intermediate_rnd, 6);
+ }
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (fv) {
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ } else
+ put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static NOINLINE void
+put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy, const int filter_type
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int intermediate_rnd = (1 << intermediate_bits) >> 1;
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
+ int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
+ src_stride = PXSTRIDE(src_stride);
+
+ src -= src_stride * 3;
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ GET_H_FILTER(imx >> 6);
+ mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
+ 6 - intermediate_bits) :
+ src[ioff] << intermediate_bits;
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ for (int y = 0; y < h; y++) {
+ int x;
+ GET_V_FILTER(my >> 6);
+
+ for (x = 0; x < w; x++)
+ dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
+ 6 + intermediate_bits) :
+ iclip_pixel((mid_ptr[x] + intermediate_rnd) >>
+ intermediate_bits);
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static NOINLINE void
+prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my,
+ const int filter_type HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ GET_FILTERS();
+ src_stride = PXSTRIDE(src_stride);
+
+ if (fh) {
+ if (fv) {
+ int tmp_h = h + 7;
+ int16_t mid[128 * 135], *mid_ptr = mid;
+
+ src -= src_stride * 3;
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+ 6 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ do {
+ for (int x = 0; x < w; x++) {
+ int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) -
+ PREP_BIAS;
+ assert(t >= INT16_MIN && t <= INT16_MAX);
+ tmp[x] = t;
+ }
+
+ mid_ptr += 128;
+ tmp += w;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
+ 6 - intermediate_bits) -
+ PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (fv) {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride,
+ 6 - intermediate_bits) -
+ PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ } else
+ prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
+}
+
+static NOINLINE void
+prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy, const int filter_type
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
+ int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
+ src_stride = PXSTRIDE(src_stride);
+
+ src -= src_stride * 3;
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ GET_H_FILTER(imx >> 6);
+ mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
+ 6 - intermediate_bits) :
+ src[ioff] << intermediate_bits;
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid + 128 * 3;
+ for (int y = 0; y < h; y++) {
+ int x;
+ GET_V_FILTER(my >> 6);
+
+ for (x = 0; x < w; x++)
+ tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6)
+ : mid_ptr[x]) - PREP_BIAS;
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ tmp += w;
+ }
+}
+
+#define filter_fns(type, type_h, type_v) \
+static void put_8tap_##type##_c(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void put_8tap_##type##_scaled_c(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my, \
+ const int dx, const int dy \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void prep_8tap_##type##_c(int16_t *const tmp, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+} \
+static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \
+ const pixel *const src, \
+ const ptrdiff_t src_stride, \
+ const int w, const int h, \
+ const int mx, const int my, \
+ const int dx, const int dy \
+ HIGHBD_DECL_SUFFIX) \
+{ \
+ prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \
+ type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
+}
+
+filter_fns(regular, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(regular_sharp, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP)
+filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH)
+filter_fns(smooth, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH)
+filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(smooth_sharp, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP)
+filter_fns(sharp, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP)
+filter_fns(sharp_regular, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR)
+filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH)
+
+#define FILTER_BILIN(src, x, mxy, stride) \
+ (16 * src[x] + ((mxy) * (src[x + stride] - src[x])))
+
+#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
+ ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))
+
+#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
+ iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
+
+static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int intermediate_rnd = (1 << intermediate_bits) >> 1;
+ dst_stride = PXSTRIDE(dst_stride);
+ src_stride = PXSTRIDE(src_stride);
+
+ if (mx) {
+ if (my) {
+ int16_t mid[128 * 129], *mid_ptr = mid;
+ int tmp_h = h + 1;
+
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128,
+ 4 + intermediate_bits);
+
+ mid_ptr += 128;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++) {
+ const int px = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits);
+ dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
+ }
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (my) {
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
+
+ dst += dst_stride;
+ src += src_stride;
+ } while (--h);
+ } else
+ put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
+ int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
+
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
+ 4 - intermediate_bits);
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += PXSTRIDE(src_stride);
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ int x;
+
+ for (x = 0; x < w; x++)
+ dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128,
+ 4 + intermediate_bits);
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+static void prep_bilin_c(int16_t *tmp,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, const int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ src_stride = PXSTRIDE(src_stride);
+
+ if (mx) {
+ if (my) {
+ int16_t mid[128 * 129], *mid_ptr = mid;
+ int tmp_h = h + 1;
+
+ do {
+ for (int x = 0; x < w; x++)
+ mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits);
+
+ mid_ptr += 128;
+ src += src_stride;
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) -
+ PREP_BIAS;
+
+ mid_ptr += 128;
+ tmp += w;
+ } while (--h);
+ } else {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(src, x, mx, 1,
+ 4 - intermediate_bits) -
+ PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ }
+ } else if (my) {
+ do {
+ for (int x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride,
+ 4 - intermediate_bits) - PREP_BIAS;
+
+ tmp += w;
+ src += src_stride;
+ } while (--h);
+ } else
+ prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
+}
+
+static void prep_bilin_scaled_c(int16_t *tmp,
+ const pixel *src, ptrdiff_t src_stride,
+ const int w, int h, const int mx, int my,
+ const int dx, const int dy HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
+ int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
+
+ do {
+ int x;
+ int imx = mx, ioff = 0;
+
+ for (x = 0; x < w; x++) {
+ mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
+ 4 - intermediate_bits);
+ imx += dx;
+ ioff += imx >> 10;
+ imx &= 0x3ff;
+ }
+
+ mid_ptr += 128;
+ src += PXSTRIDE(src_stride);
+ } while (--tmp_h);
+
+ mid_ptr = mid;
+ do {
+ int x;
+
+ for (x = 0; x < w; x++)
+ tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4) - PREP_BIAS;
+
+ my += dy;
+ mid_ptr += (my >> 10) * 128;
+ my &= 0x3ff;
+ tmp += w;
+ } while (--h);
+}
+
+static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int sh = intermediate_bits + 1;
+ const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2;
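+    // rnd includes PREP_BIAS * 2 to cancel the bias that the prep functions
+    // subtracted from each of the two inputs.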
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh);
+
+ tmp1 += w;
+ tmp2 += w;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ const int weight HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int sh = intermediate_bits + 4;
+ const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel((tmp1[x] * weight +
+ tmp2[x] * (16 - weight) + rnd) >> sh);
+
+ tmp1 += w;
+ tmp2 += w;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ const uint8_t *mask HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int sh = intermediate_bits + 6;
+ const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
+ do {
+ for (int x = 0; x < w; x++)
+ dst[x] = iclip_pixel((tmp1[x] * mask[x] +
+ tmp2[x] * (64 - mask[x]) + rnd) >> sh);
+
+ tmp1 += w;
+ tmp2 += w;
+ mask += w;
+ dst += PXSTRIDE(dst_stride);
+ } while (--h);
+}
+
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+ const int w, int h, const uint8_t *mask)
+{
+ do {
+ for (int x = 0; x < w; x++) {
+ dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += w;
+ mask += w;
+ } while (--h);
+}
+
+static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+ const int w, int h)
+{
+ const uint8_t *const mask = &dav1d_obmc_masks[w];
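+    // Only the leftmost 3/4 of the block is blended; beyond that the OBMC
+    // mask weights are zero, so those pixels would be left unchanged anyway.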
+ do {
+ for (int x = 0; x < (w * 3) >> 2; x++) {
+ dst[x] = blend_px(dst[x], tmp[x], mask[x]);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += w;
+ } while (--h);
+}
+
+static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+ const int w, int h)
+{
+ const uint8_t *mask = &dav1d_obmc_masks[h];
+ h = (h * 3) >> 2;
+ do {
+ const int m = *mask++;
+ for (int x = 0; x < w; x++) {
+ dst[x] = blend_px(dst[x], tmp[x], m);
+ }
+ dst += PXSTRIDE(dst_stride);
+ tmp += w;
+ } while (--h);
+}
+
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
+ uint8_t *mask, const int sign,
+ const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
+{
+    // Store the mask at 2x2 resolution, i.e. store the 2x1 sum for even rows,
+    // and then load this intermediate to calculate the final value for odd rows
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ const int bitdepth = bitdepth_from_max(bitdepth_max);
+ const int sh = intermediate_bits + 6;
+ const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
+ const int mask_sh = bitdepth + intermediate_bits - 4;
+ const int mask_rnd = 1 << (mask_sh - 5);
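+    // The per-pixel weight m lies in [38, 64]: similar predictions blend at
+    // roughly 38:26, while a large difference selects tmp1 outright (m == 64).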
+ do {
+ for (int x = 0; x < w; x++) {
+ const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
+ dst[x] = iclip_pixel((tmp1[x] * m +
+ tmp2[x] * (64 - m) + rnd) >> sh);
+
+ if (ss_hor) {
+ x++;
+
+ const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
+ dst[x] = iclip_pixel((tmp1[x] * n +
+ tmp2[x] * (64 - n) + rnd) >> sh);
+
+ if (h & ss_ver) {
+ mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
+ } else if (ss_ver) {
+ mask[x >> 1] = m + n;
+ } else {
+ mask[x >> 1] = (m + n + 1 - sign) >> 1;
+ }
+ } else {
+ mask[x] = m;
+ }
+ }
+
+ tmp1 += w;
+ tmp2 += w;
+ dst += PXSTRIDE(dst_stride);
+ if (!ss_ver || (h & 1)) mask += w >> ss_hor;
+ } while (--h);
+}
+
+#define w_mask_fns(ssn, ss_hor, ss_ver) \
+static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
+ const int16_t *const tmp1, const int16_t *const tmp2, \
+ const int w, const int h, uint8_t *mask, \
+ const int sign HIGHBD_DECL_SUFFIX) \
+{ \
+ w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \
+ HIGHBD_TAIL_SUFFIX); \
+}
+
+w_mask_fns(444, 0, 0);
+w_mask_fns(422, 1, 0);
+w_mask_fns(420, 1, 1);
+
+#undef w_mask_fns
+
+#define FILTER_WARP_RND(src, x, F, stride, sh) \
+ ((F[0] * src[x - 3 * stride] + \
+ F[1] * src[x - 2 * stride] + \
+ F[2] * src[x - 1 * stride] + \
+ F[3] * src[x + 0 * stride] + \
+ F[4] * src[x + 1 * stride] + \
+ F[5] * src[x + 2 * stride] + \
+ F[6] * src[x + 3 * stride] + \
+ F[7] * src[x + 4 * stride] + \
+ ((1 << (sh)) >> 1)) >> (sh))
+
+#define FILTER_WARP_CLIP(src, x, F, stride, sh) \
+ iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))
+
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *const abcd, int mx, int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int16_t mid[15 * 8], *mid_ptr = mid;
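+    // 15 intermediate rows = 8 output rows + 7 extra rows (3 above, 4 below)
+    // for the vertical 8-tap pass.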
+
+ src -= 3 * PXSTRIDE(src_stride);
+ for (int y = 0; y < 15; y++, mx += abcd[1]) {
+ for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+ mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
+ 7 - intermediate_bits);
+ }
+ src += PXSTRIDE(src_stride);
+ mid_ptr += 8;
+ }
+
+ mid_ptr = &mid[3 * 8];
+ for (int y = 0; y < 8; y++, my += abcd[3]) {
+ for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+ dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8,
+ 7 + intermediate_bits);
+ }
+ mid_ptr += 8;
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int16_t *const abcd, int mx, int my
+ HIGHBD_DECL_SUFFIX)
+{
+ const int intermediate_bits = get_intermediate_bits(bitdepth_max);
+ int16_t mid[15 * 8], *mid_ptr = mid;
+
+ src -= 3 * PXSTRIDE(src_stride);
+ for (int y = 0; y < 15; y++, mx += abcd[1]) {
+ for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+ mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
+ 7 - intermediate_bits);
+ }
+ src += PXSTRIDE(src_stride);
+ mid_ptr += 8;
+ }
+
+ mid_ptr = &mid[3 * 8];
+ for (int y = 0; y < 8; y++, my += abcd[3]) {
+ for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+ const int8_t *const filter =
+ dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+ tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;
+ }
+ mid_ptr += 8;
+ tmp += tmp_stride;
+ }
+}
+
+static void emu_edge_c(const intptr_t bw, const intptr_t bh,
+ const intptr_t iw, const intptr_t ih,
+ const intptr_t x, const intptr_t y,
+ pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *ref, const ptrdiff_t ref_stride)
+{
+ // find offset in reference of visible block to copy
+ ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) +
+ iclip((int) x, 0, (int) iw - 1);
+
+ // number of pixels to extend (left, right, top, bottom)
+ const int left_ext = iclip((int) -x, 0, (int) bw - 1);
+ const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1);
+ assert(left_ext + right_ext < bw);
+ const int top_ext = iclip((int) -y, 0, (int) bh - 1);
+ const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
+ assert(top_ext + bottom_ext < bh);
+
+ // copy visible portion first
+ pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
+ const int center_w = (int) (bw - left_ext - right_ext);
+ const int center_h = (int) (bh - top_ext - bottom_ext);
+ for (int y = 0; y < center_h; y++) {
+ pixel_copy(blk + left_ext, ref, center_w);
+ // extend left edge for this line
+ if (left_ext)
+ pixel_set(blk, blk[left_ext], left_ext);
+ // extend right edge for this line
+ if (right_ext)
+ pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
+ right_ext);
+ ref += PXSTRIDE(ref_stride);
+ blk += PXSTRIDE(dst_stride);
+ }
+
+ // copy top
+ blk = dst + top_ext * PXSTRIDE(dst_stride);
+ for (int y = 0; y < top_ext; y++) {
+ pixel_copy(dst, blk, bw);
+ dst += PXSTRIDE(dst_stride);
+ }
+
+ // copy bottom
+ dst += center_h * PXSTRIDE(dst_stride);
+ for (int y = 0; y < bottom_ext; y++) {
+ pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel *src, const ptrdiff_t src_stride,
+ const int dst_w, int h, const int src_w,
+ const int dx, const int mx0 HIGHBD_DECL_SUFFIX)
+{
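+    // Horizontal-only resampling: mx is a 14-bit phase advanced by dx per
+    // output pixel, mx >> 8 selects the filter phase, and the source taps
+    // are clamped to [0, src_w - 1] at the frame edges.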
+ do {
+ int mx = mx0, src_x = -1;
+ for (int x = 0; x < dst_w; x++) {
+ const int8_t *const F = dav1d_resize_filter[mx >> 8];
+ dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] +
+ F[1] * src[iclip(src_x - 2, 0, src_w - 1)] +
+ F[2] * src[iclip(src_x - 1, 0, src_w - 1)] +
+ F[3] * src[iclip(src_x + 0, 0, src_w - 1)] +
+ F[4] * src[iclip(src_x + 1, 0, src_w - 1)] +
+ F[5] * src[iclip(src_x + 2, 0, src_w - 1)] +
+ F[6] * src[iclip(src_x + 3, 0, src_w - 1)] +
+ F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) +
+ 64) >> 7);
+ mx += dx;
+ src_x += mx >> 14;
+ mx &= 0x3fff;
+ }
+
+ dst += PXSTRIDE(dst_stride);
+ src += PXSTRIDE(src_stride);
+ } while (--h);
+}
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/mc.h"
+#elif ARCH_X86
+#include "src/x86/mc.h"
+#endif
+#endif
+
+COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
+#define init_mc_fns(type, name) do { \
+ c->mc [type] = put_##name##_c; \
+ c->mc_scaled [type] = put_##name##_scaled_c; \
+ c->mct [type] = prep_##name##_c; \
+ c->mct_scaled[type] = prep_##name##_scaled_c; \
+} while (0)
+
+ init_mc_fns(FILTER_2D_8TAP_REGULAR, 8tap_regular);
+ init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
+ init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp);
+ init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular);
+ init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth);
+ init_mc_fns(FILTER_2D_8TAP_SHARP, 8tap_sharp);
+ init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
+ init_mc_fns(FILTER_2D_8TAP_SMOOTH, 8tap_smooth);
+ init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp);
+ init_mc_fns(FILTER_2D_BILINEAR, bilin);
+
+ c->avg = avg_c;
+ c->w_avg = w_avg_c;
+ c->mask = mask_c;
+ c->blend = blend_c;
+ c->blend_v = blend_v_c;
+ c->blend_h = blend_h_c;
+ c->w_mask[0] = w_mask_444_c;
+ c->w_mask[1] = w_mask_422_c;
+ c->w_mask[2] = w_mask_420_c;
+ c->warp8x8 = warp_affine_8x8_c;
+ c->warp8x8t = warp_affine_8x8t_c;
+ c->emu_edge = emu_edge_c;
+ c->resize = resize_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ mc_dsp_init_arm(c);
+#elif ARCH_X86
+ mc_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/mem.c b/third_party/dav1d/src/mem.c
new file mode 100644
index 0000000000..558bc01cae
--- /dev/null
+++ b/third_party/dav1d/src/mem.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "src/internal.h"
+
+static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
+ pthread_mutex_destroy(&pool->lock);
+ free(pool);
+}
+
+void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
+ pthread_mutex_lock(&pool->lock);
+ const int ref_cnt = --pool->ref_cnt;
+ if (!pool->end) {
+ buf->next = pool->buf;
+ pool->buf = buf;
+ pthread_mutex_unlock(&pool->lock);
+ assert(ref_cnt > 0);
+ } else {
+ pthread_mutex_unlock(&pool->lock);
+ dav1d_free_aligned(buf->data);
+ if (!ref_cnt) mem_pool_destroy(pool);
+ }
+}
+
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t size) {
+ assert(!(size & (sizeof(void*) - 1)));
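+    // Each allocation places its Dav1dMemPoolBuffer header at the end of the
+    // block (at data + size), so buf - data equals the usable size of a
+    // recycled buffer and is compared against the requested size below.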
+ pthread_mutex_lock(&pool->lock);
+ Dav1dMemPoolBuffer *buf = pool->buf;
+ pool->ref_cnt++;
+ uint8_t *data;
+ if (buf) {
+ pool->buf = buf->next;
+ pthread_mutex_unlock(&pool->lock);
+ data = buf->data;
+ if ((uintptr_t)buf - (uintptr_t)data != size) {
+ /* Reallocate if the size has changed */
+ dav1d_free_aligned(data);
+ goto alloc;
+ }
+ } else {
+ pthread_mutex_unlock(&pool->lock);
+alloc:
+ data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64);
+ if (!data) {
+ pthread_mutex_lock(&pool->lock);
+ const int ref_cnt = --pool->ref_cnt;
+ pthread_mutex_unlock(&pool->lock);
+ if (!ref_cnt) mem_pool_destroy(pool);
+ return NULL;
+ }
+ buf = (Dav1dMemPoolBuffer*)(data + size);
+ buf->data = data;
+ }
+
+ return buf;
+}
+
+COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) {
+ Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool));
+ if (pool) {
+ if (!pthread_mutex_init(&pool->lock, NULL)) {
+ pool->buf = NULL;
+ pool->ref_cnt = 1;
+ pool->end = 0;
+ *ppool = pool;
+ return 0;
+ }
+ free(pool);
+ }
+ *ppool = NULL;
+ return DAV1D_ERR(ENOMEM);
+}
+
+COLD void dav1d_mem_pool_end(Dav1dMemPool *const pool) {
+ if (pool) {
+ pthread_mutex_lock(&pool->lock);
+ Dav1dMemPoolBuffer *buf = pool->buf;
+ const int ref_cnt = --pool->ref_cnt;
+ pool->buf = NULL;
+ pool->end = 1;
+ pthread_mutex_unlock(&pool->lock);
+
+ while (buf) {
+ void *const data = buf->data;
+ buf = buf->next;
+ dav1d_free_aligned(data);
+ }
+ if (!ref_cnt) mem_pool_destroy(pool);
+ }
+}
diff --git a/third_party/dav1d/src/mem.h b/third_party/dav1d/src/mem.h
new file mode 100644
index 0000000000..41ae47a2fd
--- /dev/null
+++ b/third_party/dav1d/src/mem.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MEM_H
+#define DAV1D_SRC_MEM_H
+
+#include <stdlib.h>
+
+#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
+#include <malloc.h>
+#endif
+
+#include "common/attributes.h"
+
+#include "src/thread.h"
+
+typedef struct Dav1dMemPoolBuffer {
+ void *data;
+ struct Dav1dMemPoolBuffer *next;
+} Dav1dMemPoolBuffer;
+
+typedef struct Dav1dMemPool {
+ pthread_mutex_t lock;
+ Dav1dMemPoolBuffer *buf;
+ int ref_cnt;
+ int end;
+} Dav1dMemPool;
+
+void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
+int dav1d_mem_pool_init(Dav1dMemPool **pool);
+void dav1d_mem_pool_end(Dav1dMemPool *pool);
+
+/*
+ * Allocate align-byte aligned memory. The return value can be released
+ * by calling the dav1d_free_aligned() function.
+ */
+static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
+ assert(!(align & (align - 1)));
+#ifdef HAVE_POSIX_MEMALIGN
+ void *ptr;
+ if (posix_memalign(&ptr, align, sz)) return NULL;
+ return ptr;
+#elif defined(HAVE_ALIGNED_MALLOC)
+ return _aligned_malloc(sz, align);
+#elif defined(HAVE_MEMALIGN)
+ return memalign(align, sz);
+#else
+#error Missing aligned alloc implementation
+#endif
+}
+
+static inline void dav1d_free_aligned(void* ptr) {
+#ifdef HAVE_POSIX_MEMALIGN
+ free(ptr);
+#elif defined(HAVE_ALIGNED_MALLOC)
+ _aligned_free(ptr);
+#elif defined(HAVE_MEMALIGN)
+ free(ptr);
+#endif
+}
+
+static inline void dav1d_freep_aligned(void* ptr) {
+ void **mem = (void **) ptr;
+ if (*mem) {
+ dav1d_free_aligned(*mem);
+ *mem = NULL;
+ }
+}
+
+static inline void freep(void *ptr) {
+ void **mem = (void **) ptr;
+ if (*mem) {
+ free(*mem);
+ *mem = NULL;
+ }
+}
+
+#endif /* DAV1D_SRC_MEM_H */
diff --git a/third_party/dav1d/src/meson.build b/third_party/dav1d/src/meson.build
new file mode 100644
index 0000000000..2ace55e1a8
--- /dev/null
+++ b/third_party/dav1d/src/meson.build
@@ -0,0 +1,348 @@
+# Copyright © 2018-2019, VideoLAN and dav1d authors
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Build definition for the dav1d library
+#
+
+# libdav1d source files
+libdav1d_sources = files(
+ 'cdf.c',
+ 'cpu.c',
+ 'data.c',
+ 'decode.c',
+ 'dequant_tables.c',
+ 'getbits.c',
+ 'intra_edge.c',
+ 'itx_1d.c',
+ 'lf_mask.c',
+ 'lib.c',
+ 'log.c',
+ 'mem.c',
+ 'msac.c',
+ 'obu.c',
+ 'picture.c',
+ 'qm.c',
+ 'ref.c',
+ 'refmvs.c',
+ 'scan.c',
+ 'tables.c',
+ 'thread_task.c',
+ 'warpmv.c',
+ 'wedge.c',
+)
+
+# libdav1d bitdepth source files
+# These files are compiled for each bitdepth with
+# `BITDEPTH` defined to the currently built bitdepth.
+libdav1d_tmpl_sources = files(
+ 'cdef_apply_tmpl.c',
+ 'cdef_tmpl.c',
+ 'fg_apply_tmpl.c',
+ 'filmgrain_tmpl.c',
+ 'ipred_prepare_tmpl.c',
+ 'ipred_tmpl.c',
+ 'itx_tmpl.c',
+ 'lf_apply_tmpl.c',
+ 'loopfilter_tmpl.c',
+ 'looprestoration_tmpl.c',
+ 'lr_apply_tmpl.c',
+ 'mc_tmpl.c',
+ 'recon_tmpl.c',
+)
+
+libdav1d_arch_tmpl_sources = []
+
+libdav1d_bitdepth_objs = []
+
+# ASM specific sources
+libdav1d_asm_objs = []
+# Arch-specific flags
+arch_flags = []
+if is_asm_enabled
+ if (host_machine.cpu_family() == 'aarch64' or
+ host_machine.cpu_family().startswith('arm'))
+
+ libdav1d_sources += files(
+ 'arm/cpu.c',
+ )
+ if (host_machine.cpu_family() == 'aarch64' or
+ host_machine.cpu() == 'arm64')
+ libdav1d_sources_asm = files(
+ # itx.S is used for both 8 and 16 bpc.
+ 'arm/64/itx.S',
+ 'arm/64/looprestoration_common.S',
+ 'arm/64/msac.S',
+ 'arm/64/refmvs.S',
+ )
+
+ if dav1d_bitdepths.contains('8')
+ libdav1d_sources_asm += files(
+ 'arm/64/cdef.S',
+ 'arm/64/filmgrain.S',
+ 'arm/64/ipred.S',
+ 'arm/64/loopfilter.S',
+ 'arm/64/looprestoration.S',
+ 'arm/64/mc.S',
+ )
+ endif
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
+ 'arm/64/cdef16.S',
+ 'arm/64/filmgrain16.S',
+ 'arm/64/ipred16.S',
+ 'arm/64/itx16.S',
+ 'arm/64/loopfilter16.S',
+ 'arm/64/looprestoration16.S',
+ 'arm/64/mc16.S',
+ )
+ endif
+ elif host_machine.cpu_family().startswith('arm')
+ libdav1d_sources_asm = files(
+ # itx.S is used for both 8 and 16 bpc.
+ 'arm/32/itx.S',
+ 'arm/32/looprestoration_common.S',
+ 'arm/32/msac.S',
+ 'arm/32/refmvs.S',
+ )
+
+ if dav1d_bitdepths.contains('8')
+ libdav1d_sources_asm += files(
+ 'arm/32/cdef.S',
+ 'arm/32/filmgrain.S',
+ 'arm/32/ipred.S',
+ 'arm/32/loopfilter.S',
+ 'arm/32/looprestoration.S',
+ 'arm/32/mc.S',
+ )
+ endif
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
+ 'arm/32/cdef16.S',
+ 'arm/32/filmgrain16.S',
+ 'arm/32/ipred16.S',
+ 'arm/32/itx16.S',
+ 'arm/32/loopfilter16.S',
+ 'arm/32/looprestoration16.S',
+ 'arm/32/mc16.S',
+ )
+ endif
+ endif
+
+ if use_gaspp
+ libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm)
+ else
+ libdav1d_sources += libdav1d_sources_asm
+ endif
+ elif host_machine.cpu_family().startswith('x86')
+
+ libdav1d_sources += files(
+ 'x86/cpu.c',
+ )
+
+ # NASM source files
+ libdav1d_sources_asm = files(
+ 'x86/cpuid.asm',
+ 'x86/msac.asm',
+ 'x86/refmvs.asm',
+ 'x86/itx_avx512.asm',
+ 'x86/cdef_avx2.asm',
+ 'x86/itx_avx2.asm',
+ 'x86/looprestoration_avx2.asm',
+ 'x86/cdef_sse.asm',
+ 'x86/itx_sse.asm',
+ )
+
+ if dav1d_bitdepths.contains('8')
+ libdav1d_sources_asm += files(
+ 'x86/cdef_avx512.asm',
+ 'x86/filmgrain_avx512.asm',
+ 'x86/ipred_avx512.asm',
+ 'x86/loopfilter_avx512.asm',
+ 'x86/looprestoration_avx512.asm',
+ 'x86/mc_avx512.asm',
+ 'x86/filmgrain_avx2.asm',
+ 'x86/ipred_avx2.asm',
+ 'x86/loopfilter_avx2.asm',
+ 'x86/mc_avx2.asm',
+ 'x86/filmgrain_sse.asm',
+ 'x86/ipred_sse.asm',
+ 'x86/loopfilter_sse.asm',
+ 'x86/looprestoration_sse.asm',
+ 'x86/mc_sse.asm',
+ )
+ endif
+
+ if dav1d_bitdepths.contains('16')
+ libdav1d_sources_asm += files(
+ 'x86/cdef16_avx512.asm',
+ 'x86/filmgrain16_avx512.asm',
+ 'x86/ipred16_avx512.asm',
+ 'x86/itx16_avx512.asm',
+ 'x86/loopfilter16_avx512.asm',
+ 'x86/looprestoration16_avx512.asm',
+ 'x86/mc16_avx512.asm',
+ 'x86/cdef16_avx2.asm',
+ 'x86/filmgrain16_avx2.asm',
+ 'x86/ipred16_avx2.asm',
+ 'x86/itx16_avx2.asm',
+ 'x86/loopfilter16_avx2.asm',
+ 'x86/looprestoration16_avx2.asm',
+ 'x86/mc16_avx2.asm',
+ 'x86/cdef16_sse.asm',
+ 'x86/filmgrain16_sse.asm',
+ 'x86/ipred16_sse.asm',
+ 'x86/itx16_sse.asm',
+ 'x86/loopfilter16_sse.asm',
+ 'x86/looprestoration16_sse.asm',
+ 'x86/mc16_sse.asm',
+ )
+ endif
+
+ # Compile the ASM sources with NASM
+ libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
+ elif host_machine.cpu() == 'ppc64le'
+ arch_flags = ['-maltivec', '-mvsx']
+ libdav1d_sources += files(
+ 'ppc/cpu.c',
+ )
+ libdav1d_arch_tmpl_sources += files(
+ 'ppc/cdef_tmpl.c',
+ 'ppc/looprestoration_tmpl.c',
+ )
+ endif
+endif
+
+
+
+libdav1d_rc_obj = []
+libdav1d_flags = []
+api_export_flags = []
+
+#
+# Windows .rc file and API export flags
+#
+
+if host_machine.system() == 'windows'
+ if get_option('default_library') != 'static'
+ rc_file = configure_file(
+ input : 'dav1d.rc.in',
+ output : 'dav1d.rc',
+ configuration : rc_data
+ )
+
+ libdav1d_rc_obj = winmod.compile_resources(rc_file)
+
+ api_export_flags = ['-DDAV1D_BUILDING_DLL']
+ endif
+
+ if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
+ # We don't expect to reference data members from other DLLs without
+ # dllimport attributes. Set the -mcmodel=small flag, which avoids
+ # generating indirection via .refptr.<symname> for all potentially
+ # dllimported variable references.
+ libdav1d_flags += '-mcmodel=small'
+ endif
+endif
+
+
+
+#
+# Library definitions
+#
+
+# Helper library for each bitdepth
+libdav1d_bitdepth_objs = []
+foreach bitdepth : dav1d_bitdepths
+ libdav1d_bitdepth_objs += static_library(
+ 'dav1d_bitdepth_@0@'.format(bitdepth),
+ libdav1d_tmpl_sources, config_h_target,
+ include_directories: dav1d_inc_dirs,
+ dependencies : [stdatomic_dependencies],
+ c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags,
+ install : false,
+ build_by_default : false,
+ ).extract_all_objects(recursive: true)
+endforeach
+
+# Helper library for each bitdepth and architecture-specific flags
+foreach bitdepth : dav1d_bitdepths
+ libdav1d_bitdepth_objs += static_library(
+ 'dav1d_arch_bitdepth_@0@'.format(bitdepth),
+ libdav1d_arch_tmpl_sources, config_h_target,
+ include_directories: dav1d_inc_dirs,
+ dependencies : [stdatomic_dependencies],
+ c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags,
+ install : false,
+ build_by_default : false,
+ ).extract_all_objects(recursive: true)
+endforeach
+
+# The final dav1d library
+if host_machine.system() == 'windows'
+ dav1d_soversion = ''
+else
+ dav1d_soversion = dav1d_api_version_major
+endif
+
+libdav1d = library('dav1d',
+ libdav1d_sources,
+ libdav1d_asm_objs,
+ libdav1d_rc_obj,
+ rev_target,
+ config_h_target,
+
+ objects : [
+ libdav1d_bitdepth_objs,
+ ],
+
+ include_directories : dav1d_inc_dirs,
+ dependencies : [
+ stdatomic_dependencies,
+ thread_dependency,
+ thread_compat_dep,
+ libdl_dependency,
+ ],
+ c_args : [libdav1d_flags, api_export_flags],
+ version : dav1d_soname_version,
+ soversion : dav1d_soversion,
+ install : true,
+)
+
+dav1d_dep = declare_dependency(link_with: libdav1d,
+ include_directories : include_directories('../include/dav1d')
+)
+
+#
+# Generate pkg-config .pc file
+#
+pkg_mod = import('pkgconfig')
+pkg_mod.generate(libraries: libdav1d,
+ version: meson.project_version(),
+ name: 'libdav1d',
+ filebase: 'dav1d',
+ description: 'AV1 decoding library'
+)
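The two foreach loops above compile every *_tmpl.c file once per entry in dav1d_bitdepths, each time with -DBITDEPTH=8 or -DBITDEPTH=16, and link the resulting per-bitdepth objects into the final library. A simplified sketch of that single-source, two-symbol pattern (the pixel typedef and name-suffixing macros here are illustrative only, not dav1d's actual template headers):

#include <stdint.h>

#if BITDEPTH == 8
typedef uint8_t  pixel;
#define SUFFIX _8bpc
#else
typedef uint16_t pixel;
#define SUFFIX _16bpc
#endif

#define PASTE(a, b) a##b
#define EXPAND(a, b) PASTE(a, b)
#define bitfn(name) EXPAND(name, SUFFIX)

/* The same translation unit yields fill_black_8bpc() in the 8 bpc static
 * library and fill_black_16bpc() in the 16 bpc one. */
void bitfn(fill_black)(pixel *const dst, const int w) {
    for (int x = 0; x < w; x++)
        dst[x] = 0;
}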
diff --git a/third_party/dav1d/src/msac.c b/third_party/dav1d/src/msac.c
new file mode 100644
index 0000000000..43d8ae5d07
--- /dev/null
+++ b/third_party/dav1d/src/msac.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+
+#include "common/intops.h"
+
+#include "src/msac.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16
+
+#define EC_WIN_SIZE (sizeof(ec_win) << 3)
+
+static inline void ctx_refill(MsacContext *const s) {
+ const uint8_t *buf_pos = s->buf_pos;
+ const uint8_t *buf_end = s->buf_end;
+ int c = EC_WIN_SIZE - s->cnt - 24;
+ ec_win dif = s->dif;
+ while (c >= 0 && buf_pos < buf_end) {
+ dif ^= ((ec_win)*buf_pos++) << c;
+ c -= 8;
+ }
+ s->dif = dif;
+ s->cnt = EC_WIN_SIZE - c - 24;
+ s->buf_pos = buf_pos;
+}
+
+/* Takes updated dif and range values, renormalizes them so that
+ * 32768 <= rng < 65536 (reading more bytes from the stream into dif if
+ * necessary), and stores them back in the decoder context.
+ * dif: The new value of dif.
+ * rng: The new value of the range. */
+static inline void ctx_norm(MsacContext *const s, const ec_win dif,
+ const unsigned rng)
+{
+ const int d = 15 ^ (31 ^ clz(rng));
+ assert(rng <= 65535U);
+ s->cnt -= d;
+ s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
+ s->rng = rng << d;
+ if (s->cnt < 0)
+ ctx_refill(s);
+}
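/* Worked note on the shift count in ctx_norm(): for 1 <= rng <= 65535,
 * 15 ^ (31 ^ clz(rng)) equals 15 - floor(log2(rng)), i.e. the number of left
 * shifts needed to bring rng back into [32768, 65535]. For example
 * rng = 0x5000 gives clz = 17, so d = 15 ^ 14 = 1 and rng << d = 0xa000. */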
+
+unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) {
+ const unsigned r = s->rng;
+ ec_win dif = s->dif;
+ assert((dif >> (EC_WIN_SIZE - 16)) < r);
+ // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
+ // replace the multiply with a simple shift.
+ unsigned v = ((r >> 8) << 7) + EC_MIN_PROB;
+ const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+ const unsigned ret = dif >= vw;
+ dif -= ret * vw;
+ v += ret * (r - 2 * v);
+ ctx_norm(s, dif, v);
+ return !ret;
+}
+
+/* Decode a single binary value.
+ * f: The probability that the bit is one
+ * Return: The value decoded (0 or 1). */
+unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) {
+ const unsigned r = s->rng;
+ ec_win dif = s->dif;
+ assert((dif >> (EC_WIN_SIZE - 16)) < r);
+ unsigned v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
+ const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+ const unsigned ret = dif >= vw;
+ dif -= ret * vw;
+ v += ret * (r - 2 * v);
+ ctx_norm(s, dif, v);
+ return !ret;
+}
+
+int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
+ const int n, unsigned k)
+{
+ assert(n >> k == 8);
+
+ unsigned a = 0;
+ if (dav1d_msac_decode_bool_equi(s)) {
+ if (dav1d_msac_decode_bool_equi(s))
+ k += dav1d_msac_decode_bool_equi(s) + 1;
+ a = 1 << k;
+ }
+ const unsigned v = dav1d_msac_decode_bools(s, k) + a;
+ return ref * 2 <= n ? inv_recenter(ref, v) :
+ n - 1 - inv_recenter(n - 1 - ref, v);
+}
+
+/* Decodes a symbol given an inverse cumulative distribution function (CDF)
+ * table in Q15. */
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
+ uint16_t *const cdf,
+ const size_t n_symbols)
+{
+ const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
+ unsigned u, v = s->rng, val = -1;
+
+ assert(n_symbols <= 15);
+ assert(cdf[n_symbols] <= 32);
+
+ do {
+ val++;
+ u = v;
+ v = r * (cdf[val] >> EC_PROB_SHIFT);
+ v >>= 7 - EC_PROB_SHIFT;
+ v += EC_MIN_PROB * ((unsigned)n_symbols - val);
+ } while (c < v);
+
+ assert(u <= s->rng);
+
+ ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
+
+ if (s->allow_update_cdf) {
+ const unsigned count = cdf[n_symbols];
+ const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
+ unsigned i;
+ for (i = 0; i < val; i++)
+ cdf[i] += (32768 - cdf[i]) >> rate;
+ for (; i < n_symbols; i++)
+ cdf[i] -= cdf[i] >> rate;
+ cdf[n_symbols] = count + (count < 32);
+ }
+
+ return val;
+}
+
+unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
+ uint16_t *const cdf)
+{
+ const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
+
+ if (s->allow_update_cdf) {
+ // update_cdf() specialized for boolean CDFs
+ const unsigned count = cdf[1];
+ const int rate = 4 + (count >> 4);
+ if (bit)
+ cdf[0] += (32768 - cdf[0]) >> rate;
+ else
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] = count + (count < 32);
+ }
+
+ return bit;
+}
+
+unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
+ unsigned tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ unsigned tok = 3 + tok_br;
+ if (tok_br == 3) {
+ tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ tok = 6 + tok_br;
+ if (tok_br == 3) {
+ tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ tok = 9 + tok_br;
+ if (tok_br == 3)
+ tok = 12 + dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
+ }
+ }
+ return tok;
+}
+
+void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
+ const size_t sz, const int disable_cdf_update_flag)
+{
+ s->buf_pos = data;
+ s->buf_end = data + sz;
+ s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1;
+ s->rng = 0x8000;
+ s->cnt = -15;
+ s->allow_update_cdf = !disable_cdf_update_flag;
+ ctx_refill(s);
+
+#if ARCH_X86_64 && HAVE_ASM
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
+
+ msac_init_x86(s);
+#endif
+}
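The comment in dav1d_msac_decode_bool_equi_c() above states that with f = 1/2 the generic partition formula from dav1d_msac_decode_bool_c() collapses to a plain shift. A small standalone check of that identity over every legal range value (illustration only; it does not depend on dav1d):

#include <assert.h>
#include <stdio.h>

int main(void) {
    const unsigned prob_shift = 6, min_prob = 4;  /* EC_PROB_SHIFT, EC_MIN_PROB */
    const unsigned f = 1u << 14;                  /* probability 1/2 in Q15 */
    for (unsigned r = 1u << 15; r < 1u << 16; r++) {
        const unsigned generic  = ((r >> 8) * (f >> prob_shift) >> (7 - prob_shift)) + min_prob;
        const unsigned shortcut = ((r >> 8) << 7) + min_prob;
        assert(generic == shortcut);              /* both equal (r >> 8) * 128 + 4 */
    }
    printf("equi-probable shortcut holds for all 32768 <= rng < 65536\n");
    return 0;
}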
diff --git a/third_party/dav1d/src/msac.h b/third_party/dav1d/src/msac.h
new file mode 100644
index 0000000000..eb04f58f81
--- /dev/null
+++ b/third_party/dav1d/src/msac.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MSAC_H
+#define DAV1D_SRC_MSAC_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+typedef size_t ec_win;
+
+typedef struct MsacContext {
+ const uint8_t *buf_pos;
+ const uint8_t *buf_end;
+ ec_win dif;
+ unsigned rng;
+ int cnt;
+ int allow_update_cdf;
+
+#if ARCH_X86_64 && HAVE_ASM
+ unsigned (*symbol_adapt16)(struct MsacContext *s, uint16_t *cdf, size_t n_symbols);
+#endif
+} MsacContext;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/msac.h"
+#elif ARCH_X86
+#include "src/x86/msac.h"
+#endif
+#endif
+
+void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
+ int disable_cdf_update_flag);
+unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s);
+unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
+unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
+int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
+
+/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
+#ifndef dav1d_msac_decode_symbol_adapt4
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt8
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_symbol_adapt16
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_adapt
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_c
+#endif
+#ifndef dav1d_msac_decode_bool_equi
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_c
+#endif
+#ifndef dav1d_msac_decode_bool
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
+#endif
+#ifndef dav1d_msac_decode_hi_tok
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_c
+#endif
+
+static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
+ unsigned v = 0;
+ while (n--)
+ v = (v << 1) | dav1d_msac_decode_bool_equi(s);
+ return v;
+}
+
+static inline int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
+ assert(n > 0);
+ const int l = ulog2(n) + 1;
+ assert(l > 1);
+ const unsigned m = (1 << l) - n;
+ const unsigned v = dav1d_msac_decode_bools(s, l - 1);
+ return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
+}
+
+#endif /* DAV1D_SRC_MSAC_H */
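dav1d_msac_decode_uniform() above implements the AV1 nearly-uniform binarization: with l = ulog2(n) + 1 and m = (1 << l) - n, the first m values are coded in l - 1 bits and the remaining n - m values in l bits. A standalone walk-through with the entropy decoder replaced by a plain bit cursor (the toy_* names are illustrative only, not dav1d API):

#include <stdio.h>

static unsigned toy_bits(const unsigned *bits, unsigned *pos, int n) {
    unsigned v = 0;
    while (n--) v = (v << 1) | bits[(*pos)++];
    return v;
}

static int floor_log2(unsigned v) {  /* plays the role of ulog2() */
    int n = 0;
    while (v >>= 1) n++;
    return n;
}

static int toy_decode_uniform(const unsigned *bits, unsigned *pos, unsigned n) {
    const int l = floor_log2(n) + 1;
    const unsigned m = (1u << l) - n;
    const unsigned v = toy_bits(bits, pos, l - 1);
    return v < m ? (int)v : (int)((v << 1) - m + toy_bits(bits, pos, 1));
}

int main(void) {
    /* n = 5: l = 3, m = 3, so codewords 00, 01, 10 decode to 0, 1, 2 and
     * the longer codewords 110, 111 decode to 3, 4. */
    const unsigned stream[] = { 0,0,  0,1,  1,0,  1,1,0,  1,1,1 };
    unsigned pos = 0;
    for (int i = 0; i < 5; i++)
        printf("%d\n", toy_decode_uniform(stream, &pos, 5));
    return 0;
}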
diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c
new file mode 100644
index 0000000000..e08129aba5
--- /dev/null
+++ b/third_party/dav1d/src/obu.c
@@ -0,0 +1,1702 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "dav1d/data.h"
+
+#include "common/frame.h"
+#include "common/intops.h"
+
+#include "src/decode.h"
+#include "src/getbits.h"
+#include "src/levels.h"
+#include "src/log.h"
+#include "src/obu.h"
+#include "src/ref.h"
+#include "src/thread_task.h"
+
+static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
+ Dav1dSequenceHeader *const hdr)
+{
+#define DEBUG_SEQ_HDR 0
+
+#if DEBUG_SEQ_HDR
+ const unsigned init_bit_pos = dav1d_get_bits_pos(gb);
+#endif
+
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->profile = dav1d_get_bits(gb, 3);
+ if (hdr->profile > 2) goto error;
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-profile: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->still_picture = dav1d_get_bit(gb);
+ hdr->reduced_still_picture_header = dav1d_get_bit(gb);
+ if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-stillpicture_flags: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ if (hdr->reduced_still_picture_header) {
+ hdr->num_operating_points = 1;
+ hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3);
+ hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2);
+ hdr->operating_points[0].initial_display_delay = 10;
+ } else {
+ hdr->timing_info_present = dav1d_get_bit(gb);
+ if (hdr->timing_info_present) {
+ hdr->num_units_in_tick = dav1d_get_bits(gb, 32);
+ hdr->time_scale = dav1d_get_bits(gb, 32);
+ hdr->equal_picture_interval = dav1d_get_bit(gb);
+ if (hdr->equal_picture_interval) {
+ const unsigned num_ticks_per_picture = dav1d_get_vlc(gb);
+ if (num_ticks_per_picture == 0xFFFFFFFFU)
+ goto error;
+ hdr->num_ticks_per_picture = num_ticks_per_picture + 1;
+ }
+
+ hdr->decoder_model_info_present = dav1d_get_bit(gb);
+ if (hdr->decoder_model_info_present) {
+ hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1;
+ hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32);
+ hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1;
+ hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1;
+ }
+ }
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-timinginfo: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->display_model_info_present = dav1d_get_bit(gb);
+ hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1;
+ for (int i = 0; i < hdr->num_operating_points; i++) {
+ struct Dav1dSequenceHeaderOperatingPoint *const op =
+ &hdr->operating_points[i];
+ op->idc = dav1d_get_bits(gb, 12);
+ if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
+ goto error;
+ op->major_level = 2 + dav1d_get_bits(gb, 3);
+ op->minor_level = dav1d_get_bits(gb, 2);
+ if (op->major_level > 3)
+ op->tier = dav1d_get_bit(gb);
+ if (hdr->decoder_model_info_present) {
+ op->decoder_model_param_present = dav1d_get_bit(gb);
+ if (op->decoder_model_param_present) {
+ struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
+ &hdr->operating_parameter_info[i];
+ opi->decoder_buffer_delay =
+ dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+ opi->encoder_buffer_delay =
+ dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
+ opi->low_delay_mode = dav1d_get_bit(gb);
+ }
+ }
+ if (hdr->display_model_info_present)
+ op->display_model_param_present = dav1d_get_bit(gb);
+ op->initial_display_delay =
+ op->display_model_param_present ? dav1d_get_bits(gb, 4) + 1 : 10;
+ }
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-operating-points: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+ }
+
+ const int op_idx =
+ c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
+ c->operating_point_idc = hdr->operating_points[op_idx].idc;
+ const unsigned spatial_mask = c->operating_point_idc >> 8;
+ c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0;
+
+ hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1;
+ hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1;
+ hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
+ hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1;
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-size: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+ if (!hdr->reduced_still_picture_header) {
+ hdr->frame_id_numbers_present = dav1d_get_bit(gb);
+ if (hdr->frame_id_numbers_present) {
+ hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2;
+ hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
+ }
+ }
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-frame-id-numbers-present: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->sb128 = dav1d_get_bit(gb);
+ hdr->filter_intra = dav1d_get_bit(gb);
+ hdr->intra_edge_filter = dav1d_get_bit(gb);
+ if (hdr->reduced_still_picture_header) {
+ hdr->screen_content_tools = DAV1D_ADAPTIVE;
+ hdr->force_integer_mv = DAV1D_ADAPTIVE;
+ } else {
+ hdr->inter_intra = dav1d_get_bit(gb);
+ hdr->masked_compound = dav1d_get_bit(gb);
+ hdr->warped_motion = dav1d_get_bit(gb);
+ hdr->dual_filter = dav1d_get_bit(gb);
+ hdr->order_hint = dav1d_get_bit(gb);
+ if (hdr->order_hint) {
+ hdr->jnt_comp = dav1d_get_bit(gb);
+ hdr->ref_frame_mvs = dav1d_get_bit(gb);
+ }
+ hdr->screen_content_tools = dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb);
+ #if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-screentools: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+ #endif
+ hdr->force_integer_mv = hdr->screen_content_tools ?
+ dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb) : 2;
+ if (hdr->order_hint)
+ hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1;
+ }
+ hdr->super_res = dav1d_get_bit(gb);
+ hdr->cdef = dav1d_get_bit(gb);
+ hdr->restoration = dav1d_get_bit(gb);
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-featurebits: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->hbd = dav1d_get_bit(gb);
+ if (hdr->profile == 2 && hdr->hbd)
+ hdr->hbd += dav1d_get_bit(gb);
+ if (hdr->profile != 1)
+ hdr->monochrome = dav1d_get_bit(gb);
+ hdr->color_description_present = dav1d_get_bit(gb);
+ if (hdr->color_description_present) {
+ hdr->pri = dav1d_get_bits(gb, 8);
+ hdr->trc = dav1d_get_bits(gb, 8);
+ hdr->mtrx = dav1d_get_bits(gb, 8);
+ } else {
+ hdr->pri = DAV1D_COLOR_PRI_UNKNOWN;
+ hdr->trc = DAV1D_TRC_UNKNOWN;
+ hdr->mtrx = DAV1D_MC_UNKNOWN;
+ }
+ if (hdr->monochrome) {
+ hdr->color_range = dav1d_get_bit(gb);
+ hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
+ hdr->ss_hor = hdr->ss_ver = 1;
+ hdr->chr = DAV1D_CHR_UNKNOWN;
+ } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
+ hdr->trc == DAV1D_TRC_SRGB &&
+ hdr->mtrx == DAV1D_MC_IDENTITY)
+ {
+ hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
+ hdr->color_range = 1;
+ if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
+ goto error;
+ } else {
+ hdr->color_range = dav1d_get_bit(gb);
+ switch (hdr->profile) {
+ case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
+ hdr->ss_hor = hdr->ss_ver = 1;
+ break;
+ case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
+ break;
+ case 2:
+ if (hdr->hbd == 2) {
+ hdr->ss_hor = dav1d_get_bit(gb);
+ if (hdr->ss_hor)
+ hdr->ss_ver = dav1d_get_bit(gb);
+ } else
+ hdr->ss_hor = 1;
+ hdr->layout = hdr->ss_hor ?
+ hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
+ DAV1D_PIXEL_LAYOUT_I422 :
+ DAV1D_PIXEL_LAYOUT_I444;
+ break;
+ }
+ hdr->chr = (hdr->ss_hor & hdr->ss_ver) ?
+ dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
+ }
+ if (c->strict_std_compliance &&
+ hdr->mtrx == DAV1D_MC_IDENTITY && hdr->layout != DAV1D_PIXEL_LAYOUT_I444)
+ {
+ goto error;
+ }
+ if (!hdr->monochrome)
+ hdr->separate_uv_delta_q = dav1d_get_bit(gb);
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-colorinfo: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ hdr->film_grain_present = dav1d_get_bit(gb);
+#if DEBUG_SEQ_HDR
+ printf("SEQHDR: post-filmgrain: off=%u\n",
+ dav1d_get_bits_pos(gb) - init_bit_pos);
+#endif
+
+ dav1d_get_bit(gb); // dummy bit
+
+ // We needn't bother flushing the OBU here: we'll check we didn't
+ // overrun in the caller and will then discard gb, so there's no
+ // point in setting its position properly.
+
+ return 0;
+
+error:
+ dav1d_log(c, "Error parsing sequence header\n");
+ return DAV1D_ERR(EINVAL);
+}
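/* Sketch of how the 12-bit operating-point idc parsed above is interpreted
 * (illustration only): bits 0..7 form a temporal-layer mask and bits 8..11 a
 * spatial-layer mask, which is why c->max_spatial_id is derived from
 * idc >> 8 and why parse_frame_hdr() below tests (idc >> temporal_id) & 1
 * and (idc >> (spatial_id + 8)) & 1. */
static int example_layer_selected(const unsigned idc, const int temporal_id,
                                  const int spatial_id)
{
    if (!idc) return 1; /* idc == 0 applies to all layers, as in the
                         * buffer_removal_time handling below */
    return ((idc >> temporal_id) & 1) && ((idc >> (spatial_id + 8)) & 1);
}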
+
+static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
+ const int use_ref)
+{
+ const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
+ Dav1dFrameHeader *const hdr = c->frame_hdr;
+
+ if (use_ref) {
+ for (int i = 0; i < 7; i++) {
+ if (dav1d_get_bit(gb)) {
+ const Dav1dThreadPicture *const ref =
+ &c->refs[c->frame_hdr->refidx[i]].p;
+ if (!ref->p.frame_hdr) return -1;
+ hdr->width[1] = ref->p.frame_hdr->width[1];
+ hdr->height = ref->p.frame_hdr->height;
+ hdr->render_width = ref->p.frame_hdr->render_width;
+ hdr->render_height = ref->p.frame_hdr->render_height;
+ hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
+ if (hdr->super_res.enabled) {
+ const int d = hdr->super_res.width_scale_denominator =
+ 9 + dav1d_get_bits(gb, 3);
+ hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d,
+ imin(16, hdr->width[1]));
+ } else {
+ hdr->super_res.width_scale_denominator = 8;
+ hdr->width[0] = hdr->width[1];
+ }
+ return 0;
+ }
+ }
+ }
+
+ if (hdr->frame_size_override) {
+ hdr->width[1] = dav1d_get_bits(gb, seqhdr->width_n_bits) + 1;
+ hdr->height = dav1d_get_bits(gb, seqhdr->height_n_bits) + 1;
+ } else {
+ hdr->width[1] = seqhdr->max_width;
+ hdr->height = seqhdr->max_height;
+ }
+ hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb);
+ if (hdr->super_res.enabled) {
+ const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3);
+ hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1]));
+ } else {
+ hdr->super_res.width_scale_denominator = 8;
+ hdr->width[0] = hdr->width[1];
+ }
+ hdr->have_render_size = dav1d_get_bit(gb);
+ if (hdr->have_render_size) {
+ hdr->render_width = dav1d_get_bits(gb, 16) + 1;
+ hdr->render_height = dav1d_get_bits(gb, 16) + 1;
+ } else {
+ hdr->render_width = hdr->width[1];
+ hdr->render_height = hdr->height;
+ }
+ return 0;
+}
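/* Worked example for the super-res path in read_frame_size() above: with an
 * upscaled width hdr->width[1] = 1920 and the largest denominator d = 16,
 * the coded width becomes imax((1920 * 8 + 8) / 16, imin(16, 1920)) = 960,
 * i.e. the frame is coded at half width and upscaled back to 1920. */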
+
+static inline int tile_log2(const int sz, const int tgt) {
+ int k;
+ for (k = 0; (sz << k) < tgt; k++) ;
+ return k;
+}
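/* tile_log2(sz, tgt) returns the smallest k such that (sz << k) >= tgt, e.g.
 * tile_log2(1, 5) == 3 and tile_log2(16, 5) == 0. Below it supplies both the
 * minimum log2 tile counts (starting from the maximum tile size in
 * superblocks) and the maximum log2 tile counts (starting from 1). */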
+
+static const Dav1dLoopfilterModeRefDeltas default_mode_ref_deltas = {
+ .mode_delta = { 0, 0 },
+ .ref_delta = { 1, 0, 0, 0, -1, 0, -1, -1 },
+};
+
+static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
+#define DEBUG_FRAME_HDR 0
+
+#if DEBUG_FRAME_HDR
+ const uint8_t *const init_ptr = gb->ptr;
+#endif
+ const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
+ Dav1dFrameHeader *const hdr = c->frame_hdr;
+
+ hdr->show_existing_frame =
+ !seqhdr->reduced_still_picture_header && dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-show_existing_frame: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ if (hdr->show_existing_frame) {
+ hdr->existing_frame_idx = dav1d_get_bits(gb, 3);
+ if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
+ hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
+ if (seqhdr->frame_id_numbers_present) {
+ hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+ Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr;
+ if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) goto error;
+ }
+ return 0;
+ }
+
+ hdr->frame_type = seqhdr->reduced_still_picture_header ? DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2);
+ hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bit(gb);
+ if (hdr->show_frame) {
+ if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
+ hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
+ hdr->showable_frame = hdr->frame_type != DAV1D_FRAME_TYPE_KEY;
+ } else
+ hdr->showable_frame = dav1d_get_bit(gb);
+ hdr->error_resilient_mode =
+ (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) ||
+ hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ||
+ seqhdr->reduced_still_picture_header || dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-frametype_bits: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->disable_cdf_update = dav1d_get_bit(gb);
+ hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ?
+ dav1d_get_bit(gb) : seqhdr->screen_content_tools;
+ if (hdr->allow_screen_content_tools)
+ hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ?
+ dav1d_get_bit(gb) : seqhdr->force_integer_mv;
+ else
+ hdr->force_integer_mv = 0;
+
+ if (IS_KEY_OR_INTRA(hdr))
+ hdr->force_integer_mv = 1;
+
+ if (seqhdr->frame_id_numbers_present)
+ hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+
+ hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 :
+ hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-frame_size_override_flag: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->frame_offset = seqhdr->order_hint ?
+ dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0;
+ hdr->primary_ref_frame = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) ?
+ dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE;
+
+ if (seqhdr->decoder_model_info_present) {
+ hdr->buffer_removal_time_present = dav1d_get_bit(gb);
+ if (hdr->buffer_removal_time_present) {
+ for (int i = 0; i < c->seq_hdr->num_operating_points; i++) {
+ const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i];
+ struct Dav1dFrameHeaderOperatingPoint *const op = &hdr->operating_points[i];
+ if (seqop->decoder_model_param_present) {
+ int in_temporal_layer = (seqop->idc >> hdr->temporal_id) & 1;
+ int in_spatial_layer = (seqop->idc >> (hdr->spatial_id + 8)) & 1;
+ if (!seqop->idc || (in_temporal_layer && in_spatial_layer))
+ op->buffer_removal_time = dav1d_get_bits(gb, seqhdr->buffer_removal_delay_length);
+ }
+ }
+ }
+ }
+
+ if (IS_KEY_OR_INTRA(hdr)) {
+ hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY &&
+ hdr->show_frame) ? 0xff : dav1d_get_bits(gb, 8);
+ if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
+ for (int i = 0; i < 8; i++)
+ dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
+ if (c->strict_std_compliance &&
+ hdr->frame_type == DAV1D_FRAME_TYPE_INTRA && hdr->refresh_frame_flags == 0xff)
+ {
+ goto error;
+ }
+ if (read_frame_size(c, gb, 0) < 0) goto error;
+ hdr->allow_intrabc = hdr->allow_screen_content_tools &&
+ !hdr->super_res.enabled && dav1d_get_bit(gb);
+ hdr->use_ref_frame_mvs = 0;
+ } else {
+ hdr->allow_intrabc = 0;
+ hdr->refresh_frame_flags = hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 0xff :
+ dav1d_get_bits(gb, 8);
+ if (hdr->error_resilient_mode && seqhdr->order_hint)
+ for (int i = 0; i < 8; i++)
+ dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
+ hdr->frame_ref_short_signaling =
+ seqhdr->order_hint && dav1d_get_bit(gb);
+ if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8
+ hdr->refidx[0] = dav1d_get_bits(gb, 3);
+ hdr->refidx[1] = hdr->refidx[2] = -1;
+ hdr->refidx[3] = dav1d_get_bits(gb, 3);
+ hdr->refidx[4] = hdr->refidx[5] = hdr->refidx[6] = -1;
+
+ int shifted_frame_offset[8];
+ const int current_frame_offset = 1 << (seqhdr->order_hint_n_bits - 1);
+ for (int i = 0; i < 8; i++) {
+ if (!c->refs[i].p.p.frame_hdr) goto error;
+ shifted_frame_offset[i] = current_frame_offset +
+ get_poc_diff(seqhdr->order_hint_n_bits,
+ c->refs[i].p.p.frame_hdr->frame_offset,
+ hdr->frame_offset);
+ }
+
+ int used_frame[8] = { 0 };
+ used_frame[hdr->refidx[0]] = 1;
+ used_frame[hdr->refidx[3]] = 1;
+
+ int latest_frame_offset = -1;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (!used_frame[i] && hint >= current_frame_offset &&
+ hint >= latest_frame_offset)
+ {
+ hdr->refidx[6] = i;
+ latest_frame_offset = hint;
+ }
+ }
+ if (latest_frame_offset != -1)
+ used_frame[hdr->refidx[6]] = 1;
+
+ int earliest_frame_offset = INT_MAX;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (!used_frame[i] && hint >= current_frame_offset &&
+ hint < earliest_frame_offset)
+ {
+ hdr->refidx[4] = i;
+ earliest_frame_offset = hint;
+ }
+ }
+ if (earliest_frame_offset != INT_MAX)
+ used_frame[hdr->refidx[4]] = 1;
+
+ earliest_frame_offset = INT_MAX;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (!used_frame[i] && hint >= current_frame_offset &&
+ (hint < earliest_frame_offset))
+ {
+ hdr->refidx[5] = i;
+ earliest_frame_offset = hint;
+ }
+ }
+ if (earliest_frame_offset != INT_MAX)
+ used_frame[hdr->refidx[5]] = 1;
+
+ for (int i = 1; i < 7; i++) {
+ if (hdr->refidx[i] < 0) {
+ latest_frame_offset = -1;
+ for (int j = 0; j < 8; j++) {
+ const int hint = shifted_frame_offset[j];
+ if (!used_frame[j] && hint < current_frame_offset &&
+ hint >= latest_frame_offset)
+ {
+ hdr->refidx[i] = j;
+ latest_frame_offset = hint;
+ }
+ }
+ if (latest_frame_offset != -1)
+ used_frame[hdr->refidx[i]] = 1;
+ }
+ }
+
+ earliest_frame_offset = INT_MAX;
+ int ref = -1;
+ for (int i = 0; i < 8; i++) {
+ const int hint = shifted_frame_offset[i];
+ if (hint < earliest_frame_offset) {
+ ref = i;
+ earliest_frame_offset = hint;
+ }
+ }
+ for (int i = 0; i < 7; i++) {
+ if (hdr->refidx[i] < 0)
+ hdr->refidx[i] = ref;
+ }
+ }
+ for (int i = 0; i < 7; i++) {
+ if (!hdr->frame_ref_short_signaling)
+ hdr->refidx[i] = dav1d_get_bits(gb, 3);
+ if (seqhdr->frame_id_numbers_present) {
+ const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
+ const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1);
+ Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
+ if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
+ }
+ }
+ const int use_ref = !hdr->error_resilient_mode &&
+ hdr->frame_size_override;
+ if (read_frame_size(c, gb, use_ref) < 0) goto error;
+ hdr->hp = !hdr->force_integer_mv && dav1d_get_bit(gb);
+ hdr->subpel_filter_mode = dav1d_get_bit(gb) ? DAV1D_FILTER_SWITCHABLE :
+ dav1d_get_bits(gb, 2);
+ hdr->switchable_motion_mode = dav1d_get_bit(gb);
+ hdr->use_ref_frame_mvs = !hdr->error_resilient_mode &&
+ seqhdr->ref_frame_mvs && seqhdr->order_hint &&
+ IS_INTER_OR_SWITCH(hdr) && dav1d_get_bit(gb);
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-frametype-specific-bits: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ hdr->refresh_context = !seqhdr->reduced_still_picture_header &&
+ !hdr->disable_cdf_update && !dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-refresh_context: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // tile data
+ hdr->tiling.uniform = dav1d_get_bit(gb);
+ const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
+ const int sbsz_log2 = 6 + seqhdr->sb128;
+ const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
+ const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
+ const int max_tile_width_sb = 4096 >> sbsz_log2;
+ const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
+ hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw);
+ hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS));
+ hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS));
+ const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
+ hdr->tiling.min_log2_cols);
+ if (hdr->tiling.uniform) {
+ for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols;
+ hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bit(gb);
+ hdr->tiling.log2_cols++) ;
+ const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols);
+ hdr->tiling.cols = 0;
+ for (int sbx = 0; sbx < sbw; sbx += tile_w, hdr->tiling.cols++)
+ hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
+ hdr->tiling.min_log2_rows =
+ imax(min_log2_tiles - hdr->tiling.log2_cols, 0);
+
+ for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows;
+ hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bit(gb);
+ hdr->tiling.log2_rows++) ;
+ const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows);
+ hdr->tiling.rows = 0;
+ for (int sby = 0; sby < sbh; sby += tile_h, hdr->tiling.rows++)
+ hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
+ } else {
+ hdr->tiling.cols = 0;
+ int widest_tile = 0, max_tile_area_sb = sbw * sbh;
+ for (int sbx = 0; sbx < sbw && hdr->tiling.cols < DAV1D_MAX_TILE_COLS; hdr->tiling.cols++) {
+ const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb);
+ const int tile_w = (tile_width_sb > 1) ?
+ 1 + dav1d_get_uniform(gb, tile_width_sb) :
+ 1;
+ hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
+ sbx += tile_w;
+ widest_tile = imax(widest_tile, tile_w);
+ }
+ hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols);
+ if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1;
+ const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
+
+ hdr->tiling.rows = 0;
+ for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) {
+ const int tile_height_sb = imin(sbh - sby, max_tile_height_sb);
+ const int tile_h = (tile_height_sb > 1) ?
+ 1 + dav1d_get_uniform(gb, tile_height_sb) :
+ 1;
+ hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
+ sby += tile_h;
+ }
+ hdr->tiling.log2_rows = tile_log2(1, hdr->tiling.rows);
+ }
+ hdr->tiling.col_start_sb[hdr->tiling.cols] = sbw;
+ hdr->tiling.row_start_sb[hdr->tiling.rows] = sbh;
+ if (hdr->tiling.log2_cols || hdr->tiling.log2_rows) {
+ hdr->tiling.update = dav1d_get_bits(gb, hdr->tiling.log2_cols +
+ hdr->tiling.log2_rows);
+ if (hdr->tiling.update >= hdr->tiling.cols * hdr->tiling.rows)
+ goto error;
+ hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1;
+ } else {
+ hdr->tiling.n_bytes = hdr->tiling.update = 0;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-tiling: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // quant data
+ hdr->quant.yac = dav1d_get_bits(gb, 8);
+ hdr->quant.ydc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ if (!seqhdr->monochrome) {
+ // If the sequence header says that delta_q might be different
+ // for U, V, we must check whether it actually is for this
+ // frame.
+ const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bit(gb) : 0;
+ hdr->quant.udc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ hdr->quant.uac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ if (diff_uv_delta) {
+ hdr->quant.vdc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ hdr->quant.vac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0;
+ } else {
+ hdr->quant.vdc_delta = hdr->quant.udc_delta;
+ hdr->quant.vac_delta = hdr->quant.uac_delta;
+ }
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-quant: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->quant.qm = dav1d_get_bit(gb);
+ if (hdr->quant.qm) {
+ hdr->quant.qm_y = dav1d_get_bits(gb, 4);
+ hdr->quant.qm_u = dav1d_get_bits(gb, 4);
+ hdr->quant.qm_v =
+ seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) :
+ hdr->quant.qm_u;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-qm: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // segmentation data
+ hdr->segmentation.enabled = dav1d_get_bit(gb);
+ if (hdr->segmentation.enabled) {
+ if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ hdr->segmentation.update_map = 1;
+ hdr->segmentation.temporal = 0;
+ hdr->segmentation.update_data = 1;
+ } else {
+ hdr->segmentation.update_map = dav1d_get_bit(gb);
+ hdr->segmentation.temporal =
+ hdr->segmentation.update_map ? dav1d_get_bit(gb) : 0;
+ hdr->segmentation.update_data = dav1d_get_bit(gb);
+ }
+
+ if (hdr->segmentation.update_data) {
+ hdr->segmentation.seg_data.preskip = 0;
+ hdr->segmentation.seg_data.last_active_segid = -1;
+ for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
+ Dav1dSegmentationData *const seg =
+ &hdr->segmentation.seg_data.d[i];
+ if (dav1d_get_bit(gb)) {
+ seg->delta_q = dav1d_get_sbits(gb, 9);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_q = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_y_v = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_y_v = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_y_h = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_y_h = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_u = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_u = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->delta_lf_v = dav1d_get_sbits(gb, 7);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ } else {
+ seg->delta_lf_v = 0;
+ }
+ if (dav1d_get_bit(gb)) {
+ seg->ref = dav1d_get_bits(gb, 3);
+ hdr->segmentation.seg_data.last_active_segid = i;
+ hdr->segmentation.seg_data.preskip = 1;
+ } else {
+ seg->ref = -1;
+ }
+ if ((seg->skip = dav1d_get_bit(gb))) {
+ hdr->segmentation.seg_data.last_active_segid = i;
+ hdr->segmentation.seg_data.preskip = 1;
+ }
+ if ((seg->globalmv = dav1d_get_bit(gb))) {
+ hdr->segmentation.seg_data.last_active_segid = i;
+ hdr->segmentation.seg_data.preskip = 1;
+ }
+ }
+ } else {
+ // segmentation.update_data was false so we should copy
+ // segmentation data from the reference frame.
+ assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
+ const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
+ if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
+ hdr->segmentation.seg_data =
+ c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
+ }
+ } else {
+ memset(&hdr->segmentation.seg_data, 0, sizeof(Dav1dSegmentationDataSet));
+ for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++)
+ hdr->segmentation.seg_data.d[i].ref = -1;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-segmentation: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // delta q
+ hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bit(gb) : 0;
+ hdr->delta.q.res_log2 = hdr->delta.q.present ? dav1d_get_bits(gb, 2) : 0;
+ hdr->delta.lf.present = hdr->delta.q.present && !hdr->allow_intrabc &&
+ dav1d_get_bit(gb);
+ hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0;
+ hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bit(gb) : 0;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-delta_q_lf_flags: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // derive lossless flags
+ const int delta_lossless = !hdr->quant.ydc_delta && !hdr->quant.udc_delta &&
+ !hdr->quant.uac_delta && !hdr->quant.vdc_delta && !hdr->quant.vac_delta;
+ hdr->all_lossless = 1;
+ for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
+ hdr->segmentation.qidx[i] = hdr->segmentation.enabled ?
+ iclip_u8(hdr->quant.yac + hdr->segmentation.seg_data.d[i].delta_q) :
+ hdr->quant.yac;
+ hdr->segmentation.lossless[i] =
+ !hdr->segmentation.qidx[i] && delta_lossless;
+ hdr->all_lossless &= hdr->segmentation.lossless[i];
+ }
+
+ // loopfilter
+ if (hdr->all_lossless || hdr->allow_intrabc) {
+ hdr->loopfilter.level_y[0] = hdr->loopfilter.level_y[1] = 0;
+ hdr->loopfilter.level_u = hdr->loopfilter.level_v = 0;
+ hdr->loopfilter.sharpness = 0;
+ hdr->loopfilter.mode_ref_delta_enabled = 1;
+ hdr->loopfilter.mode_ref_delta_update = 1;
+ hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
+ } else {
+ hdr->loopfilter.level_y[0] = dav1d_get_bits(gb, 6);
+ hdr->loopfilter.level_y[1] = dav1d_get_bits(gb, 6);
+ if (!seqhdr->monochrome &&
+ (hdr->loopfilter.level_y[0] || hdr->loopfilter.level_y[1]))
+ {
+ hdr->loopfilter.level_u = dav1d_get_bits(gb, 6);
+ hdr->loopfilter.level_v = dav1d_get_bits(gb, 6);
+ }
+ hdr->loopfilter.sharpness = dav1d_get_bits(gb, 3);
+
+ if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
+ } else {
+ const int ref = hdr->refidx[hdr->primary_ref_frame];
+ if (!c->refs[ref].p.p.frame_hdr) goto error;
+ hdr->loopfilter.mode_ref_deltas =
+ c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
+ }
+ hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bit(gb);
+ if (hdr->loopfilter.mode_ref_delta_enabled) {
+ hdr->loopfilter.mode_ref_delta_update = dav1d_get_bit(gb);
+ if (hdr->loopfilter.mode_ref_delta_update) {
+ for (int i = 0; i < 8; i++)
+ if (dav1d_get_bit(gb))
+ hdr->loopfilter.mode_ref_deltas.ref_delta[i] =
+ dav1d_get_sbits(gb, 7);
+ for (int i = 0; i < 2; i++)
+ if (dav1d_get_bit(gb))
+ hdr->loopfilter.mode_ref_deltas.mode_delta[i] =
+ dav1d_get_sbits(gb, 7);
+ }
+ }
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-lpf: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // cdef
+ if (!hdr->all_lossless && seqhdr->cdef && !hdr->allow_intrabc) {
+ hdr->cdef.damping = dav1d_get_bits(gb, 2) + 3;
+ hdr->cdef.n_bits = dav1d_get_bits(gb, 2);
+ for (int i = 0; i < (1 << hdr->cdef.n_bits); i++) {
+ hdr->cdef.y_strength[i] = dav1d_get_bits(gb, 6);
+ if (!seqhdr->monochrome)
+ hdr->cdef.uv_strength[i] = dav1d_get_bits(gb, 6);
+ }
+ } else {
+ hdr->cdef.n_bits = 0;
+ hdr->cdef.y_strength[0] = 0;
+ hdr->cdef.uv_strength[0] = 0;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-cdef: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ // restoration
+ if ((!hdr->all_lossless || hdr->super_res.enabled) &&
+ seqhdr->restoration && !hdr->allow_intrabc)
+ {
+ hdr->restoration.type[0] = dav1d_get_bits(gb, 2);
+ if (!seqhdr->monochrome) {
+ hdr->restoration.type[1] = dav1d_get_bits(gb, 2);
+ hdr->restoration.type[2] = dav1d_get_bits(gb, 2);
+ } else {
+ hdr->restoration.type[1] =
+ hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
+ }
+
+ if (hdr->restoration.type[0] || hdr->restoration.type[1] ||
+ hdr->restoration.type[2])
+ {
+ // Log2 of the restoration unit size.
+ hdr->restoration.unit_size[0] = 6 + seqhdr->sb128;
+ if (dav1d_get_bit(gb)) {
+ hdr->restoration.unit_size[0]++;
+ if (!seqhdr->sb128)
+ hdr->restoration.unit_size[0] += dav1d_get_bit(gb);
+ }
+ hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0];
+ if ((hdr->restoration.type[1] || hdr->restoration.type[2]) &&
+ seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1)
+ {
+ hdr->restoration.unit_size[1] -= dav1d_get_bit(gb);
+ }
+ } else {
+ hdr->restoration.unit_size[0] = 8;
+ }
+ } else {
+ hdr->restoration.type[0] = DAV1D_RESTORATION_NONE;
+ hdr->restoration.type[1] = DAV1D_RESTORATION_NONE;
+ hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-restoration: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY :
+ dav1d_get_bit(gb) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-txfmmode: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bit(gb) : 0;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-refmode: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->skip_mode_allowed = 0;
+ if (hdr->switchable_comp_refs && IS_INTER_OR_SWITCH(hdr) && seqhdr->order_hint) {
+ const unsigned poc = hdr->frame_offset;
+ unsigned off_before = 0xFFFFFFFFU;
+ int off_after = -1;
+ int off_before_idx, off_after_idx;
+ for (int i = 0; i < 7; i++) {
+ if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
+ const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
+
+ const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
+ if (diff > 0) {
+ if (off_after == -1 || get_poc_diff(seqhdr->order_hint_n_bits,
+ off_after, refpoc) > 0)
+ {
+ off_after = refpoc;
+ off_after_idx = i;
+ }
+ } else if (diff < 0 && (off_before == 0xFFFFFFFFU ||
+ get_poc_diff(seqhdr->order_hint_n_bits,
+ refpoc, off_before) > 0))
+ {
+ off_before = refpoc;
+ off_before_idx = i;
+ }
+ }
+
+ if (off_before != 0xFFFFFFFFU && off_after != -1) {
+ hdr->skip_mode_refs[0] = imin(off_before_idx, off_after_idx);
+ hdr->skip_mode_refs[1] = imax(off_before_idx, off_after_idx);
+ hdr->skip_mode_allowed = 1;
+ } else if (off_before != 0xFFFFFFFFU) {
+ unsigned off_before2 = 0xFFFFFFFFU;
+ int off_before2_idx;
+ for (int i = 0; i < 7; i++) {
+ if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error;
+ const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
+ if (get_poc_diff(seqhdr->order_hint_n_bits,
+ refpoc, off_before) < 0) {
+ if (off_before2 == 0xFFFFFFFFU ||
+ get_poc_diff(seqhdr->order_hint_n_bits,
+ refpoc, off_before2) > 0)
+ {
+ off_before2 = refpoc;
+ off_before2_idx = i;
+ }
+ }
+ }
+
+ if (off_before2 != 0xFFFFFFFFU) {
+ hdr->skip_mode_refs[0] = imin(off_before_idx, off_before2_idx);
+ hdr->skip_mode_refs[1] = imax(off_before_idx, off_before2_idx);
+ hdr->skip_mode_allowed = 1;
+ }
+ }
+ }
+ hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bit(gb) : 0;
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-extskip: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) &&
+ seqhdr->warped_motion && dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-warpmotionbit: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+ hdr->reduced_txtp_set = dav1d_get_bit(gb);
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-reducedtxtpset: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ for (int i = 0; i < 7; i++)
+ hdr->gmv[i] = dav1d_default_wm_params;
+
+ if (IS_INTER_OR_SWITCH(hdr)) {
+ for (int i = 0; i < 7; i++) {
+ hdr->gmv[i].type = !dav1d_get_bit(gb) ? DAV1D_WM_TYPE_IDENTITY :
+ dav1d_get_bit(gb) ? DAV1D_WM_TYPE_ROT_ZOOM :
+ dav1d_get_bit(gb) ? DAV1D_WM_TYPE_TRANSLATION :
+ DAV1D_WM_TYPE_AFFINE;
+
+ if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue;
+
+ const Dav1dWarpedMotionParams *ref_gmv;
+ if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
+ ref_gmv = &dav1d_default_wm_params;
+ } else {
+ const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
+ if (!c->refs[pri_ref].p.p.frame_hdr) goto error;
+ ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
+ }
+ int32_t *const mat = hdr->gmv[i].matrix;
+ const int32_t *const ref_mat = ref_gmv->matrix;
+ int bits, shift;
+
+ if (hdr->gmv[i].type >= DAV1D_WM_TYPE_ROT_ZOOM) {
+ mat[2] = (1 << 16) + 2 *
+ dav1d_get_bits_subexp(gb, (ref_mat[2] - (1 << 16)) >> 1, 12);
+ mat[3] = 2 * dav1d_get_bits_subexp(gb, ref_mat[3] >> 1, 12);
+
+ bits = 12;
+ shift = 10;
+ } else {
+ bits = 9 - !hdr->hp;
+ shift = 13 + !hdr->hp;
+ }
+
+ if (hdr->gmv[i].type == DAV1D_WM_TYPE_AFFINE) {
+ mat[4] = 2 * dav1d_get_bits_subexp(gb, ref_mat[4] >> 1, 12);
+ mat[5] = (1 << 16) + 2 *
+ dav1d_get_bits_subexp(gb, (ref_mat[5] - (1 << 16)) >> 1, 12);
+ } else {
+ mat[4] = -mat[3];
+ mat[5] = mat[2];
+ }
+
+ mat[0] = dav1d_get_bits_subexp(gb, ref_mat[0] >> shift, bits) * (1 << shift);
+ mat[1] = dav1d_get_bits_subexp(gb, ref_mat[1] >> shift, bits) * (1 << shift);
+ }
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-gmv: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ hdr->film_grain.present = seqhdr->film_grain_present &&
+ (hdr->show_frame || hdr->showable_frame) &&
+ dav1d_get_bit(gb);
+ if (hdr->film_grain.present) {
+ const unsigned seed = dav1d_get_bits(gb, 16);
+ hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bit(gb);
+ if (!hdr->film_grain.update) {
+ const int refidx = dav1d_get_bits(gb, 3);
+ int i;
+ for (i = 0; i < 7; i++)
+ if (hdr->refidx[i] == refidx)
+ break;
+ if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error;
+ hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data;
+ hdr->film_grain.data.seed = seed;
+ } else {
+ Dav1dFilmGrainData *const fgd = &hdr->film_grain.data;
+ fgd->seed = seed;
+
+ fgd->num_y_points = dav1d_get_bits(gb, 4);
+ if (fgd->num_y_points > 14) goto error;
+ for (int i = 0; i < fgd->num_y_points; i++) {
+ fgd->y_points[i][0] = dav1d_get_bits(gb, 8);
+ if (i && fgd->y_points[i - 1][0] >= fgd->y_points[i][0])
+ goto error;
+ fgd->y_points[i][1] = dav1d_get_bits(gb, 8);
+ }
+
+ fgd->chroma_scaling_from_luma =
+ !seqhdr->monochrome && dav1d_get_bit(gb);
+ if (seqhdr->monochrome || fgd->chroma_scaling_from_luma ||
+ (seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points))
+ {
+ fgd->num_uv_points[0] = fgd->num_uv_points[1] = 0;
+ } else for (int pl = 0; pl < 2; pl++) {
+ fgd->num_uv_points[pl] = dav1d_get_bits(gb, 4);
+ if (fgd->num_uv_points[pl] > 10) goto error;
+ for (int i = 0; i < fgd->num_uv_points[pl]; i++) {
+ fgd->uv_points[pl][i][0] = dav1d_get_bits(gb, 8);
+ if (i && fgd->uv_points[pl][i - 1][0] >= fgd->uv_points[pl][i][0])
+ goto error;
+ fgd->uv_points[pl][i][1] = dav1d_get_bits(gb, 8);
+ }
+ }
+
+ if (seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1 &&
+ !!fgd->num_uv_points[0] != !!fgd->num_uv_points[1])
+ {
+ goto error;
+ }
+
+ fgd->scaling_shift = dav1d_get_bits(gb, 2) + 8;
+ fgd->ar_coeff_lag = dav1d_get_bits(gb, 2);
+ const int num_y_pos = 2 * fgd->ar_coeff_lag * (fgd->ar_coeff_lag + 1);
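+ // 2*lag*(lag+1) coefficients cover the causal AR neighbourhood for luma;
+ // each chroma plane reads one extra coefficient for the collocated luma
+ // sample whenever luma scaling points are present.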
+ if (fgd->num_y_points)
+ for (int i = 0; i < num_y_pos; i++)
+ fgd->ar_coeffs_y[i] = dav1d_get_bits(gb, 8) - 128;
+ for (int pl = 0; pl < 2; pl++)
+ if (fgd->num_uv_points[pl] || fgd->chroma_scaling_from_luma) {
+ const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
+ for (int i = 0; i < num_uv_pos; i++)
+ fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
+ if (!fgd->num_y_points)
+ fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
+ }
+ fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
+ fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
+ for (int pl = 0; pl < 2; pl++)
+ if (fgd->num_uv_points[pl]) {
+ fgd->uv_mult[pl] = dav1d_get_bits(gb, 8) - 128;
+ fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128;
+ fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256;
+ }
+ fgd->overlap_flag = dav1d_get_bit(gb);
+ fgd->clip_to_restricted_range = dav1d_get_bit(gb);
+ }
+ } else {
+ memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data));
+ }
+#if DEBUG_FRAME_HDR
+ printf("HDR: post-filmgrain: off=%td\n",
+ (gb->ptr - init_ptr) * 8 - gb->bits_left);
+#endif
+
+ return 0;
+
+error:
+ dav1d_log(c, "Error parsing frame header\n");
+ return DAV1D_ERR(EINVAL);
+}
+
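+// Parse a tile group header: an optional flag signals an explicit
+// [start, end] tile range; otherwise the tile group covers all tiles of
+// the frame.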
+static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
+ const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
+ const int have_tile_pos = n_tiles > 1 ? dav1d_get_bit(gb) : 0;
+
+ if (have_tile_pos) {
+ const int n_bits = c->frame_hdr->tiling.log2_cols +
+ c->frame_hdr->tiling.log2_rows;
+ c->tile[c->n_tile_data].start = dav1d_get_bits(gb, n_bits);
+ c->tile[c->n_tile_data].end = dav1d_get_bits(gb, n_bits);
+ } else {
+ c->tile[c->n_tile_data].start = 0;
+ c->tile[c->n_tile_data].end = n_tiles - 1;
+ }
+}
+
+// Check that we haven't read more than obu_len bytes from the buffer
+// since init_bit_pos.
+static int check_for_overrun(Dav1dContext *const c, GetBits *const gb,
+ const unsigned init_bit_pos,
+ const unsigned obu_len)
+{
+ // Make sure we haven't actually read past the end of the gb buffer
+ if (gb->error) {
+ dav1d_log(c, "Overrun in OBU bit buffer\n");
+ return 1;
+ }
+
+ const unsigned pos = dav1d_get_bits_pos(gb);
+
+ // init_bit_pos was the bit position of the buffer at some point in the
+ // past, so pos cannot be smaller than it.
+ assert (init_bit_pos <= pos);
+
+ if (pos - init_bit_pos > 8 * obu_len) {
+ dav1d_log(c, "Overrun in OBU bit buffer into next OBU\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int global) {
+ GetBits gb;
+ int res;
+
+ dav1d_init_get_bits(&gb, in->data, in->sz);
+
+ // obu header
+ dav1d_get_bit(&gb); // obu_forbidden_bit
+ const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
+ const int has_extension = dav1d_get_bit(&gb);
+ const int has_length_field = dav1d_get_bit(&gb);
+ dav1d_get_bit(&gb); // reserved
+
+ int temporal_id = 0, spatial_id = 0;
+ if (has_extension) {
+ temporal_id = dav1d_get_bits(&gb, 3);
+ spatial_id = dav1d_get_bits(&gb, 2);
+ dav1d_get_bits(&gb, 3); // reserved
+ }
+
+ // obu length field
+ const unsigned len = has_length_field ?
+ dav1d_get_uleb128(&gb) : (unsigned) in->sz - 1 - has_extension;
+ if (gb.error) goto error;
+
+ const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
+ const unsigned init_byte_pos = init_bit_pos >> 3;
+
+ // We must have read a whole number of bytes at this point (1 byte
+ // for the header and whole bytes at a time when reading the
+ // leb128 length field).
+ assert((init_bit_pos & 7) == 0);
+
+ // We also know that we haven't tried to read more than in->sz
+ // bytes yet (otherwise the error flag would have been set by the
+ // code in getbits.c)
+ assert(in->sz >= init_byte_pos);
+
+ // Make sure that there are enough bits left in the buffer for the
+ // rest of the OBU.
+ if (len > in->sz - init_byte_pos) goto error;
+
+ // skip obu not belonging to the selected temporal/spatial layer
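+ // (operating_point_idc has one bit per temporal layer in its low 8 bits
+ // and one bit per spatial layer in bits 8..11)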
+ if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD &&
+ has_extension && c->operating_point_idc != 0)
+ {
+ const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
+ const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1;
+ if (!in_temporal_layer || !in_spatial_layer)
+ return len + init_byte_pos;
+ }
+
+ switch (type) {
+ case DAV1D_OBU_SEQ_HDR: {
+ Dav1dRef *ref = dav1d_ref_create_using_pool(c->seq_hdr_pool,
+ sizeof(Dav1dSequenceHeader));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+ Dav1dSequenceHeader *seq_hdr = ref->data;
+ if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
+ dav1d_ref_dec(&ref);
+ goto error;
+ }
+ if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+ dav1d_ref_dec(&ref);
+ goto error;
+ }
+ // If we have read a sequence header which is different from
+ // the old one, this is a new video sequence and can't use any
+ // previous state. Free that state.
+
+ if (!c->seq_hdr) {
+ c->frame_hdr = NULL;
+ c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE;
+ // see 7.5, operating_parameter_info is allowed to change in
+ // sequence headers of a single sequence
+ } else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) {
+ c->frame_hdr = NULL;
+ c->mastering_display = NULL;
+ c->content_light = NULL;
+ dav1d_ref_dec(&c->mastering_display_ref);
+ dav1d_ref_dec(&c->content_light_ref);
+ for (int i = 0; i < 8; i++) {
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_ref_dec(&c->refs[i].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ }
+ c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE;
+ // If operating_parameter_info changed, signal it
+ } else if (memcmp(seq_hdr->operating_parameter_info, c->seq_hdr->operating_parameter_info,
+ sizeof(seq_hdr->operating_parameter_info)))
+ {
+ c->frame_flags |= PICTURE_FLAG_NEW_OP_PARAMS_INFO;
+ }
+ dav1d_ref_dec(&c->seq_hdr_ref);
+ c->seq_hdr_ref = ref;
+ c->seq_hdr = seq_hdr;
+ break;
+ }
+ case DAV1D_OBU_REDUNDANT_FRAME_HDR:
+ if (c->frame_hdr) break;
+ // fall-through
+ case DAV1D_OBU_FRAME:
+ case DAV1D_OBU_FRAME_HDR:
+ if (global) break;
+ if (!c->seq_hdr) goto error;
+ if (!c->frame_hdr_ref) {
+ c->frame_hdr_ref = dav1d_ref_create_using_pool(c->frame_hdr_pool,
+ sizeof(Dav1dFrameHeader));
+ if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
+ }
+#ifndef NDEBUG
+ // ensure that the reference is writable
+ assert(dav1d_ref_is_writable(c->frame_hdr_ref));
+#endif
+ c->frame_hdr = c->frame_hdr_ref->data;
+ memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
+ c->frame_hdr->temporal_id = temporal_id;
+ c->frame_hdr->spatial_id = spatial_id;
+ if ((res = parse_frame_hdr(c, &gb)) < 0) {
+ c->frame_hdr = NULL;
+ goto error;
+ }
+ for (int n = 0; n < c->n_tile_data; n++)
+ dav1d_data_unref_internal(&c->tile[n].data);
+ c->n_tile_data = 0;
+ c->n_tiles = 0;
+ if (type != DAV1D_OBU_FRAME) {
+ // This is actually a frame header OBU so read the
+ // trailing bit and check for overrun.
+ dav1d_get_bit(&gb);
+ if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+ c->frame_hdr = NULL;
+ goto error;
+ }
+ }
+
+ if (c->frame_size_limit && (int64_t)c->frame_hdr->width[1] *
+ c->frame_hdr->height > c->frame_size_limit)
+ {
+ dav1d_log(c, "Frame size %dx%d exceeds limit %u\n", c->frame_hdr->width[1],
+ c->frame_hdr->height, c->frame_size_limit);
+ c->frame_hdr = NULL;
+ return DAV1D_ERR(ERANGE);
+ }
+
+ if (type != DAV1D_OBU_FRAME)
+ break;
+ // OBU_FRAMEs shouldn't be signaled with show_existing_frame
+ if (c->frame_hdr->show_existing_frame) {
+ c->frame_hdr = NULL;
+ goto error;
+ }
+
+ // This is the frame header at the start of a frame OBU.
+ // There's no trailing bit at the end to skip, but we do need
+ // to align to the next byte.
+ dav1d_bytealign_get_bits(&gb);
+ // fall-through
+ case DAV1D_OBU_TILE_GRP: {
+ if (global) break;
+ if (!c->frame_hdr) goto error;
+ if (c->n_tile_data_alloc < c->n_tile_data + 1) {
+ if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
+ struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
+ if (!tile) goto error;
+ c->tile = tile;
+ memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
+ c->n_tile_data_alloc = c->n_tile_data + 1;
+ }
+ parse_tile_hdr(c, &gb);
+ // Align to the next byte boundary and check for overrun.
+ dav1d_bytealign_get_bits(&gb);
+ if (check_for_overrun(c, &gb, init_bit_pos, len))
+ goto error;
+ // The current bit position is a multiple of 8 (because we
+ // just aligned it) and less than 8*pkt_bytelen because
+ // otherwise the overrun check would have fired.
+ const unsigned pkt_bytelen = init_byte_pos + len;
+ const unsigned bit_pos = dav1d_get_bits_pos(&gb);
+ assert((bit_pos & 7) == 0);
+ assert(pkt_bytelen >= (bit_pos >> 3));
+ dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
+ c->tile[c->n_tile_data].data.data += bit_pos >> 3;
+ c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
+ // ensure tile groups are in order and sane, see 6.10.1
+ if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
+ c->tile[c->n_tile_data].start != c->n_tiles)
+ {
+ for (int i = 0; i <= c->n_tile_data; i++)
+ dav1d_data_unref_internal(&c->tile[i].data);
+ c->n_tile_data = 0;
+ c->n_tiles = 0;
+ goto error;
+ }
+ c->n_tiles += 1 + c->tile[c->n_tile_data].end -
+ c->tile[c->n_tile_data].start;
+ c->n_tile_data++;
+ break;
+ }
+ case DAV1D_OBU_METADATA: {
+#define DEBUG_OBU_METADATA 0
+#if DEBUG_OBU_METADATA
+ const uint8_t *const init_ptr = gb.ptr;
+#endif
+ // obu metadata type field
+ const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
+ const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
+ if (gb.error) goto error;
+
+ switch (meta_type) {
+ case OBU_META_HDR_CLL: {
+ Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+ Dav1dContentLightLevel *const content_light = ref->data;
+
+ content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("CLLOBU: max-content-light-level: %d [off=%td]\n",
+ content_light->max_content_light_level,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n",
+ content_light->max_frame_average_light_level,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+
+ // Skip the trailing bit, align to the next byte boundary and check for overrun.
+ dav1d_get_bit(&gb);
+ dav1d_bytealign_get_bits(&gb);
+ if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+ dav1d_ref_dec(&ref);
+ goto error;
+ }
+
+ dav1d_ref_dec(&c->content_light_ref);
+ c->content_light = content_light;
+ c->content_light_ref = ref;
+ break;
+ }
+ case OBU_META_HDR_MDCV: {
+ Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+ Dav1dMasteringDisplay *const mastering_display = ref->data;
+
+ for (int i = 0; i < 3; i++) {
+ mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
+ mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i,
+ mastering_display->primaries[i][0],
+ mastering_display->primaries[i][1],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ }
+ mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: white-point-x: %d [off=%td]\n",
+ mastering_display->white_point[0],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ mastering_display->white_point[1] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: white-point-y: %d [off=%td]\n",
+ mastering_display->white_point[1],
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ mastering_display->max_luminance = dav1d_get_bits(&gb, 32);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: max-luminance: %d [off=%td]\n",
+ mastering_display->max_luminance,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ mastering_display->min_luminance = dav1d_get_bits(&gb, 32);
+#if DEBUG_OBU_METADATA
+ printf("MDCVOBU: min-luminance: %d [off=%td]\n",
+ mastering_display->min_luminance,
+ (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
+ // Skip the trailing bit, align to the next byte boundary and check for overrun.
+ dav1d_get_bit(&gb);
+ dav1d_bytealign_get_bits(&gb);
+ if (check_for_overrun(c, &gb, init_bit_pos, len)) {
+ dav1d_ref_dec(&ref);
+ goto error;
+ }
+
+ dav1d_ref_dec(&c->mastering_display_ref);
+ c->mastering_display = mastering_display;
+ c->mastering_display_ref = ref;
+ break;
+ }
+ case OBU_META_ITUT_T35: {
+ int payload_size = len;
+ // Don't take into account all the trailing bits for payload_size
+ while (payload_size > 0 && !in->data[init_byte_pos + payload_size - 1])
+ payload_size--; // trailing_zero_bit x 8
+ payload_size--; // trailing_one_bit + trailing_zero_bit x 7
+
+ // Don't take into account meta_type bytes
+ payload_size -= meta_type_len;
+
+ int country_code_extension_byte = 0;
+ const int country_code = dav1d_get_bits(&gb, 8);
+ payload_size--;
+ if (country_code == 0xFF) {
+ country_code_extension_byte = dav1d_get_bits(&gb, 8);
+ payload_size--;
+ }
+
+ if (payload_size <= 0) {
+ dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
+ break;
+ }
+
+ Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
+ if (!ref) return DAV1D_ERR(ENOMEM);
+ Dav1dITUTT35 *const itut_t35_metadata = ref->data;
+
+ // We need our public headers to be C++ compatible, so payload can't be
+ // a flexible array member
+ itut_t35_metadata->payload = (uint8_t *) &itut_t35_metadata[1];
+ itut_t35_metadata->country_code = country_code;
+ itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
+ for (int i = 0; i < payload_size; i++)
+ itut_t35_metadata->payload[i] = dav1d_get_bits(&gb, 8);
+ itut_t35_metadata->payload_size = payload_size;
+
+ dav1d_ref_dec(&c->itut_t35_ref);
+ c->itut_t35 = itut_t35_metadata;
+ c->itut_t35_ref = ref;
+ break;
+ }
+ case OBU_META_SCALABILITY:
+ case OBU_META_TIMECODE:
+ // ignore metadata OBUs we don't care about
+ break;
+ default:
+ // print a warning but don't fail for unknown types
+ dav1d_log(c, "Unknown Metadata OBU type %d\n", meta_type);
+ break;
+ }
+
+ break;
+ }
+ case DAV1D_OBU_TD:
+ c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT;
+ break;
+ case DAV1D_OBU_PADDING:
+ // ignore OBUs we don't care about
+ break;
+ default:
+ // print a warning but don't fail for unknown types
+ dav1d_log(c, "Unknown OBU type %d of size %u\n", type, len);
+ break;
+ }
+
+ if (c->seq_hdr && c->frame_hdr) {
+ if (c->frame_hdr->show_existing_frame) {
+ if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr) goto error;
+ switch (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type) {
+ case DAV1D_FRAME_TYPE_INTER:
+ case DAV1D_FRAME_TYPE_SWITCH:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE)
+ goto skip;
+ break;
+ case DAV1D_FRAME_TYPE_INTRA:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA)
+ goto skip;
+ // fall-through
+ default:
+ break;
+ }
+ if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error;
+ if (c->strict_std_compliance &&
+ !c->refs[c->frame_hdr->existing_frame_idx].p.showable)
+ {
+ goto error;
+ }
+ if (c->n_fc == 1) {
+ dav1d_thread_picture_ref(&c->out,
+ &c->refs[c->frame_hdr->existing_frame_idx].p);
+ dav1d_picture_copy_props(&c->out.p,
+ c->content_light, c->content_light_ref,
+ c->mastering_display, c->mastering_display_ref,
+ c->itut_t35, c->itut_t35_ref,
+ &in->m);
+ // Must be removed from the context after being attached to the frame
+ dav1d_ref_dec(&c->itut_t35_ref);
+ c->itut_t35 = NULL;
+ c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
+ } else {
+ pthread_mutex_lock(&c->task_thread.lock);
+ // need to append this to the frame output queue
+ const unsigned next = c->frame_thread.next++;
+ if (c->frame_thread.next == c->n_fc)
+ c->frame_thread.next = 0;
+
+ Dav1dFrameContext *const f = &c->fc[next];
+ while (f->n_tile_data > 0)
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
+ Dav1dThreadPicture *const out_delayed =
+ &c->frame_thread.out_delayed[next];
+ if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
+ unsigned first = atomic_load(&c->task_thread.first);
+ if (first + 1U < c->n_fc)
+ atomic_fetch_add(&c->task_thread.first, 1U);
+ else
+ atomic_store(&c->task_thread.first, 0);
+ atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
+ &first, UINT_MAX);
+ if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
+ c->task_thread.cur--;
+ }
+ const int error = f->task_thread.retval;
+ if (error) {
+ c->cached_error = error;
+ f->task_thread.retval = 0;
+ dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
+ dav1d_thread_picture_unref(out_delayed);
+ } else if (out_delayed->p.data[0]) {
+ const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
+ memory_order_relaxed);
+ if ((out_delayed->visible || c->output_invisible_frames) &&
+ progress != FRAME_ERROR)
+ {
+ dav1d_thread_picture_ref(&c->out, out_delayed);
+ c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
+ }
+ dav1d_thread_picture_unref(out_delayed);
+ }
+ dav1d_thread_picture_ref(out_delayed,
+ &c->refs[c->frame_hdr->existing_frame_idx].p);
+ out_delayed->visible = 1;
+ dav1d_picture_copy_props(&out_delayed->p,
+ c->content_light, c->content_light_ref,
+ c->mastering_display, c->mastering_display_ref,
+ c->itut_t35, c->itut_t35_ref,
+ &in->m);
+ // Must be removed from the context after being attached to the frame
+ dav1d_ref_dec(&c->itut_t35_ref);
+ c->itut_t35 = NULL;
+
+ pthread_mutex_unlock(&c->task_thread.lock);
+ }
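+ // Showing an existing keyframe resets the reference state: all other
+ // reference slots are overwritten with this frame's picture, CDFs and
+ // segmentation map.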
+ if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
+ const int r = c->frame_hdr->existing_frame_idx;
+ c->refs[r].p.showable = 0;
+ for (int i = 0; i < 8; i++) {
+ if (i == r) continue;
+
+ if (c->refs[i].p.p.frame_hdr)
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p);
+
+ dav1d_cdf_thread_unref(&c->cdf[i]);
+ dav1d_cdf_thread_ref(&c->cdf[i], &c->cdf[r]);
+
+ dav1d_ref_dec(&c->refs[i].segmap);
+ c->refs[i].segmap = c->refs[r].segmap;
+ if (c->refs[r].segmap)
+ dav1d_ref_inc(c->refs[r].segmap);
+ dav1d_ref_dec(&c->refs[i].refmvs);
+ }
+ }
+ c->frame_hdr = NULL;
+ } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
+ switch (c->frame_hdr->frame_type) {
+ case DAV1D_FRAME_TYPE_INTER:
+ case DAV1D_FRAME_TYPE_SWITCH:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE ||
+ (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
+ !c->frame_hdr->refresh_frame_flags))
+ goto skip;
+ break;
+ case DAV1D_FRAME_TYPE_INTRA:
+ if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA ||
+ (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE &&
+ !c->frame_hdr->refresh_frame_flags))
+ goto skip;
+ // fall-through
+ default:
+ break;
+ }
+ if (!c->n_tile_data)
+ goto error;
+ if ((res = dav1d_submit_frame(c)) < 0)
+ return res;
+ assert(!c->n_tile_data);
+ c->frame_hdr = NULL;
+ c->n_tiles = 0;
+ }
+ }
+
+ return len + init_byte_pos;
+
+skip:
+ // update refs with only the headers in case we skip the frame
+ for (int i = 0; i < 8; i++) {
+ if (c->frame_hdr->refresh_frame_flags & (1 << i)) {
+ dav1d_thread_picture_unref(&c->refs[i].p);
+ c->refs[i].p.p.frame_hdr = c->frame_hdr;
+ c->refs[i].p.p.seq_hdr = c->seq_hdr;
+ c->refs[i].p.p.frame_hdr_ref = c->frame_hdr_ref;
+ c->refs[i].p.p.seq_hdr_ref = c->seq_hdr_ref;
+ dav1d_ref_inc(c->frame_hdr_ref);
+ dav1d_ref_inc(c->seq_hdr_ref);
+ }
+ }
+
+ dav1d_ref_dec(&c->frame_hdr_ref);
+ c->frame_hdr = NULL;
+ c->n_tiles = 0;
+
+ return len + init_byte_pos;
+
+error:
+ dav1d_data_props_copy(&c->cached_error_props, &in->m);
+ dav1d_log(c, "Error parsing OBU data\n");
+ return DAV1D_ERR(EINVAL);
+}
diff --git a/third_party/dav1d/src/obu.h b/third_party/dav1d/src/obu.h
new file mode 100644
index 0000000000..aa79b5277a
--- /dev/null
+++ b/third_party/dav1d/src/obu.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_OBU_H
+#define DAV1D_SRC_OBU_H
+
+#include "dav1d/data.h"
+#include "src/internal.h"
+
+int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in, int global);
+
+#endif /* DAV1D_SRC_OBU_H */
diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c
new file mode 100644
index 0000000000..3e55d3f273
--- /dev/null
+++ b/third_party/dav1d/src/picture.c
@@ -0,0 +1,367 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/intops.h"
+#include "common/validate.h"
+
+#include "src/internal.h"
+#include "src/log.h"
+#include "src/picture.h"
+#include "src/ref.h"
+#include "src/thread.h"
+#include "src/thread_task.h"
+
+int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
+ assert(sizeof(Dav1dMemPoolBuffer) <= DAV1D_PICTURE_ALIGNMENT);
+ const int hbd = p->p.bpc > 8;
+ const int aligned_w = (p->p.w + 127) & ~127;
+ const int aligned_h = (p->p.h + 127) & ~127;
+ const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ ptrdiff_t y_stride = aligned_w << hbd;
+ ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0;
+ /* Due to how mapping of addresses to sets works in most L1 and L2 cache
+ * implementations, strides of multiples of certain power-of-two numbers
+ * may cause multiple rows of the same superblock to map to the same set,
+ * causing evictions of previous rows, resulting in a reduction in cache
+ * hit rate. Avoid that by slightly padding the stride when necessary. */
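+ /* For example, a 1000-pixel-wide 8 bpc frame rounds up to aligned_w =
+ * 1024, so y_stride = 1024 would be a multiple of 1024 and gets
+ * DAV1D_PICTURE_ALIGNMENT bytes of extra stride. */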
+ if (!(y_stride & 1023))
+ y_stride += DAV1D_PICTURE_ALIGNMENT;
+ if (!(uv_stride & 1023) && has_chroma)
+ uv_stride += DAV1D_PICTURE_ALIGNMENT;
+ p->stride[0] = y_stride;
+ p->stride[1] = uv_stride;
+ const size_t y_sz = y_stride * aligned_h;
+ const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
+ const size_t pic_size = y_sz + 2 * uv_sz;
+
+ Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(cookie, pic_size +
+ DAV1D_PICTURE_ALIGNMENT -
+ sizeof(Dav1dMemPoolBuffer));
+ if (!buf) return DAV1D_ERR(ENOMEM);
+ p->allocator_data = buf;
+
+ uint8_t *const data = buf->data;
+ p->data[0] = data;
+ p->data[1] = has_chroma ? data + y_sz : NULL;
+ p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL;
+
+ return 0;
+}
+
+void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
+ dav1d_mem_pool_push(cookie, p->allocator_data);
+}
+
+struct pic_ctx_context {
+ struct Dav1dRef *plane_ref[3]; /* MUST BE FIRST */
+ enum Dav1dPixelLayout layout;
+ void *extra_ptr; /* MUST BE AT THE END */
+};
+
+struct plane_ctx_context {
+ Dav1dPicAllocator allocator;
+ Dav1dPicture pic;
+};
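+
+// A picture carries two layers of reference counting: p->ref owns the
+// pic_ctx_context, whose plane_ref[] entries all share a single ref that
+// owns the plane_ctx_context; only when the last plane reference is dropped
+// is the user allocator's release callback invoked.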
+
+static void free_buffer(const uint8_t *const data, void *const user_data) {
+ struct pic_ctx_context *pic_ctx = user_data;
+ const int planes = pic_ctx->layout != DAV1D_PIXEL_LAYOUT_I400 ? 3 : 1;
+
+ for (int i = 0; i < planes; i++)
+ dav1d_ref_dec(&pic_ctx->plane_ref[i]);
+ free(pic_ctx);
+}
+
+static void free_plane_buffer(const uint8_t *const data, void *const user_data) {
+ struct plane_ctx_context *plane_ctx = user_data;
+
+ plane_ctx->allocator.release_picture_callback(&plane_ctx->pic,
+ plane_ctx->allocator.cookie);
+ free(plane_ctx);
+}
+
+static int picture_alloc_with_edges(Dav1dContext *const c,
+ Dav1dPicture *const p,
+ const int w, const int h,
+ Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref,
+ Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref,
+ Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref,
+ Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref,
+ Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref,
+ const int bpc,
+ const Dav1dDataProps *const props,
+ Dav1dPicAllocator *const p_allocator,
+ const size_t extra, void **const extra_ptr)
+{
+ if (p->data[0]) {
+ dav1d_log(c, "Picture already allocated!\n");
+ return -1;
+ }
+ assert(bpc > 0 && bpc <= 16);
+
+ struct pic_ctx_context *pic_ctx = malloc(extra + sizeof(struct pic_ctx_context));
+ if (pic_ctx == NULL)
+ return DAV1D_ERR(ENOMEM);
+ memset(pic_ctx, 0, sizeof(struct pic_ctx_context));
+
+ p->p.w = w;
+ p->p.h = h;
+ p->seq_hdr = seq_hdr;
+ p->frame_hdr = frame_hdr;
+ p->p.layout = seq_hdr->layout;
+ p->p.bpc = bpc;
+ dav1d_data_props_set_defaults(&p->m);
+ const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
+ if (res < 0) {
+ free(pic_ctx);
+ return res;
+ }
+
+ pic_ctx->layout = p->p.layout;
+
+ if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
+ p_allocator->release_picture_callback(p, p_allocator->cookie);
+ free(pic_ctx);
+ dav1d_log(c, "Failed to wrap picture: %s\n", strerror(errno));
+ return DAV1D_ERR(ENOMEM);
+ }
+
+ struct plane_ctx_context *plane_ctx = malloc(sizeof(struct plane_ctx_context));
+ if (plane_ctx == NULL) {
+ dav1d_ref_dec(&p->ref);
+ p_allocator->release_picture_callback(p, p_allocator->cookie);
+ return DAV1D_ERR(ENOMEM);
+ }
+
+ plane_ctx->allocator = *p_allocator;
+ plane_ctx->pic = *p;
+
+ pic_ctx->plane_ref[0] = dav1d_ref_wrap(p->data[0], free_plane_buffer, plane_ctx);
+ if (!pic_ctx->plane_ref[0]) {
+ dav1d_ref_dec(&p->ref);
+ p_allocator->release_picture_callback(p, p_allocator->cookie);
+ free(plane_ctx);
+ dav1d_log(c, "Failed to wrap picture plane: %s\n", strerror(errno));
+ return DAV1D_ERR(ENOMEM);
+ }
+
+ const int planes = p->p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 3 : 1;
+ for (int i = 1; i < planes; i++) {
+ pic_ctx->plane_ref[i] = pic_ctx->plane_ref[0];
+ dav1d_ref_inc(pic_ctx->plane_ref[i]);
+ }
+
+ p->seq_hdr_ref = seq_hdr_ref;
+ if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
+
+ p->frame_hdr_ref = frame_hdr_ref;
+ if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
+
+ dav1d_picture_copy_props(p, content_light, content_light_ref,
+ mastering_display, mastering_display_ref,
+ itut_t35, itut_t35_ref, props);
+
+ if (extra && extra_ptr)
+ *extra_ptr = &pic_ctx->extra_ptr;
+
+ return 0;
+}
+
+void dav1d_picture_copy_props(Dav1dPicture *const p,
+ Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref,
+ Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref,
+ Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref,
+ const Dav1dDataProps *const props)
+{
+ dav1d_data_props_copy(&p->m, props);
+
+ dav1d_ref_dec(&p->content_light_ref);
+ p->content_light_ref = content_light_ref;
+ p->content_light = content_light;
+ if (content_light_ref) dav1d_ref_inc(content_light_ref);
+
+ dav1d_ref_dec(&p->mastering_display_ref);
+ p->mastering_display_ref = mastering_display_ref;
+ p->mastering_display = mastering_display;
+ if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
+
+ dav1d_ref_dec(&p->itut_t35_ref);
+ p->itut_t35_ref = itut_t35_ref;
+ p->itut_t35 = itut_t35;
+ if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
+}
+
+int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f,
+ const int bpc)
+{
+ Dav1dThreadPicture *const p = &f->sr_cur;
+ const int have_frame_mt = c->n_fc > 1;
+
+ const int res =
+ picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height,
+ f->seq_hdr, f->seq_hdr_ref,
+ f->frame_hdr, f->frame_hdr_ref,
+ c->content_light, c->content_light_ref,
+ c->mastering_display, c->mastering_display_ref,
+ c->itut_t35, c->itut_t35_ref,
+ bpc, &f->tile[0].data.m, &c->allocator,
+ have_frame_mt ? sizeof(atomic_int) * 2 : 0,
+ (void **) &p->progress);
+ if (res) return res;
+
+ // Must be removed from the context after being attached to the frame
+ dav1d_ref_dec(&c->itut_t35_ref);
+ c->itut_t35 = NULL;
+
+ // Don't clear these flags from c->frame_flags if the frame is not visible.
+ // This way they will be added to the next visible frame too.
+ const int flags_mask = (f->frame_hdr->show_frame || c->output_invisible_frames)
+ ? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO);
+ p->flags = c->frame_flags;
+ c->frame_flags &= flags_mask;
+
+ p->visible = f->frame_hdr->show_frame;
+ p->showable = f->frame_hdr->showable_frame;
+ if (have_frame_mt) {
+ atomic_init(&p->progress[0], 0);
+ atomic_init(&p->progress[1], 0);
+ }
+ return res;
+}
+
+int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, const int w,
+ const Dav1dPicture *const src)
+{
+ struct pic_ctx_context *const pic_ctx = src->ref->user_data;
+ struct plane_ctx_context *const plane_ctx = pic_ctx->plane_ref[0]->user_data;
+ const int res = picture_alloc_with_edges(c, dst, w, src->p.h,
+ src->seq_hdr, src->seq_hdr_ref,
+ src->frame_hdr, src->frame_hdr_ref,
+ src->content_light, src->content_light_ref,
+ src->mastering_display, src->mastering_display_ref,
+ src->itut_t35, src->itut_t35_ref,
+ src->p.bpc, &src->m, &plane_ctx->allocator,
+ 0, NULL);
+ return res;
+}
+
+void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
+ validate_input(dst != NULL);
+ validate_input(dst->data[0] == NULL);
+ validate_input(src != NULL);
+
+ if (src->ref) {
+ validate_input(src->data[0] != NULL);
+ dav1d_ref_inc(src->ref);
+ }
+ if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
+ if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
+ if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+ if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
+ if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
+ if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
+ *dst = *src;
+}
+
+void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
+ validate_input(dst != NULL);
+ validate_input(dst->data[0] == NULL);
+ validate_input(src != NULL);
+
+ if (src->ref)
+ validate_input(src->data[0] != NULL);
+
+ *dst = *src;
+ memset(src, 0, sizeof(*src));
+}
+
+void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
+ const Dav1dThreadPicture *const src)
+{
+ dav1d_picture_ref(&dst->p, &src->p);
+ dst->visible = src->visible;
+ dst->showable = src->showable;
+ dst->progress = src->progress;
+ dst->flags = src->flags;
+}
+
+void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst,
+ Dav1dThreadPicture *const src)
+{
+ dav1d_picture_move_ref(&dst->p, &src->p);
+ dst->visible = src->visible;
+ dst->showable = src->showable;
+ dst->progress = src->progress;
+ dst->flags = src->flags;
+ memset(src, 0, sizeof(*src));
+}
+
+void dav1d_picture_unref_internal(Dav1dPicture *const p) {
+ validate_input(p != NULL);
+
+ if (p->ref) {
+ validate_input(p->data[0] != NULL);
+ dav1d_ref_dec(&p->ref);
+ }
+ dav1d_ref_dec(&p->seq_hdr_ref);
+ dav1d_ref_dec(&p->frame_hdr_ref);
+ dav1d_ref_dec(&p->m.user_data.ref);
+ dav1d_ref_dec(&p->content_light_ref);
+ dav1d_ref_dec(&p->mastering_display_ref);
+ dav1d_ref_dec(&p->itut_t35_ref);
+ memset(p, 0, sizeof(*p));
+ dav1d_data_props_set_defaults(&p->m);
+}
+
+void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
+ dav1d_picture_unref_internal(&p->p);
+
+ p->progress = NULL;
+}
+
+enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *const p) {
+ if (!p->flags)
+ return 0;
+
+ enum Dav1dEventFlags flags = 0;
+ if (p->flags & PICTURE_FLAG_NEW_SEQUENCE)
+ flags |= DAV1D_EVENT_FLAG_NEW_SEQUENCE;
+ if (p->flags & PICTURE_FLAG_NEW_OP_PARAMS_INFO)
+ flags |= DAV1D_EVENT_FLAG_NEW_OP_PARAMS_INFO;
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/picture.h b/third_party/dav1d/src/picture.h
new file mode 100644
index 0000000000..0c3a0ec562
--- /dev/null
+++ b/third_party/dav1d/src/picture.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PICTURE_H
+#define DAV1D_SRC_PICTURE_H
+
+#include <stdatomic.h>
+
+#include "src/thread.h"
+#include "dav1d/picture.h"
+
+#include "src/thread_data.h"
+#include "src/ref.h"
+
+enum PlaneType {
+ PLANE_TYPE_Y,
+ PLANE_TYPE_UV,
+ PLANE_TYPE_BLOCK,
+ PLANE_TYPE_ALL,
+};
+
+enum PictureFlags {
+ PICTURE_FLAG_NEW_SEQUENCE = 1 << 0,
+ PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1,
+ PICTURE_FLAG_NEW_TEMPORAL_UNIT = 1 << 2,
+};
+
+typedef struct Dav1dThreadPicture {
+ Dav1dPicture p;
+ int visible;
+ // This can be set for inter frames, non-key intra frames, or for invisible
+ // keyframes that have not yet been made visible using the show-existing-frame
+ // mechanism.
+ int showable;
+ enum PictureFlags flags;
+ // [0] block data (including segmentation map and motion vectors)
+ // [1] pixel data
+ atomic_uint *progress;
+} Dav1dThreadPicture;
+
+typedef struct Dav1dPictureBuffer {
+ void *data;
+ struct Dav1dPictureBuffer *next;
+} Dav1dPictureBuffer;
+
+/**
+ * Allocate a picture with custom border size.
+ */
+int dav1d_thread_picture_alloc(Dav1dContext *c, Dav1dFrameContext *f, const int bpc);
+
+/**
+ * Allocate a picture with identical metadata to an existing picture.
+ * The width is a separate argument so this function can be used for
+ * super-res, where the width changes, but everything else is the same.
+ * For the more typical use case of allocating a new image of the same
+ * dimensions, use src->p.w as width.
+ */
+int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w,
+ const Dav1dPicture *src);
+
+/**
+ * Create a copy of a picture.
+ */
+void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
+void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
+ const Dav1dThreadPicture *src);
+void dav1d_thread_picture_move_ref(Dav1dThreadPicture *dst,
+ Dav1dThreadPicture *src);
+void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
+
+/**
+ * Move a picture reference.
+ */
+void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src);
+
+int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
+void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
+void dav1d_picture_unref_internal(Dav1dPicture *p);
+
+void dav1d_picture_copy_props(Dav1dPicture *p,
+ Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
+ Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
+ Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref,
+ const Dav1dDataProps *props);
+
+/**
+ * Get event flags from picture flags.
+ */
+enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *p);
+
+#endif /* DAV1D_SRC_PICTURE_H */
diff --git a/third_party/dav1d/src/ppc/cdef.h b/third_party/dav1d/src/ppc/cdef.h
new file mode 100644
index 0000000000..b794ba53be
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cdef.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+
+#include "common/bitdepth.h"
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/cpu.h"
+
+#define cdef_vsx_fn(w, h) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges)
+
+cdef_vsx_fn(4, 4);
+cdef_vsx_fn(4, 8);
+cdef_vsx_fn(8, 8);
+
+static ALWAYS_INLINE void cdef_dsp_init_ppc(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+ c->fb[0] = dav1d_cdef_filter_8x8_vsx;
+ c->fb[1] = dav1d_cdef_filter_4x8_vsx;
+ c->fb[2] = dav1d_cdef_filter_4x4_vsx;
+#endif
+}
diff --git a/third_party/dav1d/src/ppc/cdef_tmpl.c b/third_party/dav1d/src/ppc/cdef_tmpl.c
new file mode 100644
index 0000000000..e2e759810f
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cdef_tmpl.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/ppc/dav1d_types.h"
+#include "src/ppc/cdef.h"
+
+#if BITDEPTH == 8
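+// Vectorized CDEF constrain():
+//   constrain(diff, t, d) = sign(diff) * min(|diff|, max(0, t - (|diff| >> max(0, d - log2(t)))))
+// applied to eight 16-bit lanes at a time.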
+static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
+ const int damping)
+{
+ const i16x8 zero = vec_splat_s16(0);
+ if (!threshold) return zero;
+ const uint16_t shift = imax(0, damping - ulog2(threshold));
+ const i16x8 abs_diff = vec_abs(diff);
+ const b16x8 mask = vec_cmplt(diff, zero);
+ const i16x8 thr = vec_splats(threshold);
+ const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
+ const i16x8 max = vec_max(zero, sub);
+ const i16x8 min = vec_min(abs_diff, max);
+ const i16x8 neg = vec_sub(zero, min);
+ return vec_sel(min, neg, mask);
+}
+
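+// Copy a 4-pixel-wide source block, plus a 2-pixel border on every side,
+// into a 16-bit temporary buffer; samples outside the available edges are
+// filled with INT16_MAX so that max_mask() below can ignore them when
+// tracking the clamping range.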
+static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+ const uint8_t *src, const ptrdiff_t src_stride,
+ const uint8_t (*left)[2], const uint8_t *const top,
+ const uint8_t *const bottom, const int w, const int h,
+ const enum CdefEdgeFlags edges)
+{
+ const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+ u16x8 l0;
+ u16x8 l1;
+
+ int y_start = -2, y_end = h + 2;
+
+ // Copy top and bottom first
+ if (!(edges & CDEF_HAVE_TOP)) {
+ l0 = fill;
+ l1 = fill;
+ y_start = 0;
+ } else {
+ l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
+ l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
+ }
+
+ vec_st(l0, 0, tmp - 2 * 8);
+ vec_st(l1, 0, tmp - 1 * 8);
+
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ l0 = fill;
+ l1 = fill;
+ y_end -= 2;
+ } else {
+ l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2));
+ l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2));
+ }
+
+ vec_st(l0, 0, tmp + (h + 0) * 8);
+ vec_st(l1, 0, tmp + (h + 1) * 8);
+
+ int y_with_left_edge = 0;
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ u16x8 l = u8h_to_u16(vec_vsx_ld(0, src));
+ vec_vsx_st(l, 0, tmp + 2);
+
+ y_with_left_edge = 1;
+ }
+
+ for (int y = y_with_left_edge; y < h; y++) {
+ u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
+ vec_st(l, 0, tmp + y * 8);
+ }
+
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[y * 8] = INT16_MAX;
+ tmp[1 + y * 8] = INT16_MAX;
+ }
+ } else {
+ for (int y = 0; y < h; y++) {
+ tmp[y * 8] = left[y][0];
+ tmp[1 + y * 8] = left[y][1];
+ }
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[- 2 + (y + 1) * 8] = INT16_MAX;
+ tmp[- 1 + (y + 1) * 8] = INT16_MAX;
+ }
+ }
+}
+
+static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+ const uint8_t *src, const ptrdiff_t src_stride,
+ const uint8_t (*left)[2], const uint8_t *const top,
+ const uint8_t *const bottom, const int w, const int h,
+ const enum CdefEdgeFlags edges)
+{
+ const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
+
+ u16x8 l0h, l0l;
+ u16x8 l1h, l1l;
+
+ int y_start = -2, y_end = h + 2;
+
+ // Copy top and bottom first
+ if (!(edges & CDEF_HAVE_TOP)) {
+ l0h = fill;
+ l0l = fill;
+ l1h = fill;
+ l1l = fill;
+ y_start = 0;
+ } else {
+ u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
+ u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
+ l0h = u8h_to_u16(l0);
+ l0l = u8l_to_u16(l0);
+ l1h = u8h_to_u16(l1);
+ l1l = u8l_to_u16(l1);
+ }
+
+ vec_st(l0h, 0, tmp - 4 * 8);
+ vec_st(l0l, 0, tmp - 3 * 8);
+ vec_st(l1h, 0, tmp - 2 * 8);
+ vec_st(l1l, 0, tmp - 1 * 8);
+
+ if (!(edges & CDEF_HAVE_BOTTOM)) {
+ l0h = fill;
+ l0l = fill;
+ l1h = fill;
+ l1l = fill;
+ y_end -= 2;
+ } else {
+ u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2);
+ u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2);
+ l0h = u8h_to_u16(l0);
+ l0l = u8l_to_u16(l0);
+ l1h = u8h_to_u16(l1);
+ l1l = u8l_to_u16(l1);
+ }
+
+ vec_st(l0h, 0, tmp + (h + 0) * 16);
+ vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
+ vec_st(l1h, 0, tmp + (h + 1) * 16);
+ vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
+
+ int y_with_left_edge = 0;
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ u8x16 l = vec_vsx_ld(0, src);
+ u16x8 lh = u8h_to_u16(l);
+ u16x8 ll = u8l_to_u16(l);
+ vec_vsx_st(lh, 0, tmp + 2);
+ vec_vsx_st(ll, 0, tmp + 8 + 2);
+
+ y_with_left_edge = 1;
+ }
+
+ for (int y = y_with_left_edge; y < h; y++) {
+ u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
+ u16x8 lh = u8h_to_u16(l);
+ u16x8 ll = u8l_to_u16(l);
+ vec_st(lh, 0, tmp + y * 16);
+ vec_st(ll, 0, tmp + 8 + y * 16);
+ }
+
+ if (!(edges & CDEF_HAVE_LEFT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[y * 16] = INT16_MAX;
+ tmp[1 + y * 16] = INT16_MAX;
+ }
+ } else {
+ for (int y = 0; y < h; y++) {
+ tmp[y * 16] = left[y][0];
+ tmp[1 + y * 16] = left[y][1];
+ }
+ }
+ if (!(edges & CDEF_HAVE_RIGHT)) {
+ for (int y = y_start; y < y_end; y++) {
+ tmp[- 6 + (y + 1) * 16] = INT16_MAX;
+ tmp[- 5 + (y + 1) * 16] = INT16_MAX;
+ }
+ }
+}
+
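+// Like vec_max(), but lanes of a equal to INT16_MAX (the fill value used
+// for unavailable edge pixels) are replaced by b instead of dominating the
+// maximum.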
+static inline i16x8 max_mask(i16x8 a, i16x8 b) {
+ const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX);
+
+ const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX);
+
+ const i16x8 val = vec_sel(a, b, mask);
+
+ return vec_max(val, b);
+}
+
+#define LOAD_PIX(addr) \
+ const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
+ i16x8 max = px; \
+ i16x8 min = px; \
+ i16x8 sum = vec_splat_s16(0);
+
+#define LOAD_PIX4(addr) \
+ const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
+ const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+ const i16x8 px = vec_xxpermdi(a, b, 0); \
+ i16x8 max = px; \
+ i16x8 min = px; \
+ i16x8 sum = vec_splat_s16(0);
+
+#define LOAD_DIR(p, addr, o0, o1) \
+ const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
+ const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
+ const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
+ const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);
+
+#define LOAD_DIR4(p, addr, o0, o1) \
+ LOAD_DIR(p ## a, addr, o0, o1) \
+ LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+ const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
+ const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
+ const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
+ const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
+
+#define CONSTRAIN(p, strength) \
+ const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
+ const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
+ const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
+ const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
+\
+ i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
+ i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
+ i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
+ i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+
+#define MIN_MAX(p) \
+ max = max_mask(p ## 0, max); \
+ min = vec_min(p ## 0, min); \
+ max = max_mask(p ## 1, max); \
+ min = vec_min(p ## 1, min); \
+ max = max_mask(p ## 2, max); \
+ min = vec_min(p ## 2, min); \
+ max = max_mask(p ## 3, max); \
+ min = vec_min(p ## 3, min);
+
+#define PRI_0(p) \
+ p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
+ p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+
+#define PRI_1(p) \
+ p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
+ p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
+
+#define SEC_0(p) \
+ p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
+ p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
+ p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
+ p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+
+#define UPDATE_SUM(p) \
+ const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
+ const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
+ sum = vec_add(sum, p ## sum0); \
+ sum = vec_add(sum, p ## sum1);
+
+static inline void
+filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int sec_strength, const int dir,
+ const int damping, const enum CdefEdgeFlags edges,
+ const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+ const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
+ };
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+ const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+ const int off1 = cdef_directions[dir][0];
+ const int off1_1 = cdef_directions[dir][1];
+
+ const int off2 = cdef_directions[(dir + 2) & 7][0];
+ const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+ copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
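+ // Process two 4-pixel rows per iteration, packed into a single
+ // 8-lane vector.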
+ for (int y = 0; y < h / 2; y++) {
+ LOAD_PIX4(tmp)
+
+ // Primary pass
+ LOAD_DIR4(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength)
+
+ MIN_MAX(p)
+
+ PRI_0(p)
+ PRI_1(p)
+
+ UPDATE_SUM(p)
+
+ // Secondary pass 1
+ LOAD_DIR4(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength)
+
+ MIN_MAX(s)
+
+ SEC_0(s)
+
+ UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength)
+
+ MIN_MAX(s2)
+
+ UPDATE_SUM(s2)
+
+ // Store
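+ // output = px + ((sum + 8 - (sum < 0)) >> 4), clamped to the [min, max]
+ // range gathered from the filter taps.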
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+ bias = vec_sub(vec_splat_s16(8), bias);
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+ dst[0] = vdst[0];
+ dst[1] = vdst[1];
+ dst[2] = vdst[2];
+ dst[3] = vdst[3];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ dst[0] = vdst[4];
+ dst[1] = vdst[5];
+ dst[2] = vdst[6];
+ dst[3] = vdst[7];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ }
+}
+
+static inline void
+filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
+ const pixel (*left)[2], const pixel *const top,
+ const pixel *const bottom, const int w, const int h,
+ const int pri_strength, const int sec_strength, const int dir,
+ const int damping, const enum CdefEdgeFlags edges,
+ const ptrdiff_t tmp_stride, uint16_t *tmp)
+{
+ const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
+ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
+ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
+ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
+ };
+ const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+
+ const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
+ const int off1 = cdef_directions[dir][0];
+ const int off1_1 = cdef_directions[dir][1];
+
+ const int off2 = cdef_directions[(dir + 2) & 7][0];
+ const int off3 = cdef_directions[(dir + 6) & 7][0];
+
+ const int off2_1 = cdef_directions[(dir + 2) & 7][1];
+ const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+
+ copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+
+ for (int y = 0; y < h; y++) {
+ LOAD_PIX(tmp)
+
+ // Primary pass
+ LOAD_DIR(p, tmp, off1, off1_1)
+
+ CONSTRAIN(p, pri_strength)
+
+ MIN_MAX(p)
+
+ PRI_0(p)
+ PRI_1(p)
+
+ UPDATE_SUM(p)
+
+ // Secondary pass 1
+ LOAD_DIR(s, tmp, off2, off3)
+
+ CONSTRAIN(s, sec_strength)
+
+ MIN_MAX(s)
+
+ SEC_0(s)
+
+ UPDATE_SUM(s)
+
+ // Secondary pass 2
+ LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+ CONSTRAIN(s2, sec_strength)
+
+ MIN_MAX(s2)
+
+ UPDATE_SUM(s2)
+
+ // Store
+ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
+ bias = vec_sub(vec_splat_s16(8), bias);
+ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
+ i16x8 vdst = vec_max(vec_min(unclamped, max), min);
+
+ dst[0] = vdst[0];
+ dst[1] = vdst[1];
+ dst[2] = vdst[2];
+ dst[3] = vdst[3];
+ dst[4] = vdst[4];
+ dst[5] = vdst[5];
+ dst[6] = vdst[6];
+ dst[7] = vdst[7];
+
+ tmp += tmp_stride;
+ dst += PXSTRIDE(dst_stride);
+ }
+
+}
+
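+// The temporary buffer holds 12 rows of tmp_stride 16-bit samples (up to 8
+// rows of the block plus 2 rows of padding above and below); tmp points two
+// rows and two columns into it so the padding is reachable with negative
+// offsets.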
+#define cdef_fn(w, h, tmp_stride) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges) \
+{ \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
+ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
+ filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+ sec_strength, dir, damping, edges, tmp_stride, tmp); \
+}
+
+cdef_fn(4, 4, 8);
+cdef_fn(4, 8, 8);
+cdef_fn(8, 8, 16);
+#endif
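
The cdef_fn macro above generates one thin wrapper per block size; as a reading aid, here is a hand expansion of cdef_fn(4, 4, 8) (illustrative only, not part of the upstream file; the other two instantiations differ only in the w/h/tmp_stride constants):

    void dav1d_cdef_filter_4x4_vsx(pixel *const dst, const ptrdiff_t dst_stride,
                                   const pixel (*left)[2], const pixel *const top,
                                   const pixel *const bottom,
                                   const int pri_strength, const int sec_strength,
                                   const int dir, const int damping,
                                   const enum CdefEdgeFlags edges)
    {
        ALIGN_STK_16(uint16_t, tmp_buf, 12 * 8 + 8,);  // 12 rows at tmp_stride 8
        uint16_t *tmp = tmp_buf + 2 * 8 + 2;           // skip the 2-pixel border
        filter_4xN(dst, dst_stride, left, top, bottom, 4, 4, pri_strength,
                   sec_strength, dir, damping, edges, 8, tmp);
    }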
diff --git a/third_party/dav1d/src/ppc/cpu.c b/third_party/dav1d/src/ppc/cpu.c
new file mode 100644
index 0000000000..fe77057c57
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cpu.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+
+#include "src/ppc/cpu.h"
+
+#if (defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)) && ARCH_PPC64LE
+#include <sys/auxv.h>
+#define HAVE_AUX
+#endif
+
+COLD unsigned dav1d_get_cpu_flags_ppc(void) {
+ unsigned flags = 0;
+#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
+ unsigned long hw_cap = getauxval(AT_HWCAP);
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE
+ unsigned long hw_cap = 0;
+ elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+#endif
+#ifdef HAVE_AUX
+ flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0;
+#endif
+ return flags;
+}
diff --git a/third_party/dav1d/src/ppc/cpu.h b/third_party/dav1d/src/ppc/cpu.h
new file mode 100644
index 0000000000..cfd2ff4ff5
--- /dev/null
+++ b/third_party/dav1d/src/ppc/cpu.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PPC_CPU_H
+#define DAV1D_SRC_PPC_CPU_H
+
+enum CpuFlags {
+ DAV1D_PPC_CPU_FLAG_VSX = 1 << 0,
+};
+
+unsigned dav1d_get_cpu_flags_ppc(void);
+
+#endif /* DAV1D_SRC_PPC_CPU_H */
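
A minimal usage sketch for the CPU-flag pieces above (hypothetical caller, not from the patch; production code normally goes through the generic dav1d_get_cpu_flags() wrapper in src/cpu.h, as the loop-restoration header later in this patch shows):

    #include "src/ppc/cpu.h"

    static int have_vsx(void) {
        // Probes AT_HWCAP via dav1d_get_cpu_flags_ppc() and tests the VSX bit.
        return !!(dav1d_get_cpu_flags_ppc() & DAV1D_PPC_CPU_FLAG_VSX);
    }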
diff --git a/third_party/dav1d/src/ppc/dav1d_types.h b/third_party/dav1d/src/ppc/dav1d_types.h
new file mode 100644
index 0000000000..0b4bd72f0e
--- /dev/null
+++ b/third_party/dav1d/src/ppc/dav1d_types.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PPC_TYPES_H
+#define DAV1D_SRC_PPC_TYPES_H
+
+#include <altivec.h>
+#undef pixel
+
+#define u8x16 vector unsigned char
+#define i8x16 vector signed char
+#define b8x16 vector bool char
+#define u16x8 vector unsigned short
+#define i16x8 vector signed short
+#define b16x8 vector bool short
+#define u32x4 vector unsigned int
+#define i32x4 vector signed int
+#define b32x4 vector bool int
+#define u64x2 vector unsigned long long
+#define i64x2 vector signed long long
+#define b64x2 vector bool long long
+
+#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
+#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
+#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
+#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v))
+#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))
+#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v))
+
+#endif /* DAV1D_SRC_PPC_TYPES_H */
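
A short sketch of how the widening helpers above are meant to be used (hypothetical standalone function, not from the patch): split the 16 bytes of a u8x16 into two zero-extended u16x8 halves and combine them.

    #include "src/ppc/dav1d_types.h"

    static u16x8 widen_and_add(const u8x16 v) {
        const u16x8 a = u8h_to_u16(v);  // one 8-byte half, zero-extended to 16 bit
        const u16x8 b = u8l_to_u16(v);  // the other 8-byte half, zero-extended
        return vec_add(a, b);           // element-wise 16-bit sum
    }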
diff --git a/third_party/dav1d/src/ppc/looprestoration.h b/third_party/dav1d/src/ppc/looprestoration.h
new file mode 100644
index 0000000000..3fe16318bd
--- /dev/null
+++ b/third_party/dav1d/src/ppc/looprestoration.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/intops.h"
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+ c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_vsx;
+#endif
+}
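
A hedged sketch of driving this init helper (hypothetical call site, not from the patch; in the real tree the generic loop-restoration init is what dispatches here). Note that the bpc argument is accepted but unused above: the assignment is gated on the BITDEPTH template macro instead.

    #include "src/ppc/looprestoration.h"

    static void lr_init_8bpc(Dav1dLoopRestorationDSPContext *const c) {
        // Overrides c->wiener[0]/[1] with the VSX routine only when the CPU
        // reports VSX and this translation unit is built with BITDEPTH == 8.
        loop_restoration_dsp_init_ppc(c, 8);
    }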
diff --git a/third_party/dav1d/src/ppc/looprestoration_tmpl.c b/third_party/dav1d/src/ppc/looprestoration_tmpl.c
new file mode 100644
index 0000000000..c0c64e1800
--- /dev/null
+++ b/third_party/dav1d/src/ppc/looprestoration_tmpl.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/ppc/dav1d_types.h"
+#include "src/ppc/looprestoration.h"
+
+#if BITDEPTH == 8
+
+#define REST_UNIT_STRIDE (400)
+
+static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
+ v = vec_max(minv, v);
+ v = vec_min(maxv, v);
+ return v;
+}
+
+#define APPLY_FILTER_H(v, f, ssum1, ssum2) do { \
+ i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \
+ i16x8 ktmp_u16_low = (i16x8) u8l_to_u16(v); \
+ ssum1 = vec_madd(ktmp_u16_high, f, ssum1); \
+ ssum2 = vec_madd(ktmp_u16_low, f, ssum2); \
+} while (0)
+
+static void wiener_filter_h_vsx(int32_t *hor_ptr,
+ uint8_t *tmp_ptr,
+ const int16_t filterh[8],
+ const int w, const int h)
+{
+ const i32x4 zerov = vec_splats(0);
+ const i32x4 seven_vec = vec_splats(7);
+ const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
+ const i32x4 round_bits_vec = vec_splats(3);
+ const i32x4 rounding_off_vec = vec_splats(1<<2);
+ const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);
+
+ i16x8 filterhvall = vec_vsx_ld(0, filterh);
+    i16x8 filterhv0 = vec_splat(filterhvall, 0);
+    i16x8 filterhv1 = vec_splat(filterhvall, 1);
+    i16x8 filterhv2 = vec_splat(filterhvall, 2);
+    i16x8 filterhv3 = vec_splat(filterhvall, 3);
+    i16x8 filterhv4 = vec_splat(filterhvall, 4);
+    i16x8 filterhv5 = vec_splat(filterhvall, 5);
+    i16x8 filterhv6 = vec_splat(filterhvall, 6);
+
+ for (int j = 0; j < h + 6; j++) {
+        for (int i = 0; i < w; i += 16) {
+ i32x4 sum1 = bitdepth_added_vec;
+ i32x4 sum2 = bitdepth_added_vec;
+ i32x4 sum3 = bitdepth_added_vec;
+ i32x4 sum4 = bitdepth_added_vec;
+
+ u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
+ u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);
+
+ u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15);
+ u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14);
+ u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13);
+ u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12);
+ u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11);
+ u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10);
+
+ u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
+ u16x8 tmp_u16_low = u8l_to_u16(tmp_v3);
+
+ i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
+ i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
+ i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
+ i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);
+
+ i16x8 ssum1 = (i16x8) zerov;
+ i16x8 ssum2 = (i16x8) zerov;
+
+ APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
+ APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);
+
+ sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
+ sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
+ sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
+ sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);
+
+ sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
+ sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
+ sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
+ sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;
+
+ sum1 = iclip_vec(sum1, zerov, clip_limit_v);
+ sum2 = iclip_vec(sum2, zerov, clip_limit_v);
+ sum3 = iclip_vec(sum3, zerov, clip_limit_v);
+ sum4 = iclip_vec(sum4, zerov, clip_limit_v);
+
+ vec_st(sum1, 0, &hor_ptr[i]);
+ vec_st(sum2, 16, &hor_ptr[i]);
+ vec_st(sum3, 32, &hor_ptr[i]);
+ vec_st(sum4, 48, &hor_ptr[i]);
+ }
+ tmp_ptr += REST_UNIT_STRIDE;
+ hor_ptr += REST_UNIT_STRIDE;
+ }
+}
+
+static inline i16x8 iclip_u8_vec(i16x8 v) {
+ const i16x8 zerov = vec_splats((int16_t)0);
+ const i16x8 maxv = vec_splats((int16_t)255);
+ v = vec_max(zerov, v);
+ v = vec_min(maxv, v);
+ return v;
+}
+
+#define APPLY_FILTER_V(index, f) do { \
+ i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v3 = vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
+ sum1 = sum1 + v1 * f; \
+ sum2 = sum2 + v2 * f; \
+ sum3 = sum3 + v3 * f; \
+ sum4 = sum4 + v4 * f; \
+} while (0)
+
+#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
+ i32x4 sum1 = round_vec; \
+ i32x4 sum2 = round_vec; \
+ i32x4 sum3 = round_vec; \
+ i32x4 sum4 = round_vec; \
+ APPLY_FILTER_V(0, filterv0); \
+ APPLY_FILTER_V(1, filterv1); \
+ APPLY_FILTER_V(2, filterv2); \
+ APPLY_FILTER_V(3, filterv3); \
+ APPLY_FILTER_V(4, filterv4); \
+ APPLY_FILTER_V(5, filterv5); \
+ APPLY_FILTER_V(6, filterv6); \
+ sum1 = sum1 >> round_bits_vec; \
+ sum2 = sum2 >> round_bits_vec; \
+ sum3 = sum3 >> round_bits_vec; \
+ sum4 = sum4 >> round_bits_vec; \
+ i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \
+ i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \
+ sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
+ sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
+ sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \
+} while (0)
+
+static inline void wiener_filter_v_vsx(uint8_t *p,
+ const ptrdiff_t stride,
+ const int32_t *hor,
+ const int16_t filterv[8],
+ const int w, const int h)
+{
+ const i32x4 round_bits_vec = vec_splats(11);
+ const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18));
+
+ i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
+ i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
+ i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
+ i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
+ i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
+ i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
+ i32x4 filterv6 = vec_splats((int32_t) filterv[6]);
+
+ for (int j = 0; j < h; j++) {
+        for (int i = 0; i < (w - w % 16); i += 16) {
+ u8x16 sum_pixel;
+ LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+ vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(stride) + i]);
+ }
+ // remaining loop
+        if (w & 0xf) {
+            int i = w - w % 16;
+ ALIGN_STK_16(uint8_t, tmp_out, 16,);
+ u8x16 sum_pixel;
+
+ LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
+ vec_vsx_st(sum_pixel, 0, tmp_out);
+
+            for (int k = 0; i < w; i++, k++) {
+ p[j * PXSTRIDE(stride) + i] = tmp_out[k];
+ }
+ }
+ }
+}
+
+static inline void padding(uint8_t *dst, const uint8_t *p,
+ const ptrdiff_t stride, const uint8_t (*left)[4],
+ const uint8_t *lpf, int unit_w, const int stripe_h,
+ const enum LrEdgeFlags edges)
+{
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ uint8_t *dst_l = dst + 3 * !have_left;
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const uint8_t *const above_1 = lpf;
+ const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
+ const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+ } else {
+ // Pad with last row
+ const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(stride);
+ }
+
+ if (!have_right) {
+ uint8_t *pad = dst_l + unit_w;
+ uint8_t *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE;
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3);
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last tops are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+{
+ const int16_t (*const filter)[8] = params->filter;
+
+    // Wiener filtering is applied to a maximum stripe height of 64 rows,
+    // plus 3 rows of padding above and 3 rows below (70 rows in total).
+ ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
+ padding(tmp, p, stride, left, lpf, w, h, edges);
+ ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
+
+ wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
+ wiener_filter_v_vsx(p, stride, hor, filter[1], w, h);
+}
+#endif
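
As a reading aid for the vector arithmetic in wiener_filter_h_vsx above, a scalar sketch of what one horizontal output amounts to at 8 bpc (hypothetical reference code mirroring the constants used there: the 1 << 14 offset, the implicit 128x center-tap term added via the shift by seven_vec, 3-bit rounding, and the 13-bit intermediate clamp):

    #include <stdint.h>

    static int wiener_h_one_px(const uint8_t *const src, const int16_t filter[8]) {
        int sum = 1 << 14;                   // bitdepth_added_vec
        for (int k = 0; k < 7; k++)
            sum += filter[k] * src[k];       // the 7-tap horizontal filter
        sum += src[3] << 7;                  // center pixel term (tmp_expanded << 7)
        sum = (sum + (1 << 2)) >> 3;         // rounding_off_vec, round_bits_vec
        if (sum < 0) sum = 0;                // clamp to [0, (1 << 13) - 1]
        if (sum > (1 << 13) - 1) sum = (1 << 13) - 1;
        return sum;
    }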
diff --git a/third_party/dav1d/src/qm.c b/third_party/dav1d/src/qm.c
new file mode 100644
index 0000000000..e2e0d61874
--- /dev/null
+++ b/third_party/dav1d/src/qm.c
@@ -0,0 +1,3148 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/qm.h"
+
+static const uint8_t qm_tbl_4x4_t[][2][10] = {
+ {
+ {
+ 32,
+ 43, 67,
+ 73, 94, 137,
+ 97, 110, 150, 200,
+ }, {
+ 35,
+ 46, 60,
+ 57, 69, 90,
+ 66, 71, 90, 109,
+ },
+ }, {
+ {
+ 32,
+ 41, 63,
+ 69, 88, 127,
+ 92, 103, 140, 184,
+ }, {
+ 33,
+ 45, 58,
+ 56, 66, 86,
+ 64, 69, 87, 105,
+ },
+ }, {
+ {
+ 32,
+ 38, 56,
+ 63, 78, 113,
+ 86, 97, 130, 169,
+ }, {
+ 32,
+ 45, 55,
+ 53, 62, 80,
+ 63, 67, 84, 101,
+ },
+ }, {
+ {
+ 32,
+ 37, 54,
+ 58, 72, 102,
+ 81, 91, 121, 156,
+ }, {
+ 32,
+ 45, 54,
+ 51, 59, 75,
+ 61, 65, 81, 97,
+ },
+ }, {
+ {
+ 32,
+ 34, 49,
+ 53, 64, 91,
+ 75, 81, 112, 140,
+ }, {
+ 32,
+ 46, 53,
+ 49, 55, 70,
+ 58, 62, 78, 91,
+ },
+ }, {
+ {
+ 32,
+ 34, 48,
+ 49, 60, 82,
+ 72, 79, 104, 134,
+ }, {
+ 32,
+ 46, 53,
+ 47, 54, 66,
+ 57, 60, 75, 89,
+ },
+ }, {
+ {
+ 32,
+ 33, 39,
+ 45, 51, 71,
+ 62, 64, 87, 108,
+ }, {
+ 31,
+ 42, 48,
+ 47, 50, 61,
+ 53, 54, 67, 78,
+ },
+ }, {
+ {
+ 32,
+ 33, 38,
+ 42, 46, 63,
+ 55, 57, 75, 92,
+ }, {
+ 31,
+ 41, 48,
+ 46, 48, 58,
+ 51, 51, 62, 71,
+ },
+ }, {
+ {
+ 32,
+ 32, 35,
+ 38, 40, 54,
+ 51, 49, 64, 81,
+ }, {
+ 31,
+ 38, 47,
+ 47, 46, 54,
+ 49, 46, 57, 66,
+ },
+ }, {
+ {
+ 32,
+ 32, 34,
+ 35, 37, 48,
+ 43, 43, 54, 65,
+ }, {
+ 31,
+ 37, 44,
+ 47, 47, 53,
+ 47, 45, 53, 59,
+ },
+ }, {
+ {
+ 32,
+ 32, 33,
+ 34, 35, 39,
+ 38, 39, 45, 54,
+ }, {
+ 31,
+ 34, 39,
+ 42, 45, 48,
+ 47, 46, 49, 54,
+ },
+ }, {
+ {
+ 32,
+ 32, 32,
+ 32, 33, 35,
+ 35, 35, 38, 46,
+ }, {
+ 31,
+ 32, 34,
+ 38, 41, 47,
+ 46, 46, 47, 52,
+ },
+ }, {
+ {
+ 31,
+ 32, 32,
+ 32, 32, 33,
+ 32, 33, 34, 35,
+ }, {
+ 31,
+ 31, 32,
+ 34, 35, 39,
+ 38, 40, 43, 47,
+ },
+ }, {
+ {
+ 31,
+ 31, 32,
+ 31, 32, 32,
+ 32, 32, 32, 33,
+ }, {
+ 31,
+ 31, 31,
+ 31, 31, 32,
+ 34, 35, 35, 39,
+ },
+ }, {
+ {
+ 31,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ }, {
+ 31,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ },
+ },
+};
+
+static const uint8_t qm_tbl_8x4[][2][32] = {
+ {
+ {
+ 32, 33, 37, 49, 65, 80, 91, 104,
+ 42, 42, 58, 71, 84, 97, 100, 112,
+ 75, 69, 84, 103, 125, 142, 145, 146,
+ 91, 86, 91, 110, 128, 152, 178, 190,
+ }, {
+ 31, 40, 46, 48, 54, 61, 64, 68,
+ 47, 45, 56, 61, 65, 69, 68, 71,
+ 60, 54, 64, 75, 85, 92, 90, 87,
+ 66, 61, 64, 73, 82, 92, 102, 105,
+ },
+ }, {
+ {
+ 32, 33, 36, 46, 60, 75, 86, 98,
+ 42, 42, 56, 67, 79, 92, 95, 105,
+ 69, 64, 77, 93, 112, 130, 136, 136,
+ 88, 83, 88, 105, 122, 144, 167, 177,
+ }, {
+ 31, 40, 46, 47, 52, 59, 63, 66,
+ 47, 45, 55, 60, 64, 68, 66, 69,
+ 57, 52, 61, 70, 79, 87, 88, 85,
+ 65, 61, 63, 72, 81, 90, 99, 102,
+ },
+ }, {
+ {
+ 32, 32, 34, 44, 54, 72, 82, 92,
+ 38, 40, 51, 61, 69, 84, 89, 98,
+ 62, 58, 68, 85, 98, 118, 129, 127,
+ 86, 80, 85, 101, 117, 136, 157, 165,
+ }, {
+ 31, 38, 46, 46, 50, 57, 61, 65,
+ 47, 46, 53, 56, 59, 64, 65, 67,
+ 54, 50, 57, 66, 74, 82, 85, 82,
+ 64, 60, 62, 71, 79, 88, 97, 99,
+ },
+ }, {
+ {
+ 32, 32, 34, 41, 51, 65, 75, 86,
+ 35, 36, 47, 53, 61, 73, 81, 92,
+ 59, 57, 65, 78, 92, 108, 117, 119,
+ 83, 78, 82, 97, 111, 129, 148, 154,
+ }, {
+ 31, 36, 46, 45, 49, 54, 59, 63,
+ 47, 47, 52, 53, 55, 58, 61, 65,
+ 53, 50, 55, 63, 71, 77, 81, 80,
+ 63, 59, 61, 70, 77, 86, 94, 95,
+ },
+ }, {
+ {
+ 32, 32, 34, 38, 48, 60, 72, 81,
+ 35, 36, 42, 51, 59, 68, 79, 86,
+ 51, 50, 54, 67, 80, 92, 104, 112,
+ 77, 72, 75, 87, 103, 119, 135, 144,
+ }, {
+ 31, 36, 43, 45, 47, 52, 57, 61,
+ 47, 47, 50, 53, 54, 56, 60, 63,
+ 50, 47, 50, 58, 66, 70, 75, 77,
+ 61, 57, 58, 65, 74, 82, 90, 93,
+ },
+ }, {
+ {
+ 32, 32, 34, 37, 45, 54, 65, 75,
+ 35, 36, 42, 50, 56, 63, 73, 81,
+ 51, 50, 54, 65, 76, 87, 97, 106,
+ 75, 71, 73, 84, 96, 110, 125, 136,
+ }, {
+ 31, 36, 43, 46, 46, 50, 54, 59,
+ 47, 47, 50, 53, 54, 55, 58, 61,
+ 50, 47, 50, 57, 64, 68, 72, 75,
+ 60, 56, 57, 64, 71, 78, 85, 90,
+ },
+ }, {
+ {
+ 32, 32, 33, 35, 41, 49, 57, 66,
+ 34, 34, 37, 43, 48, 54, 60, 68,
+ 43, 42, 44, 54, 64, 71, 78, 86,
+ 62, 59, 58, 68, 79, 91, 101, 111,
+ }, {
+ 31, 33, 40, 47, 45, 48, 51, 55,
+ 42, 44, 47, 50, 49, 50, 52, 55,
+ 47, 45, 46, 54, 59, 61, 63, 66,
+ 54, 51, 50, 57, 64, 70, 75, 79,
+ },
+ }, {
+ {
+ 32, 32, 32, 34, 38, 44, 50, 61,
+ 32, 33, 35, 37, 40, 45, 50, 58,
+ 42, 41, 42, 50, 58, 66, 71, 79,
+ 56, 53, 52, 59, 68, 78, 86, 97,
+ }, {
+ 31, 32, 39, 44, 46, 47, 48, 53,
+ 38, 40, 47, 47, 47, 46, 47, 50,
+ 47, 45, 45, 51, 56, 59, 61, 64,
+ 52, 49, 48, 53, 58, 64, 68, 73,
+ },
+ }, {
+ {
+ 32, 32, 32, 34, 35, 40, 46, 52,
+ 32, 33, 34, 37, 38, 42, 46, 51,
+ 37, 36, 38, 44, 49, 55, 59, 64,
+ 52, 49, 49, 54, 60, 69, 76, 83,
+ }, {
+ 31, 31, 36, 42, 47, 46, 48, 50,
+ 38, 40, 44, 47, 48, 46, 46, 48,
+ 47, 46, 47, 50, 53, 54, 55, 56,
+ 50, 48, 47, 50, 54, 60, 64, 67,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 34, 37, 42, 46,
+ 32, 33, 34, 35, 37, 40, 43, 46,
+ 35, 34, 36, 38, 43, 49, 53, 56,
+ 43, 41, 42, 42, 49, 56, 63, 67,
+ }, {
+ 31, 31, 35, 39, 43, 47, 46, 48,
+ 38, 40, 43, 47, 47, 47, 46, 46,
+ 47, 46, 47, 47, 50, 53, 53, 54,
+ 48, 45, 46, 45, 50, 55, 58, 59,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 33, 34, 37, 40,
+ 32, 32, 33, 33, 34, 36, 38, 40,
+ 34, 34, 34, 36, 38, 41, 44, 46,
+ 39, 38, 38, 40, 42, 47, 52, 56,
+ }, {
+ 31, 31, 33, 36, 40, 45, 47, 47,
+ 34, 35, 37, 41, 44, 46, 47, 46,
+ 42, 42, 44, 46, 48, 49, 50, 49,
+ 48, 46, 46, 46, 48, 51, 54, 55,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 32, 33, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34,
+ 32, 32, 33, 34, 35, 36, 37, 38,
+ 35, 35, 34, 36, 38, 40, 42, 48,
+ }, {
+ 31, 31, 31, 34, 37, 39, 42, 48,
+ 31, 31, 32, 36, 39, 41, 43, 46,
+ 37, 38, 40, 43, 46, 47, 47, 48,
+ 48, 47, 46, 47, 47, 48, 50, 53,
+ },
+ }, {
+ {
+ 31, 31, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 33, 34, 34, 35, 36,
+ }, {
+ 31, 31, 31, 31, 34, 35, 38, 41,
+ 31, 31, 32, 32, 36, 37, 40, 42,
+ 35, 36, 37, 37, 40, 42, 45, 45,
+ 37, 38, 39, 40, 43, 44, 47, 47,
+ },
+ }, {
+ {
+ 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33,
+ }, {
+ 31, 31, 31, 31, 31, 31, 34, 34,
+ 31, 31, 31, 32, 32, 33, 36, 36,
+ 31, 31, 31, 32, 32, 33, 36, 36,
+ 34, 35, 35, 36, 36, 37, 40, 40,
+ },
+ }, {
+ {
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ }, {
+ 31, 31, 31, 31, 31, 31, 31, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32,
+ },
+ },
+};
+
+static const uint8_t qm_tbl_8x8_t[][2][36] = {
+ {
+ {
+ 32,
+ 32, 35,
+ 38, 40, 54,
+ 51, 49, 65, 82,
+ 68, 63, 78, 97, 117,
+ 84, 76, 91, 111, 134, 152,
+ 95, 89, 98, 113, 138, 159, 183,
+ 109, 102, 106, 121, 142, 168, 199, 220,
+ }, {
+ 31,
+ 38, 47,
+ 47, 46, 54,
+ 50, 47, 57, 66,
+ 57, 52, 61, 72, 82,
+ 63, 57, 66, 77, 88, 96,
+ 67, 62, 67, 75, 86, 95, 104,
+ 71, 67, 68, 75, 84, 95, 107, 113,
+ },
+ }, {
+ {
+ 32,
+ 32, 35,
+ 37, 39, 51,
+ 47, 46, 60, 73,
+ 62, 58, 71, 87, 105,
+ 78, 72, 84, 100, 121, 140,
+ 90, 84, 93, 106, 129, 148, 169,
+ 102, 96, 100, 113, 132, 155, 183, 201,
+ }, {
+ 31,
+ 38, 47,
+ 47, 47, 53,
+ 48, 46, 55, 62,
+ 54, 50, 58, 67, 76,
+ 61, 55, 63, 72, 83, 91,
+ 66, 61, 65, 73, 84, 92, 101,
+ 69, 65, 66, 73, 82, 92, 103, 109,
+ },
+ }, {
+ {
+ 32,
+ 32, 34,
+ 35, 37, 48,
+ 46, 45, 56, 70,
+ 57, 54, 64, 80, 93,
+ 76, 70, 79, 96, 111, 134,
+ 85, 79, 87, 100, 121, 138, 156,
+ 96, 90, 93, 105, 122, 144, 168, 184,
+ }, {
+ 31,
+ 36, 43,
+ 47, 47, 53,
+ 48, 46, 54, 61,
+ 52, 49, 55, 65, 71,
+ 60, 55, 60, 70, 78, 89,
+ 64, 59, 63, 71, 81, 89, 97,
+ 67, 63, 64, 71, 79, 89, 99, 104,
+ },
+ }, {
+ {
+ 32,
+ 32, 33,
+ 35, 36, 46,
+ 42, 42, 52, 63,
+ 53, 51, 60, 73, 86,
+ 68, 64, 72, 84, 100, 117,
+ 78, 74, 80, 92, 109, 128, 140,
+ 90, 84, 87, 98, 114, 133, 155, 168,
+ }, {
+ 31,
+ 34, 39,
+ 46, 47, 52,
+ 47, 45, 52, 58,
+ 50, 48, 54, 62, 68,
+ 57, 53, 58, 65, 73, 82,
+ 61, 57, 61, 68, 77, 86, 91,
+ 65, 61, 62, 68, 76, 86, 95, 100,
+ },
+ }, {
+ {
+ 32,
+ 32, 33,
+ 34, 35, 39,
+ 39, 40, 46, 56,
+ 50, 48, 53, 65, 78,
+ 62, 59, 63, 75, 90, 105,
+ 76, 71, 74, 86, 101, 118, 134,
+ 84, 79, 81, 92, 106, 123, 142, 153,
+ }, {
+ 31,
+ 34, 39,
+ 42, 45, 48,
+ 47, 46, 49, 55,
+ 49, 47, 50, 58, 65,
+ 54, 51, 53, 61, 69, 76,
+ 60, 56, 57, 65, 73, 82, 89,
+ 64, 59, 60, 66, 74, 83, 92, 96,
+ },
+ }, {
+ {
+ 32,
+ 32, 33,
+ 34, 35, 39,
+ 38, 39, 45, 54,
+ 46, 45, 51, 61, 71,
+ 56, 54, 58, 69, 80, 92,
+ 68, 64, 68, 78, 90, 103, 117,
+ 78, 74, 76, 86, 99, 113, 128, 140,
+ }, {
+ 31,
+ 34, 39,
+ 42, 45, 48,
+ 47, 46, 49, 54,
+ 48, 46, 50, 56, 61,
+ 52, 49, 52, 58, 65, 71,
+ 57, 53, 55, 61, 68, 75, 82,
+ 61, 57, 58, 64, 71, 79, 86, 91,
+ },
+ }, {
+ {
+ 31,
+ 32, 32,
+ 32, 33, 35,
+ 35, 35, 38, 48,
+ 42, 41, 43, 54, 63,
+ 51, 49, 49, 59, 71, 81,
+ 59, 56, 56, 66, 77, 89, 98,
+ 69, 65, 64, 73, 85, 97, 108, 119,
+ }, {
+ 31,
+ 32, 35,
+ 38, 42, 47,
+ 48, 47, 48, 53,
+ 47, 45, 45, 53, 58,
+ 50, 47, 47, 54, 61, 66,
+ 53, 50, 49, 56, 63, 69, 73,
+ 57, 54, 52, 58, 65, 72, 77, 82,
+ },
+ }, {
+ {
+ 31,
+ 32, 32,
+ 32, 32, 35,
+ 34, 34, 37, 42,
+ 38, 37, 40, 47, 54,
+ 46, 44, 45, 52, 60, 69,
+ 52, 49, 49, 56, 65, 75, 82,
+ 63, 59, 58, 65, 73, 84, 92, 105,
+ }, {
+ 31,
+ 31, 32,
+ 38, 40, 47,
+ 44, 44, 47, 50,
+ 47, 45, 46, 51, 54,
+ 48, 46, 46, 51, 56, 61,
+ 50, 47, 47, 52, 57, 63, 66,
+ 55, 52, 50, 54, 60, 66, 70, 76,
+ },
+ }, {
+ {
+ 31,
+ 32, 32,
+ 32, 32, 34,
+ 34, 33, 35, 39,
+ 35, 34, 37, 42, 48,
+ 41, 40, 41, 47, 53, 60,
+ 47, 44, 45, 51, 57, 65, 71,
+ 53, 50, 51, 55, 61, 70, 77, 85,
+ }, {
+ 31,
+ 31, 32,
+ 35, 36, 41,
+ 42, 42, 45, 48,
+ 48, 46, 47, 50, 53,
+ 47, 45, 45, 49, 53, 57,
+ 49, 46, 46, 50, 54, 59, 61,
+ 51, 48, 48, 51, 54, 60, 64, 68,
+ },
+ }, {
+ {
+ 31,
+ 31, 32,
+ 32, 32, 33,
+ 32, 32, 34, 35,
+ 34, 34, 35, 37, 41,
+ 37, 36, 38, 39, 45, 51,
+ 43, 41, 42, 42, 49, 56, 63,
+ 47, 44, 45, 46, 52, 59, 67, 71,
+ }, {
+ 31,
+ 31, 32,
+ 34, 35, 39,
+ 37, 40, 43, 47,
+ 43, 43, 45, 47, 49,
+ 48, 46, 46, 47, 50, 53,
+ 47, 45, 45, 45, 50, 55, 58,
+ 49, 46, 46, 46, 50, 55, 60, 61,
+ },
+ }, {
+ {
+ 31,
+ 31, 32,
+ 32, 32, 32,
+ 32, 32, 33, 34,
+ 33, 33, 34, 35, 37,
+ 34, 34, 35, 36, 39, 43,
+ 37, 36, 37, 38, 41, 46, 51,
+ 41, 39, 40, 41, 44, 49, 54, 58,
+ }, {
+ 31,
+ 31, 31,
+ 32, 33, 35,
+ 35, 37, 39, 43,
+ 39, 41, 42, 45, 47,
+ 45, 44, 45, 47, 48, 50,
+ 48, 46, 46, 47, 48, 51, 53,
+ 48, 46, 45, 46, 47, 51, 54, 56,
+ },
+ }, {
+ {
+ 31,
+ 31, 32,
+ 31, 32, 32,
+ 32, 32, 32, 33,
+ 32, 32, 32, 34, 35,
+ 32, 33, 33, 34, 35, 36,
+ 34, 34, 33, 35, 36, 38, 39,
+ 35, 35, 34, 36, 38, 40, 42, 48,
+ }, {
+ 31,
+ 31, 31,
+ 30, 31, 32,
+ 34, 34, 35, 39,
+ 36, 37, 39, 42, 46,
+ 39, 40, 41, 44, 47, 47,
+ 42, 42, 42, 45, 47, 48, 48,
+ 48, 47, 46, 47, 47, 49, 50, 53,
+ },
+ }, {
+ {
+ 31,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 32, 32, 32, 32, 33,
+ 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 34, 34, 35,
+ 33, 33, 33, 33, 35, 35, 36, 38,
+ }, {
+ 31,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 32,
+ 34, 34, 35, 35, 39,
+ 35, 35, 36, 36, 40, 41,
+ 37, 38, 39, 40, 43, 44, 47,
+ 40, 41, 41, 42, 44, 45, 47, 48,
+ },
+ }, {
+ {
+ 31,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33,
+ }, {
+ 31,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 30, 31, 31, 31, 32,
+ 31, 32, 32, 32, 32, 33,
+ 33, 34, 34, 35, 35, 36, 39,
+ 33, 34, 34, 35, 35, 36, 39, 39,
+ },
+ }, {
+ {
+ 31,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 32, 32,
+ 31, 31, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ }, {
+ 31,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 31, 31,
+ },
+ },
+};
+
+static const uint8_t qm_tbl_16x4[][2][64] = {
+ {
+ {
+ 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108,
+ 44, 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115,
+ 79, 72, 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151,
+ 96, 90, 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197,
+ }, {
+ 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69,
+ 49, 45, 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72,
+ 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88,
+ 69, 65, 62, 60, 63, 66, 70, 74, 80, 85, 91, 96, 101, 103, 105, 107,
+ },
+ }, {
+ {
+ 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101,
+ 44, 41, 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108,
+ 73, 67, 65, 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141,
+ 93, 87, 83, 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183,
+ }, {
+ 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67,
+ 49, 45, 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70,
+ 61, 55, 54, 54, 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86,
+ 69, 64, 61, 59, 62, 65, 68, 73, 78, 84, 89, 93, 98, 100, 102, 103,
+ },
+ }, {
+ {
+ 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94,
+ 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101,
+ 65, 60, 59, 58, 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131,
+ 90, 84, 81, 78, 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170,
+ }, {
+ 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65,
+ 48, 46, 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68,
+ 57, 53, 51, 50, 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83,
+ 68, 63, 60, 58, 61, 64, 67, 71, 77, 82, 87, 91, 95, 97, 99, 100,
+ },
+ }, {
+ {
+ 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88,
+ 36, 35, 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94,
+ 62, 58, 57, 56, 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123,
+ 88, 82, 79, 76, 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159,
+ }, {
+ 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63,
+ 48, 46, 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66,
+ 56, 52, 50, 49, 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81,
+ 67, 62, 60, 57, 60, 63, 66, 70, 75, 80, 85, 89, 93, 94, 96, 97,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83,
+ 36, 35, 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88,
+ 53, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114,
+ 81, 76, 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148,
+ }, {
+ 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62,
+ 48, 47, 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64,
+ 52, 49, 48, 47, 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78,
+ 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75,
+ 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81,
+ 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106,
+ 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
+ }, {
+ 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59,
+ 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61,
+ 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75,
+ 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73,
+ 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74,
+ 44, 43, 41, 43, 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91,
+ 65, 62, 59, 59, 58, 63, 67, 71, 76, 81, 85, 92, 98, 105, 111, 118,
+ }, {
+ 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58,
+ 42, 42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58,
+ 49, 47, 45, 46, 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68,
+ 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 71, 73, 76, 79, 82,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61,
+ 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58,
+ 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79,
+ 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97,
+ }, {
+ 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53,
+ 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50,
+ 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64,
+ 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55,
+ 32, 32, 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54,
+ 38, 37, 36, 36, 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67,
+ 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
+ }, {
+ 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51,
+ 37, 38, 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49,
+ 48, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57,
+ 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46,
+ 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46,
+ 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56,
+ 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
+ }, {
+ 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48,
+ 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46,
+ 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54,
+ 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
+ },
+ }, {
+ {
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48,
+ 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 51, 51, 54, 54, 58,
+ }, {
+ 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47,
+ 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45,
+ 42, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49,
+ 48, 47, 47, 45, 45, 46, 46, 46, 46, 50, 50, 53, 53, 54, 54, 56,
+ },
+ }, {
+ {
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
+ }, {
+ 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48,
+ 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
+ },
+ }, {
+ {
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
+ }, {
+ 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43,
+ 35, 35, 36, 36, 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
+ },
+ }, {
+ {
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ }, {
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 39, 40, 40, 40,
+ },
+ }, {
+ {
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ }, {
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ },
+ },
+};
+
+static const uint8_t qm_tbl_16x8[][2][128] = {
+ {
+ {
+ 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118,
+ 32, 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107,
+ 36, 34, 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105,
+ 53, 49, 50, 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118,
+ 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136,
+ 87, 78, 77, 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157,
+ 93, 86, 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182,
+ 99, 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203,
+ }, {
+ 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76,
+ 37, 40, 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69,
+ 48, 46, 47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67,
+ 52, 48, 47, 50, 54, 61, 64, 68, 70, 75, 75, 74, 73, 75, 74, 73,
+ 57, 52, 51, 53, 57, 64, 67, 73, 76, 82, 83, 86, 83, 83, 84, 82,
+ 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, 93, 94, 96, 92, 94, 91,
+ 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, 99, 101, 103, 101,
+ 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, 104, 106, 109,
+ },
+ }, {
+ {
+ 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110,
+ 32, 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100,
+ 36, 35, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98,
+ 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111,
+ 65, 60, 59, 60, 68, 73, 84, 92, 100, 111, 118, 124, 121, 124, 129, 127,
+ 79, 72, 71, 71, 78, 84, 95, 103, 113, 125, 133, 140, 148, 141, 151, 147,
+ 90, 84, 80, 78, 83, 91, 101, 108, 116, 129, 142, 153, 157, 163, 171, 169,
+ 96, 90, 87, 85, 87, 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188,
+ }, {
+ 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74,
+ 35, 38, 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67,
+ 48, 46, 47, 48, 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65,
+ 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 73, 72, 71,
+ 57, 52, 51, 51, 57, 60, 66, 71, 74, 79, 82, 84, 81, 81, 82, 79,
+ 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, 89, 91, 94, 89, 92, 89,
+ 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, 97, 98, 100, 98,
+ 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, 101, 103, 105,
+ },
+ }, {
+ {
+ 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103,
+ 32, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94,
+ 36, 35, 36, 38, 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92,
+ 44, 41, 42, 42, 50, 58, 63, 67, 74, 79, 84, 91, 96, 102, 103, 103,
+ 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 117, 121, 119,
+ 79, 73, 71, 69, 75, 84, 90, 97, 108, 118, 125, 135, 140, 133, 141, 137,
+ 88, 81, 78, 76, 81, 88, 97, 104, 111, 123, 135, 145, 148, 153, 160, 158,
+ 93, 88, 84, 82, 84, 90, 97, 105, 113, 122, 131, 141, 151, 163, 169, 175,
+ }, {
+ 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72,
+ 34, 36, 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65,
+ 48, 46, 47, 47, 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63,
+ 49, 46, 46, 45, 51, 56, 58, 60, 62, 64, 65, 68, 69, 71, 70, 69,
+ 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 80, 77,
+ 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, 85, 89, 91, 87, 89, 86,
+ 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, 94, 96, 97, 95,
+ 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, 100, 102,
+ },
+ }, {
+ {
+ 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97,
+ 31, 32, 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88,
+ 35, 34, 35, 37, 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86,
+ 44, 41, 42, 42, 48, 54, 60, 66, 71, 75, 79, 86, 92, 96, 97, 97,
+ 53, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 105, 110, 114, 111,
+ 65, 61, 59, 58, 63, 68, 76, 84, 92, 98, 105, 113, 120, 125, 132, 128,
+ 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 150, 147,
+ 90, 85, 81, 79, 81, 87, 93, 101, 108, 116, 124, 134, 142, 153, 157, 163,
+ }, {
+ 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70,
+ 33, 34, 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63,
+ 45, 45, 46, 47, 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61,
+ 49, 46, 45, 45, 49, 53, 57, 59, 61, 62, 64, 66, 68, 69, 68, 67,
+ 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 77, 75,
+ 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, 76, 80, 83, 84, 86, 83,
+ 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, 91, 93, 94, 92,
+ 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, 97, 98,
+ },
+ }, {
+ {
+ 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91,
+ 31, 32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82,
+ 33, 33, 34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80,
+ 40, 39, 38, 40, 44, 51, 54, 59, 62, 66, 70, 75, 81, 86, 90, 90,
+ 51, 49, 47, 48, 52, 58, 63, 69, 74, 79, 84, 90, 97, 102, 106, 103,
+ 65, 61, 59, 58, 62, 68, 73, 79, 85, 92, 98, 106, 113, 120, 124, 119,
+ 79, 74, 71, 69, 72, 78, 84, 90, 96, 103, 110, 119, 128, 135, 140, 137,
+ 87, 82, 79, 77, 78, 84, 89, 96, 103, 111, 118, 126, 134, 143, 147, 151,
+ }, {
+ 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68,
+ 32, 33, 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61,
+ 40, 41, 43, 46, 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59,
+ 49, 47, 46, 46, 49, 53, 54, 56, 57, 58, 59, 61, 63, 65, 66, 65,
+ 51, 49, 47, 47, 49, 54, 57, 61, 63, 65, 67, 69, 72, 73, 75, 72,
+ 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, 73, 77, 80, 82, 84, 81,
+ 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, 86, 89, 91, 89,
+ 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, 94, 95,
+ },
+ }, {
+ {
+ 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82,
+ 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75,
+ 32, 32, 33, 34, 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72,
+ 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81,
+ 44, 42, 41, 42, 42, 48, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92,
+ 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106,
+ 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121,
+ 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
+ }, {
+ 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64,
+ 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59,
+ 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56,
+ 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61,
+ 49, 47, 45, 46, 45, 49, 53, 56, 58, 59, 61, 62, 64, 65, 67, 68,
+ 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75,
+ 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, 76, 79, 82, 83,
+ 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
+ },
+ }, {
+ {
+ 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79,
+ 31, 32, 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72,
+ 32, 32, 33, 34, 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70,
+ 36, 35, 34, 36, 38, 42, 47, 49, 51, 54, 56, 60, 63, 68, 73, 79,
+ 44, 42, 41, 42, 42, 48, 52, 56, 60, 64, 67, 71, 75, 79, 84, 90,
+ 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104,
+ 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, 95, 102, 108, 115,
+ 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, 112, 119, 127,
+ }, {
+ 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63,
+ 31, 31, 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57,
+ 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55,
+ 48, 47, 46, 47, 47, 50, 52, 53, 53, 53, 54, 54, 55, 56, 58, 60,
+ 49, 47, 45, 46, 45, 49, 53, 55, 57, 58, 59, 61, 62, 64, 65, 67,
+ 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, 64, 66, 68, 70, 72, 75,
+ 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, 72, 75, 77, 80,
+ 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, 82, 86,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65,
+ 31, 32, 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60,
+ 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58,
+ 35, 35, 34, 35, 36, 37, 41, 46, 47, 49, 51, 54, 57, 60, 63, 66,
+ 39, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72,
+ 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79,
+ 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, 82, 86, 89, 92,
+ 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, 100, 105,
+ }, {
+ 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57,
+ 31, 31, 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52,
+ 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50,
+ 45, 45, 44, 46, 46, 47, 49, 52, 51, 51, 51, 52, 53, 54, 54, 55,
+ 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59,
+ 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64,
+ 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, 66, 68, 69, 70,
+ 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, 74, 76,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58,
+ 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54,
+ 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54,
+ 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55,
+ 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63,
+ 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75,
+ 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79,
+ 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
+ }, {
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54,
+ 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50,
+ 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49,
+ 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50,
+ 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62,
+ 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65,
+ 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48,
+ 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45,
+ 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45,
+ 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46,
+ 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54,
+ 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56,
+ 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
+ 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70,
+ }, {
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50,
+ 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46,
+ 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46,
+ 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46,
+ 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52,
+ 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54,
+ 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63,
+ }, {
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49,
+ 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45,
+ 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45,
+ 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46,
+ 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46,
+ 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53,
+ 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53,
+ 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
+ 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
+ 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
+ }, {
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49,
+ 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47,
+ 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46,
+ 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
+ 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52,
+ 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
+ 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38,
+ }, {
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43,
+ 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38,
+ 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ },
+ },
+};
+
+static const uint8_t qm_tbl_32x8[][2][256] = {
+ {
+ {
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
+ 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111,
+ 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107,
+ 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119,
+ 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136,
+ 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156,
+ 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179,
+ 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204,
+ }, {
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70,
+ 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67,
+ 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73,
+ 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81,
+ 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90,
+ 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99,
+ 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
+ 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104,
+ 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100,
+ 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111,
+ 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127,
+ 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145,
+ 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166,
+ 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190,
+ }, {
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68,
+ 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65,
+ 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71,
+ 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87,
+ 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96,
+ 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106,
+ 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97,
+ 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93,
+ 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104,
+ 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118,
+ 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
+ 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155,
+ 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176,
+ }, {
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73,
+ 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66,
+ 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63,
+ 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69,
+ 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76,
+ 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84,
+ 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99,
+ 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91,
+ 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87,
+ 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97,
+ 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144,
+ 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163,
+ }, {
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70,
+ 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61,
+ 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67,
+ 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90,
+ 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93,
+ 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85,
+ 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81,
+ 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90,
+ 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+ 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
+ }, {
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68,
+ 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62,
+ 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59,
+ 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65,
+ 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71,
+ 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79,
+ 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87,
+ 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+ 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+ }, {
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66,
+ 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60,
+ 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57,
+ 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63,
+ 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70,
+ 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115,
+ 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127,
+ }, {
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63,
+ 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57,
+ 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55,
+ 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60,
+ 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67,
+ 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75,
+ 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80,
+ 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62,
+ 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70,
+ 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83,
+ 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96,
+ 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ }, {
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59,
+ 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65,
+ 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72,
+ 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55,
+ 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63,
+ 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75,
+ 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79,
+ 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87,
+ }, {
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54,
+ 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50,
+ 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49,
+ 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50,
+ 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62,
+ 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65,
+ 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48,
+ 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58,
+ 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73,
+ }, {
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47,
+ 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ }, {
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46,
+ 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
+ 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39,
+ 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ },
+ },
+};
+
+static const uint8_t qm_tbl_32x16[][2][512] = {
+ {
+ {
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
+ 31, 32, 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111,
+ 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111,
+ 34, 34, 33, 34, 35, 37, 39, 41, 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, 90, 93, 97, 100, 103, 107,
+ 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107,
+ 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119,
+ 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119,
+ 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, 127, 131, 136,
+ 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136,
+ 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, 142, 141, 146, 151, 156,
+ 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156,
+ 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172, 178,
+ 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179,
+ 96, 91, 90, 87, 86, 86, 83, 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167, 173, 178, 180, 187, 188, 190, 197, 203,
+ 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204,
+ 102, 97, 97, 93, 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217,
+ }, {
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 31, 31, 32, 34, 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70,
+ 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, 66, 67,
+ 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67,
+ 49, 47, 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73,
+ 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73,
+ 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, 77, 78, 80, 81,
+ 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90,
+ 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90,
+ 67, 64, 62, 61, 60, 58, 58, 61, 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, 100, 98, 96, 96, 97, 99,
+ 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99,
+ 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, 107, 108,
+ 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108,
+ 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
+ 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104,
+ 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104,
+ 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, 88, 91, 94, 97, 100,
+ 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100,
+ 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111,
+ 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111,
+ 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126,
+ 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127,
+ 73, 69, 67, 66, 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145,
+ 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145,
+ 87, 83, 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166,
+ 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166,
+ 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, 183, 189,
+ 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190,
+ 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, 186, 192, 193, 201,
+ }, {
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68,
+ 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68,
+ 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, 64, 65,
+ 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65,
+ 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71,
+ 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71,
+ 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, 75, 76, 77, 78,
+ 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78,
+ 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87,
+ 67, 63, 61, 60, 59, 57, 57, 60, 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, 97, 95, 93, 93, 94, 96,
+ 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96,
+ 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, 105,
+ 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105,
+ 71, 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106,
+ 31, 32, 32, 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97,
+ 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97,
+ 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, 88, 90, 93,
+ 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93,
+ 39, 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104,
+ 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104,
+ 53, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 108, 106, 105, 108, 111, 114, 118,
+ 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118,
+ 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135,
+ 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
+ 81, 77, 75, 74, 72, 71, 70, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144, 146, 150, 154,
+ 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155,
+ 90, 86, 84, 82, 81, 80, 78, 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143, 148, 154, 157, 158, 164, 164, 165, 170, 175,
+ 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176,
+ 96, 91, 91, 87, 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187,
+ }, {
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73,
+ 31, 31, 32, 32, 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66,
+ 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66,
+ 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63,
+ 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63,
+ 48, 47, 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69,
+ 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69,
+ 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, 73, 74, 75, 76,
+ 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84,
+ 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84,
+ 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93, 94, 93, 90, 90, 92, 93,
+ 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93,
+ 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101,
+ 69, 66, 66, 63, 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, 87, 91, 91, 95, 96, 101, 101, 103, 103, 105,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99,
+ 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, 68, 72, 75, 75, 79, 82, 84, 86, 88, 91,
+ 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91,
+ 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, 85, 87,
+ 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87,
+ 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97,
+ 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97,
+ 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 101, 104, 107, 110,
+ 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110,
+ 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, 119, 123, 126,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+ 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, 136, 140, 144,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144,
+ 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159, 163,
+ 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163,
+ 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, 161, 162, 166, 167, 173,
+ }, {
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70,
+ 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64,
+ 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, 60, 61,
+ 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61,
+ 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67,
+ 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, 71, 72, 73, 74,
+ 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74,
+ 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82,
+ 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 90, 88, 88, 89, 90,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90,
+ 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98,
+ 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98,
+ 68, 65, 65, 62, 62, 60, 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, 89, 93, 93, 97, 98, 99, 99, 102,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, 61, 65, 68, 72, 74, 75, 78, 81, 83, 85,
+ 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85,
+ 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, 79, 81,
+ 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81,
+ 36, 35, 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90,
+ 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90,
+ 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, 95, 98, 100, 102,
+ 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103,
+ 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, 117,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+ 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134,
+ 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
+ 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, 155, 160,
+ }, {
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68,
+ 31, 31, 31, 32, 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, 52, 54, 56, 57, 58, 59, 60, 61, 62, 62,
+ 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62,
+ 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, 58, 59,
+ 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59,
+ 48, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65,
+ 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65,
+ 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, 69, 70, 71, 71,
+ 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71,
+ 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79,
+ 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79,
+ 58, 55, 54, 52, 52, 52, 51, 50, 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, 83, 83, 85, 85, 86, 87,
+ 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87,
+ 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95,
+ 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95,
+ 67, 64, 64, 61, 61, 60, 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, 86, 90, 90, 95, 95, 96, 96, 98,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
+ 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+ 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141,
+ 87, 82, 82, 78, 78, 77, 77, 75, 75, 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+ }, {
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66,
+ 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60,
+ 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60,
+ 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57,
+ 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57,
+ 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63,
+ 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63,
+ 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92,
+ 66, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, 71, 71,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70,
+ 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74,
+ 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79,
+ 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91,
+ 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115,
+ 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, 111, 118, 118,
+ 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127,
+ 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133,
+ }, {
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63,
+ 31, 31, 31, 32, 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58,
+ 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57,
+ 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 56, 56,
+ 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55,
+ 42, 42, 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58,
+ 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60,
+ 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62,
+ 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68,
+ 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75,
+ 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75,
+ 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80,
+ 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82,
+ 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86,
+ 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64,
+ 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, 59, 63,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62,
+ 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, 65, 68, 68, 72,
+ 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83,
+ 51, 49, 49, 48, 47, 47, 48, 48, 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, 83, 84, 86, 89, 89, 93,
+ 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96,
+ 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101,
+ 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ }, {
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59,
+ 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 50, 51, 51, 53, 53, 55,
+ 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54,
+ 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51, 53,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57,
+ 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65,
+ 51, 50, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, 67, 67, 68, 69, 69, 70,
+ 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72,
+ 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74,
+ 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78,
+ 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 42, 42, 45, 46, 47, 50, 51, 52, 55, 55,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55,
+ 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, 59, 60, 62, 62,
+ 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67,
+ 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75,
+ 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75,
+ 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79,
+ 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87,
+ 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87,
+ 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92,
+ }, {
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54,
+ 31, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, 46, 46, 47, 48, 48, 49, 49, 50, 51, 51,
+ 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50,
+ 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50,
+ 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49,
+ 37, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49,
+ 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50,
+ 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 54, 55, 55,
+ 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57,
+ 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62,
+ 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62,
+ 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65,
+ 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68,
+ 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68,
+ 54, 53, 52, 52, 50, 50, 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, 65, 65, 66, 68, 68, 69, 71, 71,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48,
+ 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 47, 48, 48, 50,
+ 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58,
+ 40, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, 59, 59, 61, 62, 62, 64,
+ 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73,
+ 53, 52, 51, 51, 50, 49, 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, 67, 71, 71, 72, 75, 76, 76, 79,
+ }, {
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 46, 46, 47, 48, 48, 48, 49,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47,
+ 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47,
+ 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 48, 48, 48, 48,
+ 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 49, 48, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, 56, 56, 57, 57, 57, 58,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62,
+ 52, 51, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 59, 61, 61, 62, 63, 64, 64, 65,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ }, {
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46,
+ 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
+ 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
+ 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 48,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46,
+ 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47,
+ 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49,
+ 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, 42, 42, 43, 43,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44,
+ 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, 37, 37, 37, 37, 37, 38,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40,
+ 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42,
+ 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 43, 43, 43, 43, 43, 43, 44,
+ },
+ }, {
+ {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ }, {
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ },
+ },
+};
+
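[Editorial note, not part of the upstream patch: the `qm_tbl_32x32_t` table declared next holds, per entry, two planes of 528 values each. 528 equals 32*33/2, and the rows listed below grow by one value at a time, which is consistent with each 32x32 quantization matrix being stored as its lower triangle in row-major order. The following sketch shows how such a triangular plane could be expanded into a full symmetric 32x32 matrix under that assumption; `expand_qm_32x32` is a hypothetical helper for illustration, not a function taken from this patch.]

    /* Illustrative sketch only -- not part of the upstream dav1d sources.
     * Expands one triangular 528-entry plane (32*33/2 values, assumed to be
     * the lower triangle stored row by row) into a full symmetric 32x32
     * quantization matrix. */
    #include <stdint.h>

    static void expand_qm_32x32(uint8_t dst[32][32], const uint8_t tri[528])
    {
        int k = 0;
        for (int i = 0; i < 32; i++)
            for (int j = 0; j <= i; j++, k++)
                dst[i][j] = dst[j][i] = tri[k]; /* mirror across the diagonal */
    }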
+static const uint8_t qm_tbl_32x32_t[][2][528] = {
+ {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 33, 33,
+ 32, 32, 32, 33, 34, 35,
+ 34, 34, 33, 34, 35, 37, 39,
+ 35, 34, 34, 35, 36, 37, 41, 43,
+ 36, 35, 34, 35, 36, 38, 42, 45, 48,
+ 39, 38, 37, 38, 39, 40, 45, 47, 50, 54,
+ 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, 63,
+ 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67,
+ 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71,
+ 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82,
+ 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92,
+ 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, 95, 98,
+ 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105,
+ 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103, 107, 111, 117,
+ 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, 96, 104, 110, 114, 118, 125, 134,
+ 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137,
+ 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140,
+ 88, 84, 80, 79, 78, 76, 80, 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152,
+ 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, 159,
+ 94, 89, 86, 85, 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, 156, 158, 161, 166,
+ 97, 92, 90, 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174,
+ 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, 183,
+ 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+ 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+ 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, 210,
+ 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220,
+ 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, 222, 231,
+ 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230, 232, 242,
+ }, {
+ 32,
+ 31, 31,
+ 30, 31, 32,
+ 32, 33, 33, 35,
+ 33, 34, 35, 37, 39,
+ 36, 38, 40, 41, 43, 47,
+ 41, 42, 42, 43, 45, 47, 48,
+ 45, 45, 44, 45, 46, 47, 49, 50,
+ 49, 47, 46, 47, 47, 48, 50, 51, 53,
+ 48, 47, 45, 46, 46, 46, 49, 51, 53, 54,
+ 49, 47, 45, 45, 45, 45, 49, 51, 53, 55, 58,
+ 50, 47, 45, 46, 46, 46, 49, 51, 54, 56, 59, 60,
+ 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, 61,
+ 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66,
+ 54, 52, 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71,
+ 56, 53, 51, 50, 50, 49, 52, 54, 56, 59, 63, 64, 66, 69, 72, 73,
+ 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76,
+ 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, 77, 79, 82,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, 89,
+ 64, 61, 58, 57, 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90,
+ 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, 96,
+ 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, 84, 86, 90, 93, 94, 96, 98,
+ 69, 65, 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, 96, 97, 98, 100,
+ 70, 66, 64, 63, 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, 99, 100, 102,
+ 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102, 104,
+ 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106,
+ 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109,
+ 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108, 111,
+ 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, 110, 111, 113,
+ 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, 112, 113, 116,
+ 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, 115, 115, 118,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 33,
+ 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 36, 36,
+ 34, 34, 33, 34, 35, 37, 38, 39,
+ 36, 35, 34, 35, 36, 38, 40, 42, 48,
+ 38, 37, 36, 36, 38, 39, 41, 44, 50, 51,
+ 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54,
+ 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63,
+ 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69,
+ 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73,
+ 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82,
+ 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92,
+ 61, 58, 56, 56, 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97,
+ 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105,
+ 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117,
+ 76, 72, 70, 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127,
+ 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, 130, 134,
+ 83, 78, 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137, 140,
+ 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, 124, 131, 136, 140, 144, 147,
+ 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155,
+ 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154, 156, 162,
+ 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169,
+ 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+ 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+ 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186, 193,
+ 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201,
+ 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210,
+ 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219,
+ }, {
+ 32,
+ 31, 31,
+ 30, 31, 31,
+ 31, 32, 32, 33,
+ 33, 34, 35, 36, 39,
+ 36, 38, 39, 40, 43, 47,
+ 38, 40, 41, 41, 44, 47, 47,
+ 41, 42, 42, 43, 45, 47, 48, 48,
+ 49, 47, 46, 46, 47, 48, 49, 50, 53,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53,
+ 48, 47, 46, 45, 46, 46, 48, 49, 53, 54, 54,
+ 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55, 58,
+ 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61,
+ 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62,
+ 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66,
+ 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, 71,
+ 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, 69, 72, 73,
+ 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, 70, 73, 74, 76,
+ 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, 82,
+ 62, 59, 57, 56, 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89,
+ 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91,
+ 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81, 84, 87, 90, 91, 93, 94,
+ 67, 64, 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97,
+ 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, 99,
+ 69, 66, 64, 63, 61, 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101,
+ 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103,
+ 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105,
+ 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107,
+ 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109,
+ 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111,
+ 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 33,
+ 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 34, 34, 35,
+ 34, 34, 33, 33, 35, 36, 37, 39,
+ 34, 34, 34, 34, 36, 36, 37, 41, 42,
+ 36, 35, 34, 34, 36, 37, 38, 42, 45, 48,
+ 39, 38, 38, 37, 39, 40, 40, 45, 47, 50, 54,
+ 41, 39, 39, 38, 40, 40, 41, 46, 48, 51, 55, 56,
+ 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, 58, 59, 63,
+ 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, 70,
+ 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, 73,
+ 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, 82,
+ 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90,
+ 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, 87, 91, 93,
+ 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, 92, 97, 99, 105,
+ 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113,
+ 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, 92, 97, 102, 104, 111, 115, 117,
+ 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, 134,
+ 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 120, 125, 127, 136, 137,
+ 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140,
+ 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150,
+ 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, 156,
+ 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+ 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+ 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, 176,
+ 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, 176, 177, 184,
+ 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191,
+ 107, 101, 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199,
+ }, {
+ 32,
+ 31, 31,
+ 30, 31, 31,
+ 30, 31, 31, 32,
+ 33, 34, 35, 35, 39,
+ 35, 36, 37, 37, 41, 43,
+ 36, 38, 39, 40, 43, 45, 47,
+ 41, 42, 42, 42, 45, 46, 47, 48,
+ 44, 44, 44, 44, 46, 46, 47, 49, 50,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53,
+ 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54,
+ 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, 55,
+ 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, 58,
+ 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61,
+ 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62,
+ 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66,
+ 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, 65, 68, 70,
+ 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, 66, 68, 70, 71,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, 76,
+ 59, 56, 54, 53, 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80,
+ 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82,
+ 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89,
+ 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 89, 90,
+ 65, 61, 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, 95,
+ 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97,
+ 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99,
+ 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101,
+ 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103,
+ 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+ 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, 106,
+ 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, 106, 106, 108,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 33,
+ 31, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 34, 36, 36,
+ 34, 34, 34, 33, 35, 35, 37, 38, 39,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46,
+ 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48,
+ 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50, 54,
+ 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, 57,
+ 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63,
+ 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69,
+ 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71,
+ 54, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82,
+ 56, 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86,
+ 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92,
+ 64, 61, 60, 58, 58, 58, 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105,
+ 71, 68, 67, 65, 64, 64, 63, 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117,
+ 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123,
+ 80, 76, 74, 72, 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139,
+ 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, 140,
+ 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+ 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+ 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, 162,
+ 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, 161, 162, 168,
+ 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174,
+ 100, 95, 95, 90, 90, 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 32,
+ 33, 34, 34, 34, 37,
+ 33, 34, 35, 35, 38, 39,
+ 36, 38, 39, 40, 42, 43, 47,
+ 38, 40, 40, 41, 43, 44, 47, 47,
+ 41, 42, 42, 42, 44, 45, 47, 48, 48,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52,
+ 49, 47, 47, 46, 47, 47, 48, 49, 50, 52, 53,
+ 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53, 54,
+ 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, 55,
+ 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, 61,
+ 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 63, 66,
+ 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, 64, 64, 67, 68,
+ 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, 71,
+ 56, 54, 53, 51, 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76,
+ 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82,
+ 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73, 74, 76, 79, 80, 83, 84,
+ 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, 91,
+ 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91,
+ 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95,
+ 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95, 97,
+ 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100,
+ 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100, 102,
+ 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, 101, 104,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 33, 34, 35, 35,
+ 33, 33, 33, 33, 34, 35, 36, 36, 38,
+ 34, 34, 34, 33, 34, 35, 36, 37, 39, 39,
+ 36, 35, 35, 34, 35, 36, 37, 38, 42, 42, 48,
+ 36, 35, 35, 34, 35, 36, 38, 38, 42, 43, 48, 49,
+ 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, 54,
+ 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56,
+ 44, 42, 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63,
+ 46, 44, 44, 42, 43, 44, 44, 44, 48, 49, 55, 55, 59, 61, 65, 67,
+ 48, 46, 46, 44, 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71,
+ 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70, 72, 74, 78,
+ 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 74, 76, 80, 82,
+ 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91,
+ 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, 85, 87, 91, 92,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105,
+ 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107,
+ 71, 68, 67, 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, 103, 111, 112, 117,
+ 74, 71, 70, 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114, 115, 120, 123,
+ 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128, 134,
+ 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+ 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+ 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, 147,
+ 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, 148, 149, 153,
+ 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159,
+ 93, 88, 88, 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 32,
+ 31, 32, 32, 33, 34,
+ 33, 34, 35, 35, 37, 39,
+ 35, 37, 37, 38, 39, 41, 44,
+ 36, 38, 39, 40, 41, 43, 46, 47,
+ 40, 41, 41, 42, 43, 44, 46, 47, 48,
+ 41, 42, 42, 42, 43, 45, 46, 47, 48, 48,
+ 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53,
+ 49, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53,
+ 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55,
+ 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58,
+ 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+ 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+ 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65,
+ 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 66,
+ 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70,
+ 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71,
+ 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76,
+ 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71, 73, 74, 77, 77,
+ 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82,
+ 61, 58, 57, 55, 55, 54, 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, 84,
+ 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90,
+ 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91,
+ 66, 63, 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, 84, 87, 89, 91, 92, 93, 94,
+ 67, 64, 63, 61, 60, 59, 58, 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, 93, 94, 94, 96,
+ 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, 98,
+ 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99,
+ },
+ }, {
+ {
+ 32,
+ 31, 32,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 34, 34, 35,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48,
+ 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54,
+ 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63,
+ 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63,
+ 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71,
+ 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, 71,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82,
+ 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92,
+ 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105,
+ 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105,
+ 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117,
+ 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 134,
+ 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140,
+ 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140,
+ 87, 83, 83, 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 32,
+ 30, 31, 31, 32, 32,
+ 33, 34, 34, 35, 35, 39,
+ 33, 34, 34, 35, 35, 39, 39,
+ 36, 38, 38, 40, 40, 43, 43, 47,
+ 36, 38, 38, 40, 40, 43, 43, 47, 47,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48,
+ 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53,
+ 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53,
+ 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61,
+ 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66,
+ 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66,
+ 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71,
+ 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76,
+ 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76,
+ 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82,
+ 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89,
+ 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91,
+ 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, 91,
+ 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 32, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35,
+ 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39,
+ 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39,
+ 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, 46,
+ 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48,
+ 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51,
+ 39, 38, 38, 38, 37, 38, 39, 39, 40, 41, 45, 45, 49, 50, 52, 54,
+ 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, 58, 60, 63,
+ 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, 65,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71,
+ 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, 81,
+ 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82,
+ 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, 75, 79, 79, 85, 85, 89,
+ 59, 56, 56, 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, 87, 90, 92,
+ 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, 98,
+ 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105,
+ 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
+ 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, 117,
+ 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, 119,
+ 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134,
+ 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 134,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 30, 31, 31, 31,
+ 30, 31, 31, 31, 32,
+ 32, 32, 33, 33, 33, 35,
+ 33, 34, 34, 35, 35, 37, 39,
+ 34, 35, 35, 36, 36, 38, 40, 41,
+ 36, 38, 38, 39, 40, 41, 43, 44, 47,
+ 37, 38, 39, 40, 40, 42, 43, 44, 47, 47,
+ 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48,
+ 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 48,
+ 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, 52,
+ 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53,
+ 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53,
+ 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57, 58,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, 59,
+ 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61,
+ 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61,
+ 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66,
+ 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66,
+ 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69,
+ 54, 52, 52, 50, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, 71,
+ 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73,
+ 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76,
+ 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78,
+ 60, 57, 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, 72, 74, 75, 77, 79, 80, 82,
+ 60, 57, 57, 55, 54, 54, 54, 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, 79, 80, 82, 82,
+ 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+ 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 32, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35,
+ 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38,
+ 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, 39,
+ 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48,
+ 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52,
+ 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, 50, 50, 53, 54,
+ 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, 57,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63,
+ 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69,
+ 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60, 61, 63, 67, 67, 70, 71,
+ 50, 49, 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82,
+ 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90,
+ 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91, 92,
+ 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, 105,
+ 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, 109, 114,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 30, 31, 31, 31, 32,
+ 30, 31, 31, 31, 32, 32,
+ 33, 33, 34, 34, 34, 34, 37,
+ 33, 34, 34, 35, 35, 35, 38, 39,
+ 34, 36, 36, 36, 37, 37, 40, 40, 42,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47,
+ 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47, 48,
+ 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, 48,
+ 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, 55,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58,
+ 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61,
+ 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58, 60, 60, 61, 61,
+ 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63,
+ 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66,
+ 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66,
+ 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70,
+ 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71,
+ 55, 53, 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, 66, 67, 69, 69, 71, 72, 73,
+ 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76,
+ 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, 76,
+ 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 31, 32, 32,
+ 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36,
+ 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+ 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39,
+ 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, 51,
+ 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54,
+ 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54,
+ 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60,
+ 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56, 58, 58, 61, 63,
+ 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64,
+ 47, 46, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69,
+ 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71,
+ 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73,
+ 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81,
+ 54, 52, 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82,
+ 55, 53, 53, 52, 51, 50, 50, 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, 78, 83, 83, 85,
+ 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+ 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 30, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 32,
+ 31, 31, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 35, 35, 35, 38,
+ 33, 34, 34, 34, 35, 35, 36, 38, 39,
+ 34, 35, 35, 36, 36, 36, 37, 40, 40, 41,
+ 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47,
+ 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47,
+ 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48,
+ 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, 53,
+ 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53,
+ 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54,
+ 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54,
+ 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 55, 55, 57, 58,
+ 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59,
+ 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, 61,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61,
+ 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62,
+ 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66,
+ 52, 50, 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66,
+ 53, 51, 50, 50, 48, 48, 48, 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, 64, 67, 67, 68,
+ 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+ 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 31, 32, 32,
+ 31, 31, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35,
+ 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40, 41,
+ 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, 46,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48,
+ 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51,
+ 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54,
+ 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54,
+ 41, 40, 40, 40, 39, 38, 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, 57,
+ 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63,
+ 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63,
+ 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65,
+ 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69,
+ 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71,
+ 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, 71,
+ 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 32,
+ 30, 31, 31, 31, 31, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 34, 34, 35, 37,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39,
+ 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41, 44,
+ 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47,
+ 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+ 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49, 49,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52,
+ 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53,
+ 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53,
+ 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53,
+ 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54,
+ 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54,
+ 49, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, 55,
+ 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58,
+ 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61,
+ 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, 61,
+ 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 32,
+ 31, 31, 32, 32,
+ 31, 31, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39,
+ 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48,
+ 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54,
+ 41, 41, 40, 40, 40, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, 56, 56, 56, 58,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+ 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 32,
+ 30, 31, 31, 31, 31, 31, 32, 32,
+ 30, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 35,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39,
+ 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+ 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48,
+ 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50,
+ 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+ 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53,
+ 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53,
+ 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 56,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+ 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 32,
+ 31, 31, 31, 32, 32,
+ 31, 31, 31, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38,
+ 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46,
+ 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48,
+ 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48,
+ 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 48,
+ 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 37,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+ 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47,
+ 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47,
+ 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50,
+ 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
+ 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53,
+ 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53,
+ 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 53,
+ 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 32, 32,
+ 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 36,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 38,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39,
+ 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41,
+ 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41, 41, 41, 42, 44,
+ 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46,
+ 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47,
+ 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+ 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47,
+ 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47,
+ 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32,
+ 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40,
+ },
+ }, {
+ {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ }, {
+ 32,
+ 31, 31,
+ 31, 31, 31,
+ 31, 31, 31, 31,
+ 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ },
+ },
+};
+
+const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
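+// Scratch buffers for the tables built at runtime by dav1d_init_qm_tables():
+// the rectangular sizes are transposed from the constant data above, the
+// square sizes are expanded from their packed triangular form, and 16x16 is
+// subsampled from 32x32.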
+static uint8_t qm_tbl_4x4[15][2][16];
+static uint8_t qm_tbl_4x8[15][2][32];
+static uint8_t qm_tbl_4x16[15][2][64];
+static uint8_t qm_tbl_8x8[15][2][64];
+static uint8_t qm_tbl_8x16[15][2][128];
+static uint8_t qm_tbl_8x32[15][2][256];
+static uint8_t qm_tbl_16x16[15][2][256];
+static uint8_t qm_tbl_16x32[15][2][512];
+static uint8_t qm_tbl_32x32[15][2][1024];
+
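+// Subsample a (sz*step)x(sz*step) matrix down to sz x sz by keeping every
+// step'th coefficient in both dimensions.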
+static void subsample(uint8_t *const dst, const uint8_t *const src,
+ const int sz, const int step)
+{
+ for (int y = 0; y < sz; y++)
+ for (int x = 0; x < sz; x++)
+ dst[y * sz + x] = src[y * sz * step * step + x * step];
+}
+
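+// Transpose a matrix with w columns and h rows (src) into one with h columns
+// and w rows (dst).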
+static void transpose(uint8_t *const dst, const uint8_t *const src,
+ const int w, const int h)
+{
+ for (int y = 0, y_off = 0; y < h; y++, y_off += w)
+ for (int x = 0, x_off = 0; x < w; x++, x_off += h)
+ dst[x_off + y] = src[y_off + x];
+}
+
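+// Expand a packed lower-triangular matrix (sz rows of 1, 2, ..., sz entries)
+// into a full sz x sz matrix by mirroring across the diagonal.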
+static void untriangle(uint8_t *dst, const uint8_t *src, const int sz) {
+ for (int y = 0; y < sz; y++) {
+ memcpy(dst, src, y + 1);
+ const uint8_t *src_ptr = &src[y];
+ for (int x = y + 1; x < sz; x++) {
+ src_ptr += x;
+ dst[x] = *src_ptr;
+ }
+ dst += sz;
+ src += y + 1;
+ }
+}
+
+COLD void dav1d_init_qm_tables(void) {
+ // This function is guaranteed to be called only once
+
+ for (int i = 0; i < 15; i++)
+ for (int j = 0; j < 2; j++) {
+            // Note that the w/h in the assignments is deliberately inverted:
+            // the coefficients are stored transposed, so e.g. the constant
+            // 8x4 table serves the 4x8 transform size directly.
+ dav1d_qm_tbl[i][j][RTX_4X8 ] = qm_tbl_8x4[i][j];
+ dav1d_qm_tbl[i][j][RTX_8X4 ] = qm_tbl_4x8[i][j];
+ transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4);
+ dav1d_qm_tbl[i][j][RTX_4X16 ] = qm_tbl_16x4[i][j];
+ dav1d_qm_tbl[i][j][RTX_16X4 ] = qm_tbl_4x16[i][j];
+ transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4);
+ dav1d_qm_tbl[i][j][RTX_8X16 ] = qm_tbl_16x8[i][j];
+ dav1d_qm_tbl[i][j][RTX_16X8 ] = qm_tbl_8x16[i][j];
+ transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8);
+ dav1d_qm_tbl[i][j][RTX_8X32 ] = qm_tbl_32x8[i][j];
+ dav1d_qm_tbl[i][j][RTX_32X8 ] = qm_tbl_8x32[i][j];
+ transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8);
+ dav1d_qm_tbl[i][j][RTX_16X32] = qm_tbl_32x16[i][j];
+ dav1d_qm_tbl[i][j][RTX_32X16] = qm_tbl_16x32[i][j];
+ transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16);
+
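+            // The square tables are stored as packed lower triangles (the
+            // matrices are symmetric) and expanded here; 16x16 has no constant
+            // data of its own and is derived by subsampling the 32x32 table.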
+ dav1d_qm_tbl[i][j][ TX_4X4 ] = qm_tbl_4x4[i][j];
+ dav1d_qm_tbl[i][j][ TX_8X8 ] = qm_tbl_8x8[i][j];
+ dav1d_qm_tbl[i][j][ TX_16X16] = qm_tbl_16x16[i][j];
+ dav1d_qm_tbl[i][j][ TX_32X32] = qm_tbl_32x32[i][j];
+ untriangle(qm_tbl_4x4[i][j], qm_tbl_4x4_t[i][j], 4);
+ untriangle(qm_tbl_8x8[i][j], qm_tbl_8x8_t[i][j], 8);
+ untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32);
+ subsample(qm_tbl_16x16[i][j], qm_tbl_32x32[i][j], 16, 2);
+
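+            // AV1 defines quantizer matrices only up to 32x32; the 64-point
+            // sizes reuse the corresponding 32-point tables (coefficients
+            // beyond 32 in either dimension are zeroed out anyway).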
+ dav1d_qm_tbl[i][j][ TX_64X64] = dav1d_qm_tbl[i][j][ TX_32X32];
+ dav1d_qm_tbl[i][j][RTX_64X32] = dav1d_qm_tbl[i][j][ TX_32X32];
+ dav1d_qm_tbl[i][j][RTX_64X16] = dav1d_qm_tbl[i][j][RTX_32X16];
+ dav1d_qm_tbl[i][j][RTX_32X64] = dav1d_qm_tbl[i][j][ TX_32X32];
+ dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32];
+ }
+
+    // dav1d_qm_tbl[15][*][*] is left NULL (qm index 15 disables the
+    // quantizer matrix)
+}
diff --git a/third_party/dav1d/src/qm.h b/third_party/dav1d/src/qm.h
new file mode 100644
index 0000000000..8191c8afa7
--- /dev/null
+++ b/third_party/dav1d/src/qm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_QM_H
+#define DAV1D_SRC_QM_H
+
+#include "src/levels.h"
+
+EXTERN const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
+
+void dav1d_init_qm_tables(void);
+
+#endif /* DAV1D_SRC_QM_H */
diff --git a/third_party/dav1d/src/recon.h b/third_party/dav1d/src/recon.h
new file mode 100644
index 0000000000..e97ac31ffb
--- /dev/null
+++ b/third_party/dav1d/src/recon.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_RECON_H
+#define DAV1D_SRC_RECON_H
+
+#include "src/internal.h"
+#include "src/levels.h"
+
+#define DEBUG_BLOCK_INFO 0 && \
+ f->frame_hdr->frame_offset == 2 && t->by >= 0 && t->by < 4 && \
+ t->bx >= 8 && t->bx < 12
+#define DEBUG_B_PIXELS 0
+
+#define decl_recon_b_intra_fn(name) \
+void (name)(Dav1dTaskContext *t, enum BlockSize bs, \
+ enum EdgeFlags intra_edge_flags, const Av1Block *b)
+typedef decl_recon_b_intra_fn(*recon_b_intra_fn);
+
+#define decl_recon_b_inter_fn(name) \
+int (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
+typedef decl_recon_b_inter_fn(*recon_b_inter_fn);
+
+#define decl_filter_sbrow_fn(name) \
+void (name)(Dav1dFrameContext *f, int sby)
+typedef decl_filter_sbrow_fn(*filter_sbrow_fn);
+
+#define decl_backup_ipred_edge_fn(name) \
+void (name)(Dav1dTaskContext *t)
+typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);
+
+#define decl_read_coef_blocks_fn(name) \
+void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
+typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);
+
+decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
+decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc);
+
+decl_recon_b_inter_fn(dav1d_recon_b_inter_8bpc);
+decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);
+
+decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_16bpc);
+void dav1d_filter_sbrow_cdef_8bpc(Dav1dTaskContext *tc, int sby);
+void dav1d_filter_sbrow_cdef_16bpc(Dav1dTaskContext *tc, int sby);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc);
+
+decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
+decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);
+
+decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc);
+decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc);
+
+#endif /* DAV1D_SRC_RECON_H */
diff --git a/third_party/dav1d/src/recon_tmpl.c b/third_party/dav1d/src/recon_tmpl.c
new file mode 100644
index 0000000000..3158ef5b02
--- /dev/null
+++ b/third_party/dav1d/src/recon_tmpl.c
@@ -0,0 +1,2202 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "common/attributes.h"
+#include "common/bitdepth.h"
+#include "common/dump.h"
+#include "common/frame.h"
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+#include "src/ctx.h"
+#include "src/ipred_prepare.h"
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+#include "src/recon.h"
+#include "src/scan.h"
+#include "src/tables.h"
+#include "src/wedge.h"
+
+static inline unsigned read_golomb(MsacContext *const msac) {
+ int len = 0;
+ unsigned val = 1;
+
+ while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
+ while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
+
+ return val - 1;
+}
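read_golomb() above is the Exp-Golomb reader used when a coefficient token saturates at 15: a run of equiprobable zero bits (capped at 32) gives the suffix length, the suffix bits follow most-significant first, and the decoded value is val - 1. Below is a standalone sketch of the same bit layout, driven by a plain bit array instead of the msac decoder; the bit source is purely illustrative.

    #include <stdio.h>

    typedef struct { const unsigned char *bits; int pos; } BitSrc;

    static int next_bit(BitSrc *const s) { return s->bits[s->pos++]; }

    /* Same structure as read_golomb(), with the entropy decoder swapped
     * for a raw bit source. */
    static unsigned golomb_sketch(BitSrc *const s) {
        int len = 0;
        unsigned val = 1;
        while (!next_bit(s) && len < 32) len++;       /* count leading zeros */
        while (len--) val = (val << 1) + next_bit(s); /* append len suffix bits */
        return val - 1;
    }

    int main(void) {
        /* bits "0 0 1 1 0": len = 2, val = 0b110 = 6, returns 5 */
        const unsigned char stream[] = { 0, 0, 1, 1, 0 };
        BitSrc s = { stream, 0 };
        printf("%u\n", golomb_sketch(&s)); /* prints 5 */
        return 0;
    }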
+
+static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
+ const enum BlockSize bs,
+ const uint8_t *const a,
+ const uint8_t *const l,
+ const int chroma,
+ const enum Dav1dPixelLayout layout)
+{
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+
+ if (chroma) {
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
+ b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
+ unsigned ca, cl;
+
+#define MERGE_CTX(dir, type, no_val) \
+ c##dir = *(const type *) dir != no_val; \
+ break
+
+ switch (t_dim->lw) {
+ /* For some reason the MSVC CRT _wassert() function is not flagged as
+ * __declspec(noreturn), so when using those headers the compiler will
+ * expect execution to continue after an assertion has been triggered
+ * and will therefore complain about the use of uninitialized variables
+ * when compiled in debug mode if we put the default case at the end. */
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(a, uint8_t, 0x40);
+ case TX_8X8: MERGE_CTX(a, uint16_t, 0x4040);
+ case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
+ case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
+ }
+ switch (t_dim->lh) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(l, uint8_t, 0x40);
+ case TX_8X8: MERGE_CTX(l, uint16_t, 0x4040);
+ case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
+ case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
+ }
+#undef MERGE_CTX
+
+ return 7 + not_one_blk * 3 + ca + cl;
+ } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
+ return 0;
+ } else {
+ unsigned la, ll;
+
+#define MERGE_CTX(dir, type, tx) \
+ if (tx == TX_64X64) { \
+ uint64_t tmp = *(const uint64_t *) dir; \
+ tmp |= *(const uint64_t *) &dir[8]; \
+ l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
+ } else \
+ l##dir = *(const type *) dir; \
+ if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
+ if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
+ if (tx >= TX_8X8) l##dir |= l##dir >> 8; \
+ break
+
+ switch (t_dim->lw) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4);
+ case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8);
+ case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
+ case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
+ case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
+ }
+ switch (t_dim->lh) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4);
+ case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8);
+ case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
+ case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
+ case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
+ }
+#undef MERGE_CTX
+
+ return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
+ }
+}
+
+static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
+ const uint8_t *const a,
+ const uint8_t *const l)
+{
+ uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
+ int s;
+
+#if ARCH_X86_64 && defined(__GNUC__)
+ /* Coerce compilers into producing better code. For some reason
+ * every x86-64 compiler is awful at handling 64-bit constants. */
+ __asm__("" : "+r"(mask), "+r"(mul));
+#endif
+
+ switch(tx) {
+ default: assert(0); /* fall-through */
+ case TX_4X4: {
+ int t = *(const uint8_t *) a >> 6;
+ t += *(const uint8_t *) l >> 6;
+ s = t - 1 - 1;
+ break;
+ }
+ case TX_8X8: {
+ uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t *= 0x04040404U;
+ s = (int) (t >> 24) - 2 - 2;
+ break;
+ }
+ case TX_16X16: {
+ uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
+ t += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
+ t *= (uint32_t) mul;
+ s = (int) (t >> 24) - 4 - 4;
+ break;
+ }
+ case TX_32X32: {
+ uint64_t t = (*(const uint64_t *) a & mask) >> 6;
+ t += (*(const uint64_t *) l & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 8 - 8;
+ break;
+ }
+ case TX_64X64: {
+ uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+ t += (*(const uint64_t *) &a[8] & mask) >> 6;
+ t += (*(const uint64_t *) &l[0] & mask) >> 6;
+ t += (*(const uint64_t *) &l[8] & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 16 - 16;
+ break;
+ }
+ case RTX_4X8: {
+ uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t *= 0x04040404U;
+ s = (int) (t >> 24) - 1 - 2;
+ break;
+ }
+ case RTX_8X4: {
+ uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint8_t *) l & (uint32_t) mask;
+ t *= 0x04040404U;
+ s = (int) (t >> 24) - 2 - 1;
+ break;
+ }
+ case RTX_8X16: {
+ uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 2 - 4;
+ break;
+ }
+ case RTX_16X8: {
+ uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 4 - 2;
+ break;
+ }
+ case RTX_16X32: {
+ uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint64_t *) l & mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 4 - 8;
+ break;
+ }
+ case RTX_32X16: {
+ uint64_t t = *(const uint64_t *) a & mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 8 - 4;
+ break;
+ }
+ case RTX_32X64: {
+ uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+ t += (*(const uint64_t *) &l[0] & mask) >> 6;
+ t += (*(const uint64_t *) &l[8] & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 8 - 16;
+ break;
+ }
+ case RTX_64X32: {
+ uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+ t += (*(const uint64_t *) &a[8] & mask) >> 6;
+ t += (*(const uint64_t *) &l[0] & mask) >> 6;
+ t *= mul;
+ s = (int) (t >> 56) - 16 - 8;
+ break;
+ }
+ case RTX_4X16: {
+ uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 1 - 4;
+ break;
+ }
+ case RTX_16X4: {
+ uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint8_t *) l & (uint32_t) mask;
+ t = (t >> 6) * (uint32_t) mul;
+ s = (int) (t >> 24) - 4 - 1;
+ break;
+ }
+ case RTX_8X32: {
+ uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
+ t += *(const uint64_t *) l & mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 2 - 8;
+ break;
+ }
+ case RTX_32X8: {
+ uint64_t t = *(const uint64_t *) a & mask;
+ t += *(const uint16_t *) l & (uint32_t) mask;
+ t = (t >> 6) * mul;
+ s = (int) (t >> 56) - 8 - 2;
+ break;
+ }
+ case RTX_16X64: {
+ uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+ t += *(const uint64_t *) &l[0] & mask;
+ t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
+ t *= mul;
+ s = (int) (t >> 56) - 4 - 16;
+ break;
+ }
+ case RTX_64X16: {
+ uint64_t t = *(const uint64_t *) &a[0] & mask;
+ t += *(const uint32_t *) l & (uint32_t) mask;
+ t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
+ t *= mul;
+ s = (int) (t >> 56) - 16 - 4;
+ break;
+ }
+ }
+
+ return (s != 0) + (s > 0);
+}
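The switch above is SWAR arithmetic over the packed neighbour context bytes: judging from the dc_sign_level computation in decode_coefs() further down, bits 6-7 of each byte record the previous DC sign (0x00 negative, 0x40 zero or absent, 0x80 positive), so summing (byte >> 6) - 1 over the covered 4x4 units gives the count of positive minus negative neighbours, and the return value maps that to 0 (balanced), 1 (mostly negative) or 2 (mostly positive). A scalar sketch of the same computation, with na/nl standing for the number of 4x4 units along the top and left edges:

    #include <stdint.h>

    /* Scalar reference for the packed arithmetic in get_dc_sign_ctx().
     * Not part of the decoder; na and nl are illustrative parameters. */
    static unsigned dc_sign_ctx_sketch(const uint8_t *const a, const int na,
                                       const uint8_t *const l, const int nl)
    {
        int s = 0;
        for (int i = 0; i < na; i++) s += (a[i] >> 6) - 1;
        for (int i = 0; i < nl; i++) s += (l[i] >> 6) - 1;
        return (s != 0) + (s > 0);
    }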
+
+static inline unsigned get_lo_ctx(const uint8_t *const levels,
+ const enum TxClass tx_class,
+ unsigned *const hi_mag,
+ const uint8_t (*const ctx_offsets)[5],
+ const unsigned x, const unsigned y,
+ const ptrdiff_t stride)
+{
+ unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
+ unsigned offset;
+ if (tx_class == TX_CLASS_2D) {
+ mag += levels[1 * stride + 1];
+ *hi_mag = mag;
+ mag += levels[0 * stride + 2] + levels[2 * stride + 0];
+ offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
+ } else {
+ mag += levels[0 * stride + 2];
+ *hi_mag = mag;
+ mag += levels[0 * stride + 3] + levels[0 * stride + 4];
+ offset = 26 + (y > 1 ? 10 : y * 5);
+ }
+ return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
+}
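The last line of get_lo_ctx() quantizes the neighbour level sum into one of five buckets of width 128, with rounding and a cap, so the context offset grows slowly with local coefficient energy. Isolated, the mapping is:

    /* Bucket mapping used by get_lo_ctx(); shown separately only for
     * illustration. */
    static unsigned mag_bucket(const unsigned mag) {
        return mag > 512 ? 4 : (mag + 64) >> 7;
    }
    /* mag 0..63 -> 0, 64..191 -> 1, 192..319 -> 2, 320..447 -> 3, 448+ -> 4 */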
+
+static int decode_coefs(Dav1dTaskContext *const t,
+ uint8_t *const a, uint8_t *const l,
+ const enum RectTxfmSize tx, const enum BlockSize bs,
+ const Av1Block *const b, const int intra,
+ const int plane, coef *cf,
+ enum TxfmType *const txtp, uint8_t *res_ctx)
+{
+ Dav1dTileState *const ts = t->ts;
+ const int chroma = !!plane;
+ const Dav1dFrameContext *const f = t->f;
+ const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+ const int dbg = DEBUG_BLOCK_INFO && plane && 0;
+
+ if (dbg)
+ printf("Start: r=%d\n", ts->msac.rng);
+
+ // does this block have any non-zero coefficients
+ const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
+ const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.coef.skip[t_dim->ctx][sctx]);
+ if (dbg)
+ printf("Post-non-zero[%d][%d][%d]: r=%d\n",
+ t_dim->ctx, sctx, all_skip, ts->msac.rng);
+ if (all_skip) {
+ *res_ctx = 0x40;
+ *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
+ return -1;
+ }
+
+ // transform type (chroma: derived, luma: explicitly coded)
+ if (lossless) {
+ assert(t_dim->max == TX_4X4);
+ *txtp = WHT_WHT;
+ } else if (t_dim->max + intra >= TX_64X64) {
+ *txtp = DCT_DCT;
+ } else if (chroma) {
+ // inferred from either the luma txtp (inter) or a LUT (intra)
+ *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
+ get_uv_inter_txtp(t_dim, *txtp);
+ } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
+ // In libaom, lossless is checked by a literal qidx == 0, but not all
+ // such blocks are actually lossless. The remainder gets an implicit
+ // transform type (for luma)
+ *txtp = DCT_DCT;
+ } else {
+ unsigned idx;
+ if (intra) {
+ const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
+ dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
+ if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
+ idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
+ ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
+ *txtp = dav1d_tx_types_per_set[idx + 0];
+ } else {
+ idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
+ ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
+ *txtp = dav1d_tx_types_per_set[idx + 5];
+ }
+ if (dbg)
+ printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
+ tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
+ } else {
+ if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
+ idx = dav1d_msac_decode_bool_adapt(&ts->msac,
+ ts->cdf.m.txtp_inter3[t_dim->min]);
+ *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
+ } else if (t_dim->min == TX_16X16) {
+ idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.txtp_inter2, 11);
+ *txtp = dav1d_tx_types_per_set[idx + 12];
+ } else {
+ idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
+ ts->cdf.m.txtp_inter1[t_dim->min], 15);
+ *txtp = dav1d_tx_types_per_set[idx + 24];
+ }
+ if (dbg)
+ printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
+ tx, t_dim->min, idx, *txtp, ts->msac.rng);
+ }
+ }
+
+ // find end-of-block (eob)
+ int eob_bin;
+ const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
+ const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
+ const int is_1d = tx_class != TX_CLASS_2D;
+ switch (tx2dszctx) {
+#define case_sz(sz, bin, ns, is_1d) \
+ case sz: { \
+ uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
+ eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
+ break; \
+ }
+ case_sz(0, 16, 4, [is_1d]);
+ case_sz(1, 32, 8, [is_1d]);
+ case_sz(2, 64, 8, [is_1d]);
+ case_sz(3, 128, 8, [is_1d]);
+ case_sz(4, 256, 16, [is_1d]);
+ case_sz(5, 512, 16, );
+ case_sz(6, 1024, 16, );
+#undef case_sz
+ }
+ if (dbg)
+ printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
+ 16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
+ int eob;
+ if (eob_bin > 1) {
+ uint16_t *const eob_hi_bit_cdf =
+ ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
+ const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
+ if (dbg)
+ printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
+ t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
+ eob = ((eob_hi_bit | 2) << (eob_bin - 2)) |
+ dav1d_msac_decode_bools(&ts->msac, eob_bin - 2);
+ if (dbg)
+ printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
+ } else {
+ eob = eob_bin;
+ }
+ assert(eob >= 0);
+
+ // base tokens
+ uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
+ uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
+ unsigned rc, dc_tok;
+
+ if (eob) {
+ uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
+ uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
+ const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
+
+ /* eob */
+ unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
+ int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
+ int tok = eob_tok + 1;
+ int level_tok = tok * 0x41;
+ unsigned mag;
+
+#define DECODE_COEFS_CLASS(tx_class) \
+ unsigned x, y; \
+ if (tx_class == TX_CLASS_2D) \
+ rc = scan[eob], x = rc >> shift, y = rc & mask; \
+ else if (tx_class == TX_CLASS_H) \
+ /* Transposing reduces the stride and padding requirements */ \
+ x = eob & mask, y = eob >> shift, rc = eob; \
+ else /* tx_class == TX_CLASS_V */ \
+ x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
+ if (dbg) \
+ printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
+ if (eob_tok == 2) { \
+ ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
+ tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+ level_tok = tok + (3 << 6); \
+ if (dbg) \
+ printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
+ ts->msac.rng); \
+ } \
+ cf[rc] = tok << 11; \
+ levels[x * stride + y] = (uint8_t) level_tok; \
+ for (int i = eob - 1; i > 0; i--) { /* ac */ \
+ unsigned rc_i; \
+ if (tx_class == TX_CLASS_2D) \
+ rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
+ else if (tx_class == TX_CLASS_H) \
+ x = i & mask, y = i >> shift, rc_i = i; \
+ else /* tx_class == TX_CLASS_V */ \
+ x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
+ assert(x < 32 && y < 32); \
+ uint8_t *const level = levels + x * stride + y; \
+ ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
+ if (tx_class == TX_CLASS_2D) \
+ y |= x; \
+ tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+ if (dbg) \
+ printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
+ if (tok == 3) { \
+ mag &= 63; \
+ ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
+ (mag > 12 ? 6 : (mag + 1) >> 1); \
+ tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+ if (dbg) \
+ printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+ imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
+ ts->msac.rng); \
+ *level = (uint8_t) (tok + (3 << 6)); \
+ cf[rc_i] = (tok << 11) | rc; \
+ rc = rc_i; \
+ } else { \
+ /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
+ tok *= 0x17ff41; \
+ *level = (uint8_t) tok; \
+ /* tok ? (tok << 11) | rc : 0 */ \
+ tok = (tok >> 9) & (rc + ~0x7ffu); \
+ if (tok) rc = rc_i; \
+ cf[rc_i] = tok; \
+ } \
+ } \
+ /* dc */ \
+ ctx = (tx_class == TX_CLASS_2D) ? 0 : \
+ get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
+ dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+ if (dbg) \
+ printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
+ t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
+ if (dc_tok == 3) { \
+ if (tx_class == TX_CLASS_2D) \
+ mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
+ levels[1 * stride + 1]; \
+ mag &= 63; \
+ ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
+ dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+ if (dbg) \
+ printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
+ imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
+ } \
+ break
+
+ const uint16_t *scan;
+ switch (tx_class) {
+ case TX_CLASS_2D: {
+ const unsigned nonsquare_tx = tx >= RTX_4X8;
+ const uint8_t (*const lo_ctx_offsets)[5] =
+ dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
+ scan = dav1d_scans[tx];
+ const ptrdiff_t stride = 4 * sh;
+ const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0;
+ const unsigned mask = 4 * sh - 1;
+ memset(levels, 0, stride * (4 * sw + 2));
+ DECODE_COEFS_CLASS(TX_CLASS_2D);
+ }
+ case TX_CLASS_H: {
+ const uint8_t (*const lo_ctx_offsets)[5] = NULL;
+ const ptrdiff_t stride = 16;
+ const unsigned shift = t_dim->lh + 2, shift2 = 0;
+ const unsigned mask = 4 * sh - 1;
+ memset(levels, 0, stride * (4 * sh + 2));
+ DECODE_COEFS_CLASS(TX_CLASS_H);
+ }
+ case TX_CLASS_V: {
+ const uint8_t (*const lo_ctx_offsets)[5] = NULL;
+ const ptrdiff_t stride = 16;
+ const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2;
+ const unsigned mask = 4 * sw - 1;
+ memset(levels, 0, stride * (4 * sw + 2));
+ DECODE_COEFS_CLASS(TX_CLASS_V);
+ }
+#undef DECODE_COEFS_CLASS
+ default: assert(0);
+ }
+ } else { // dc-only
+ int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
+ dc_tok = 1 + tok_br;
+ if (dbg)
+ printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
+ t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
+ if (tok_br == 2) {
+ dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
+ if (dbg)
+ printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
+ imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
+ }
+ rc = 0;
+ }
+
+ // residual and sign
+ const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
+ const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
+ const int dq_shift = imax(0, t_dim->ctx - 2);
+ const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
+ unsigned cul_level, dc_sign_level;
+
+ if (!dc_tok) {
+ cul_level = 0;
+ dc_sign_level = 1 << 6;
+ if (qm_tbl) goto ac_qm;
+ goto ac_noqm;
+ }
+
+ const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
+ uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
+ const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
+ if (dbg)
+ printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
+ chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
+
+ int dc_dq = dq_tbl[0];
+ dc_sign_level = (dc_sign - 1) & (2 << 6);
+
+ if (qm_tbl) {
+ dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
+
+ if (dc_tok == 15) {
+ dc_tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-dc_residual[%d->%d]: r=%d\n",
+ dc_tok - 15, dc_tok, ts->msac.rng);
+
+ dc_tok &= 0xfffff;
+ dc_dq = (dc_dq * dc_tok) & 0xffffff;
+ } else {
+ dc_dq *= dc_tok;
+ assert(dc_dq <= 0xffffff);
+ }
+ cul_level = dc_tok;
+ dc_dq >>= dq_shift;
+ dc_dq = umin(dc_dq, cf_max + dc_sign);
+ cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
+
+ if (rc) ac_qm: {
+ const unsigned ac_dq = dq_tbl[1];
+ do {
+ const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
+ if (dbg)
+ printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
+ const unsigned rc_tok = cf[rc];
+ unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
+ int dq_sat;
+
+ if (rc_tok >= (15 << 11)) {
+ tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-residual[%d=%d->%d]: r=%d\n",
+ rc, tok - 15, tok, ts->msac.rng);
+
+ tok &= 0xfffff;
+ dq = (dq * tok) & 0xffffff;
+ } else {
+ tok = rc_tok >> 11;
+ dq *= tok;
+ assert(dq <= 0xffffff);
+ }
+ cul_level += tok;
+ dq >>= dq_shift;
+ dq_sat = umin(dq, cf_max + sign);
+ cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
+
+ rc = rc_tok & 0x3ff;
+ } while (rc);
+ }
+ } else {
+ // non-qmatrix is the common case and allows for additional optimizations
+ if (dc_tok == 15) {
+ dc_tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-dc_residual[%d->%d]: r=%d\n",
+ dc_tok - 15, dc_tok, ts->msac.rng);
+
+ dc_tok &= 0xfffff;
+ dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
+ dc_dq = umin(dc_dq, cf_max + dc_sign);
+ } else {
+ dc_dq = ((dc_dq * dc_tok) >> dq_shift);
+ assert(dc_dq <= cf_max);
+ }
+ cul_level = dc_tok;
+ cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
+
+ if (rc) ac_noqm: {
+ const unsigned ac_dq = dq_tbl[1];
+ do {
+ const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
+ if (dbg)
+ printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
+ const unsigned rc_tok = cf[rc];
+ unsigned tok;
+ int dq;
+
+ // residual
+ if (rc_tok >= (15 << 11)) {
+ tok = read_golomb(&ts->msac) + 15;
+ if (dbg)
+ printf("Post-residual[%d=%d->%d]: r=%d\n",
+ rc, tok - 15, tok, ts->msac.rng);
+
+ // coefficient parsing, see 5.11.39
+ tok &= 0xfffff;
+
+ // dequant, see 7.12.3
+ dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
+ dq = umin(dq, cf_max + sign);
+ } else {
+ // cannot exceed cf_max, so we can avoid the clipping
+ tok = rc_tok >> 11;
+ dq = ((ac_dq * tok) >> dq_shift);
+ assert(dq <= cf_max);
+ }
+ cul_level += tok;
+ cf[rc] = (coef) (sign ? -dq : dq);
+
+ rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
+ } while (rc);
+ }
+ }
+
+ // context
+ *res_ctx = umin(cul_level, 63) | dc_sign_level;
+
+ return eob;
+}
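Between the token pass and the dequantization pass, decode_coefs() reuses cf[] as a transient singly-linked list over the non-zero positions: each entry packs the base token into the high bits and the scan index of the next non-zero coefficient into the low bits, so the dequant loops can walk only the non-zero entries (a stored next position of 0 ends the walk, and the DC coefficient is handled separately). A sketch of the packing implied by the shifts and masks above:

    #include <stdint.h>

    /* Illustration of the transient cf[] encoding in decode_coefs();
     * field widths follow the tok << 11 / rc & 0x3ff code above. */
    static inline uint32_t pack_cf(const unsigned tok, const unsigned next_rc) {
        return (tok << 11) | next_rc;
    }
    static inline unsigned cf_tok(const uint32_t v)     { return v >> 11; }
    static inline unsigned cf_next_rc(const uint32_t v) { return v & 0x3ff; }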
+
+static void read_coef_tree(Dav1dTaskContext *const t,
+ const enum BlockSize bs, const Av1Block *const b,
+ const enum RectTxfmSize ytx, const int depth,
+ const uint16_t *const tx_split,
+ const int x_off, const int y_off, pixel *dst)
+{
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
+ const int txw = t_dim->w, txh = t_dim->h;
+
+    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
+     * be split. Avoids an undefined left shift. */
+ if (depth < 2 && tx_split[depth] &&
+ tx_split[depth] & (1 << (y_off * 4 + x_off)))
+ {
+ const enum RectTxfmSize sub = t_dim->sub;
+ const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+ const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+ x_off * 2 + 0, y_off * 2 + 0, dst);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+ y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
+ t->bx -= txsw;
+ t->by += txsh;
+ if (txh >= txw && t->by < f->bh) {
+ if (dst)
+ dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+ x_off * 2 + 0, y_off * 2 + 1, dst);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+ y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
+ t->bx -= txsw;
+ }
+ t->by -= txsh;
+ } else {
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ enum TxfmType txtp;
+ uint8_t cf_ctx;
+ int eob;
+ coef *cf;
+ struct CodedBlockInfo *cbi;
+
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].cf);
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+ cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ } else {
+ cf = bitfn(t->cf);
+ }
+ if (t->frame_thread.pass != 2) {
+ eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
+ ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ ytx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
+ case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ for (int y = 0; y < txh; y++) { \
+ rep_macro(type, txtp_map, 0, mul * txtp); \
+ txtp_map += 32; \
+ }
+ uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
+ case_set_upto16(txw,,,);
+#undef set_ctx
+ if (t->frame_thread.pass == 1) {
+ cbi->eob[0] = eob;
+ cbi->txtp[0] = txtp;
+ }
+ } else {
+ eob = cbi->eob[0];
+ txtp = cbi->txtp[0];
+ }
+ if (!(t->frame_thread.pass & 1)) {
+ assert(dst);
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
+ dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
+ HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
+ }
+ }
+ }
+}
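read_coef_tree() above walks the recursive transform split: tx_split holds one 16-bit mask per split depth, and bit (y_off * 4 + x_off) says whether the transform at that position inside the block is split further. The test at the top of the function, written out on its own:

    #include <stdint.h>

    /* Equivalent of the split test in read_coef_tree(): only two split
     * depths exist, and each mask is indexed by the transform's 4x4
     * grid position inside the coding block. */
    static int tx_is_split(const uint16_t tx_split[2], const int depth,
                           const int x_off, const int y_off)
    {
        return depth < 2 && ((tx_split[depth] >> (y_off * 4 + x_off)) & 1);
    }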
+
+void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
+ const enum BlockSize bs, const Av1Block *const b)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+
+ if (b->skip) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+ rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ return;
+ }
+
+ Dav1dTileState *const ts = t->ts;
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+ assert(t->frame_thread.pass == 1);
+ assert(!b->skip);
+ const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
+ const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+
+ for (int init_y = 0; init_y < h4; init_y += 16) {
+ const int sub_h4 = imin(h4, 16 + init_y);
+ for (int init_x = 0; init_x < w4; init_x += 16) {
+ const int sub_w4 = imin(w4, init_x + 16);
+ int y_off = !!init_y, y, x;
+ for (y = init_y, t->by += init_y; y < sub_h4;
+ y += t_dim->h, t->by += t_dim->h, y_off++)
+ {
+ struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride];
+ int x_off = !!init_x;
+ for (x = init_x, t->bx += init_x; x < sub_w4;
+ x += t_dim->w, t->bx += t_dim->w, x_off++)
+ {
+ if (!b->intra) {
+ read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
+ x_off, y_off, NULL);
+ } else {
+ uint8_t cf_ctx = 0x40;
+ enum TxfmType txtp;
+ const int eob = cbi[t->bx].eob[0] =
+ decode_coefs(t, &t->a->lcoef[bx4 + x],
+ &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
+ 0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ b->tx, txtp, eob, ts->msac.rng);
+ cbi[t->bx].txtp[0] = txtp;
+ ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
+ l., 1, by4 + y);
+ case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
+ a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ }
+ t->bx -= x;
+ }
+ t->by -= y;
+
+ if (!has_chroma) continue;
+
+ const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+ const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+ for (int pl = 0; pl < 2; pl++) {
+ for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+ y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+ {
+ struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride];
+ for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+ x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+ {
+ uint8_t cf_ctx = 0x40;
+ enum TxfmType txtp;
+ if (!b->intra)
+ txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+ bx4 + (x << ss_hor)];
+ const int eob = cbi[t->bx].eob[1 + pl] =
+ decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
+ b, b->intra, 1 + pl, ts->frame_thread[1].cf,
+ &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng);
+ cbi[t->bx].txtp[1 + pl] = txtp;
+ ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ t->bx -= x << ss_hor;
+ }
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+}
+
+static int mc(Dav1dTaskContext *const t,
+ pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
+ const int bw4, const int bh4,
+ const int bx, const int by, const int pl,
+ const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
+ const enum Filter2d filter_2d)
+{
+ assert((dst8 != NULL) ^ (dst16 != NULL));
+ const Dav1dFrameContext *const f = t->f;
+ const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ const int mvx = mv.x, mvy = mv.y;
+ const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
+ ptrdiff_t ref_stride = refp->p.stride[!!pl];
+ const pixel *ref;
+
+ if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
+ const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
+ const int dy = by * v_mul + (mvy >> (3 + ss_ver));
+ int w, h;
+
+ if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
+ w = (f->cur.p.w + ss_hor) >> ss_hor;
+ h = (f->cur.p.h + ss_ver) >> ss_ver;
+ } else {
+ w = f->bw * 4 >> ss_hor;
+ h = f->bh * 4 >> ss_ver;
+ }
+ if (dx < !!mx * 3 || dy < !!my * 3 ||
+ dx + bw4 * h_mul + !!mx * 4 > w ||
+ dy + bh4 * v_mul + !!my * 4 > h)
+ {
+ pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+ f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
+ w, h, dx - !!mx * 3, dy - !!my * 3,
+ emu_edge_buf, 192 * sizeof(pixel),
+ refp->p.data[pl], ref_stride);
+ ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
+ ref_stride = 192 * sizeof(pixel);
+ } else {
+ ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+ }
+
+ if (dst8 != NULL) {
+ f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
+ bh4 * v_mul, mx << !ss_hor, my << !ss_ver
+ HIGHBD_CALL_SUFFIX);
+ } else {
+ f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
+ bh4 * v_mul, mx << !ss_hor, my << !ss_ver
+ HIGHBD_CALL_SUFFIX);
+ }
+ } else {
+ assert(refp != &f->sr_cur);
+
+ const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
+ const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
+#define scale_mv(res, val, scale) do { \
+ const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
+ res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \
+ } while (0)
+ int pos_y, pos_x;
+ scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
+ scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
+#undef scale_mv
+ const int left = pos_x >> 10;
+ const int top = pos_y >> 10;
+ const int right =
+ ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
+ const int bottom =
+ ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
+
+ if (DEBUG_BLOCK_INFO)
+ printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
+ left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
+ right-left, bottom-top,
+ f->svc[refidx][0].step, f->svc[refidx][1].step);
+
+ const int w = (refp->p.p.w + ss_hor) >> ss_hor;
+ const int h = (refp->p.p.h + ss_ver) >> ss_ver;
+ if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
+ pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+ f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
+ w, h, left - 3, top - 3,
+ emu_edge_buf, 320 * sizeof(pixel),
+ refp->p.data[pl], ref_stride);
+ ref = &emu_edge_buf[320 * 3 + 3];
+ ref_stride = 320 * sizeof(pixel);
+ if (DEBUG_BLOCK_INFO) printf("Emu\n");
+ } else {
+ ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
+ }
+
+ if (dst8 != NULL) {
+ f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
+ bw4 * h_mul, bh4 * v_mul,
+ pos_x & 0x3ff, pos_y & 0x3ff,
+ f->svc[refidx][0].step,
+ f->svc[refidx][1].step
+ HIGHBD_CALL_SUFFIX);
+ } else {
+ f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
+ bw4 * h_mul, bh4 * v_mul,
+ pos_x & 0x3ff, pos_y & 0x3ff,
+ f->svc[refidx][0].step,
+ f->svc[refidx][1].step
+ HIGHBD_CALL_SUFFIX);
+ }
+ }
+
+ return 0;
+}
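In the scaled-reference branch of mc() above, scale_mv() converts a position in 1/16-pel units of the current plane into 1/1024-pel units of the differently sized reference: f->svc[..].scale is a fixed-point ratio with 0x4000 meaning 1:1, the (scale - 0x4000) * 8 term and the trailing + 32 look like half-sample alignment offsets, and the result is rounded to nearest with the sign restored afterwards. The integer sample position is then pos >> 10 and the 10-bit subpel phase pos & 0x3ff. A standalone rendering of that macro follows; the interpretation of the constants is my reading of the code, not upstream documentation.

    #include <stdint.h>
    #include <stdlib.h>

    /* Standalone version of the scale_mv() macro in mc(). val is in
     * 1/16-pel units of the current plane, scale is the 0x4000-based
     * ratio; the result is in 1/1024-pel units of the reference. */
    static int scale_pos(const int val, const int scale) {
        const int64_t tmp = (int64_t) val * scale + (int64_t) (scale - 0x4000) * 8;
        const int64_t mag = (llabs(tmp) + 128) >> 8;   /* round |tmp| / 256 to nearest */
        return (int) (tmp < 0 ? -mag : mag) + 32;      /* restore sign, apply offset */
    }

    /* usage:
     *   const int pos   = scale_pos(orig_pos_x, scale);
     *   const int left  = pos >> 10;    // integer sample position
     *   const int phase = pos & 0x3ff;  // subpel phase fed to mc_scaled */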
+
+static int obmc(Dav1dTaskContext *const t,
+ pixel *const dst, const ptrdiff_t dst_stride,
+ const uint8_t *const b_dim, const int pl,
+ const int bx4, const int by4, const int w4, const int h4)
+{
+ assert(!(t->bx & 1) && !(t->by & 1));
+ const Dav1dFrameContext *const f = t->f;
+ /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
+ pixel *const lap = bitfn(t->scratch.lap);
+ const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ int res;
+
+ if (t->by > t->ts->tiling.row_start &&
+ (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+ {
+ for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+ const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+ const int step4 = iclip(a_b_dim[0], 2, 16);
+
+ if (a_r->ref.ref[0] > 0) {
+ const int ow4 = imin(step4, b_dim[0]);
+ const int oh4 = imin(b_dim[1], 16) >> 1;
+ res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
+ t->bx + x, t->by, pl, a_r->mv.mv[0],
+ &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
+ dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
+ if (res) return res;
+ f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
+ h_mul * ow4, v_mul * oh4);
+ i++;
+ }
+ x += step4;
+ }
+ }
+
+ if (t->bx > t->ts->tiling.col_start)
+ for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+ const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+ const int step4 = iclip(l_b_dim[1], 2, 16);
+
+ if (l_r->ref.ref[0] > 0) {
+ const int ow4 = imin(b_dim[0], 16) >> 1;
+ const int oh4 = imin(step4, b_dim[1]);
+ res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
+ t->bx, t->by + y, pl, l_r->mv.mv[0],
+ &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
+ dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
+ if (res) return res;
+ f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
+ dst_stride, lap, h_mul * ow4, v_mul * oh4);
+ i++;
+ }
+ y += step4;
+ }
+ return 0;
+}
+
+static int warp_affine(Dav1dTaskContext *const t,
+ pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
+ const uint8_t *const b_dim, const int pl,
+ const Dav1dThreadPicture *const refp,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ assert((dst8 != NULL) ^ (dst16 != NULL));
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+ const int32_t *const mat = wmp->matrix;
+ const int width = (refp->p.p.w + ss_hor) >> ss_hor;
+ const int height = (refp->p.p.h + ss_ver) >> ss_ver;
+
+ for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+ const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+ const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
+ const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
+ for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
+ // calculate transformation relative to center of 8x8 block in
+ // luma pixel units
+ const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+ const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
+ const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
+
+ const int dx = (int) (mvx >> 16) - 4;
+ const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
+ wmp->u.p.beta * 7) & ~0x3f;
+ const int dy = (int) (mvy >> 16) - 4;
+ const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
+ wmp->u.p.delta * 4) & ~0x3f;
+
+ const pixel *ref_ptr;
+ ptrdiff_t ref_stride = refp->p.stride[!!pl];
+
+ if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
+ pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
+ f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
+ emu_edge_buf, 32 * sizeof(pixel),
+ refp->p.data[pl], ref_stride);
+ ref_ptr = &emu_edge_buf[32 * 3 + 3];
+ ref_stride = 32 * sizeof(pixel);
+ } else {
+ ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+ }
+ if (dst16 != NULL)
+ dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
+ wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
+ else
+ dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
+ wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
+ }
+ if (dst8) dst8 += 8 * PXSTRIDE(dstride);
+ else dst16 += 8 * dstride;
+ }
+ return 0;
+}
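warp_affine() above evaluates the 16.16 fixed-point affine model once per 8x8 output block, at the block centre (the + 4 folded into src_x/src_y): the integer part of the result, with that centre offset removed again, selects the 15x15 source window required by the 8-tap filters (hence the dx < 3 and dx + 8 + 4 > width checks), while the fractional part, corrected by alpha/beta/gamma/delta, becomes the per-block filter phase. A reduced sketch of the position computation, leaving out the chroma subsampling shifts and the alpha/beta/gamma/delta correction:

    #include <stdint.h>

    typedef struct { int dx, dy, mx, my; } WarpPos;

    /* Reduced sketch of the per-8x8 setup in warp_affine(). mat[] is
     * the 16.16 affine matrix, (src_x, src_y) the luma-pel centre of
     * the 8x8 block. dx/dy are integer source coordinates, mx/my the
     * raw 16-bit fractional phases (before the correction applied by
     * the real code). */
    static WarpPos warp_block_pos(const int32_t mat[6],
                                  const int src_x, const int src_y)
    {
        const int64_t mvx = (int64_t) mat[2] * src_x + (int64_t) mat[3] * src_y + mat[0];
        const int64_t mvy = (int64_t) mat[4] * src_x + (int64_t) mat[5] * src_y + mat[1];
        const WarpPos p = {
            .dx = (int) (mvx >> 16) - 4,
            .dy = (int) (mvy >> 16) - 4,
            .mx = (int) mvx & 0xffff,
            .my = (int) mvy & 0xffff,
        };
        return p;
    }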
+
+void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
+ const enum EdgeFlags intra_edge_flags,
+ const Av1Block *const b)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
+ const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+
+ // coefficient coding
+ pixel *const edge = bitfn(t->scratch.edge) + 128;
+ const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+
+ const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
+
+ for (int init_y = 0; init_y < h4; init_y += 16) {
+ const int sub_h4 = imin(h4, 16 + init_y);
+ const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+ for (int init_x = 0; init_x < w4; init_x += 16) {
+ if (b->pal_sz[0]) {
+ pixel *dst = ((pixel *) f->cur.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
+ const uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
+ } else {
+ pal_idx = t->scratch.pal_idx;
+ }
+ const uint16_t *const pal = t->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
+ f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
+ pal_idx, bw4 * 4, bh4 * 4);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
+ bw4 * 4, bh4 * 4, "y-pal-pred");
+ }
+
+ const int intra_flags = (sm_flag(t->a, bx4) |
+ sm_flag(&t->l, by4) |
+ intra_edge_filter_flag);
+ const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
+ intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
+ const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
+ intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
+ int y, x;
+ const int sub_w4 = imin(w4, init_x + 16);
+ for (y = init_y, t->by += init_y; y < sub_h4;
+ y += t_dim->h, t->by += t_dim->h)
+ {
+ pixel *dst = ((pixel *) f->cur.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
+ t->bx + init_x);
+ for (x = init_x, t->bx += init_x; x < sub_w4;
+ x += t_dim->w, t->bx += t_dim->w)
+ {
+ if (b->pal_sz[0]) goto skip_y_pred;
+
+ int angle = b->y_angle;
+ const enum EdgeFlags edge_flags =
+ (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
+ 0 : EDGE_I444_TOP_HAS_RIGHT) |
+ ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
+ 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[0];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(t->bx,
+ t->bx > ts->tiling.col_start,
+ t->by,
+ t->by > ts->tiling.row_start,
+ ts->tiling.col_end,
+ ts->tiling.row_end,
+ edge_flags, dst,
+ f->cur.stride[0], top_sb_edge,
+ b->y_mode, &angle,
+ t_dim->w, t_dim->h,
+ f->seq_hdr->intra_edge_filter,
+ edge HIGHBD_CALL_SUFFIX);
+ dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
+ t_dim->w * 4, t_dim->h * 4,
+ angle | intra_flags,
+ 4 * f->bw - 4 * t->bx,
+ 4 * f->bh - 4 * t->by
+ HIGHBD_CALL_SUFFIX);
+
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
+ t_dim->h * 4, 2, "l");
+ hex_dump(edge, 0, 1, 1, "tl");
+ hex_dump(edge + 1, t_dim->w * 4,
+ t_dim->w * 4, 2, "t");
+ hex_dump(dst, f->cur.stride[0],
+ t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
+ }
+
+ skip_y_pred: {}
+ if (!b->skip) {
+ coef *cf;
+ int eob;
+ enum TxfmType txtp;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+ const struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ eob = cbi->eob[0];
+ txtp = cbi->txtp[0];
+ } else {
+ uint8_t cf_ctx;
+ cf = bitfn(t->cf);
+ eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
+ &t->l.lcoef[by4 + y], b->tx, bs,
+ b, 1, 0, cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ b->tx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir lcoef[off], cf_ctx, sz)
+ case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
+ l., 1, by4 + y);
+ case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
+ a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, imin(t_dim->h, 8) * 4,
+ imin(t_dim->w, 8) * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->tx]
+ [txtp](dst,
+ f->cur.stride[0],
+ cf, eob HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, f->cur.stride[0],
+ t_dim->w * 4, t_dim->h * 4, "recon");
+ }
+ } else if (!t->frame_thread.pass) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set_upto16(t_dim->h, l., 1, by4 + y);
+ case_set_upto16(t_dim->w, a->, 0, bx4 + x);
+#undef set_ctx
+ }
+ dst += 4 * t_dim->w;
+ }
+ t->bx -= x;
+ }
+ t->by -= y;
+
+ if (!has_chroma) continue;
+
+ const ptrdiff_t stride = f->cur.stride[1];
+
+ if (b->uv_mode == CFL_PRED) {
+ assert(!init_x && !init_y);
+
+ int16_t *const ac = t->scratch.ac;
+ pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
+ 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
+ const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
+ (t->by >> ss_ver) * PXSTRIDE(stride));
+ pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
+ ((pixel *) f->cur.data[2]) + uv_off };
+
+ const int furthest_r =
+ ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
+ const int furthest_b =
+ ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
+ dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
+ cbw4 - (furthest_r >> ss_hor),
+ cbh4 - (furthest_b >> ss_ver),
+ cbw4 * 4, cbh4 * 4);
+ for (int pl = 0; pl < 2; pl++) {
+ if (!b->cfl_alpha[pl]) continue;
+ int angle = 0;
+ const pixel *top_sb_edge = NULL;
+ if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[pl + 1];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+ const int xstart = ts->tiling.col_start >> ss_hor;
+ const int ystart = ts->tiling.row_start >> ss_ver;
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+ ypos, ypos > ystart,
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ 0, uv_dst[pl], stride,
+ top_sb_edge, DC_PRED, &angle,
+ uv_t_dim->w, uv_t_dim->h, 0,
+ edge HIGHBD_CALL_SUFFIX);
+ dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
+ uv_t_dim->w * 4,
+ uv_t_dim->h * 4,
+ ac, b->cfl_alpha[pl]
+ HIGHBD_CALL_SUFFIX);
+ }
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
+ hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
+ hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
+ }
+ } else if (b->pal_sz[1]) {
+ const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
+ (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
+ const uint16_t (*pal)[8];
+ const uint8_t *pal_idx;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))];
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
+ } else {
+ pal = t->scratch.pal;
+ pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+ }
+
+ f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
+ f->cur.stride[1], pal[1],
+ pal_idx, cbw4 * 4, cbh4 * 4);
+ f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
+ f->cur.stride[1], pal[2],
+ pal_idx, cbw4 * 4, cbh4 * 4);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
+ PXSTRIDE(f->cur.stride[1]),
+ cbw4 * 4, cbh4 * 4, "u-pal-pred");
+ hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
+ PXSTRIDE(f->cur.stride[1]),
+ cbw4 * 4, cbh4 * 4, "v-pal-pred");
+ }
+ }
+
+ const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
+ sm_uv_flag(&t->l, cby4);
+ const int uv_sb_has_tr =
+ ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
+ intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
+ const int uv_sb_has_bl =
+ init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
+ intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
+ const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+ for (int pl = 0; pl < 2; pl++) {
+ for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+ y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+ {
+ pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
+ 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
+ ((t->bx + init_x) >> ss_hor));
+ for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+ x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+ {
+ if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
+ b->pal_sz[1])
+ {
+ goto skip_uv_pred;
+ }
+
+ int angle = b->uv_angle;
+ // this probably looks weird because we're using
+ // luma flags in a chroma loop, but that's because
+ // prepare_intra_edges() expects luma flags as input
+ const enum EdgeFlags edge_flags =
+ (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
+ (x + uv_t_dim->w >= sub_cw4)) ?
+ 0 : EDGE_I444_TOP_HAS_RIGHT) |
+ ((x > (init_x >> ss_hor) ||
+ (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
+ 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+ const pixel *top_sb_edge = NULL;
+ if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[1 + pl];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const enum IntraPredMode uv_mode =
+ b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
+ const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+ const int xstart = ts->tiling.col_start >> ss_hor;
+ const int ystart = ts->tiling.row_start >> ss_ver;
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+ ypos, ypos > ystart,
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ edge_flags, dst, stride,
+ top_sb_edge, uv_mode,
+ &angle, uv_t_dim->w,
+ uv_t_dim->h,
+ f->seq_hdr->intra_edge_filter,
+ edge HIGHBD_CALL_SUFFIX);
+ angle |= intra_edge_filter_flag;
+ dsp->ipred.intra_pred[m](dst, stride, edge,
+ uv_t_dim->w * 4,
+ uv_t_dim->h * 4,
+ angle | sm_uv_fl,
+ (4 * f->bw + ss_hor -
+ 4 * (t->bx & ~ss_hor)) >> ss_hor,
+ (4 * f->bh + ss_ver -
+ 4 * (t->by & ~ss_ver)) >> ss_ver
+ HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
+ uv_t_dim->h * 4, 2, "l");
+ hex_dump(edge, 0, 1, 1, "tl");
+ hex_dump(edge + 1, uv_t_dim->w * 4,
+ uv_t_dim->w * 4, 2, "t");
+ hex_dump(dst, stride, uv_t_dim->w * 4,
+ uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
+ }
+
+ skip_uv_pred: {}
+ if (!b->skip) {
+ enum TxfmType txtp;
+ int eob;
+ coef *cf;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
+ const struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ eob = cbi->eob[pl + 1];
+ txtp = cbi->txtp[pl + 1];
+ } else {
+ uint8_t cf_ctx;
+ cf = bitfn(t->cf);
+ eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y],
+ b->uvtx, bs, b, 1, 1 + pl, cf,
+ &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, uv_t_dim->h * 4,
+ uv_t_dim->w * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->uvtx]
+ [txtp](dst, stride,
+ cf, eob HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, stride, uv_t_dim->w * 4,
+ uv_t_dim->h * 4, "recon");
+ }
+ } else if (!t->frame_thread.pass) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
+ case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
+ case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
+#undef set_ctx
+ }
+ dst += uv_t_dim->w * 4;
+ }
+ t->bx -= x << ss_hor;
+ }
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+}
+
+int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
+ const Av1Block *const b)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+ const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
+ DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
+ int res;
+
+ // prediction
+ const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
+ pixel *dst = ((pixel *) f->cur.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
+ const ptrdiff_t uvdstoff =
+ 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
+ if (IS_KEY_OR_INTRA(f->frame_hdr)) {
+ // intrabc
+ assert(!f->frame_hdr->super_res.enabled);
+ res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
+ b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
+ if (res) return res;
+ if (has_chroma) for (int pl = 1; pl < 3; pl++) {
+ res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
+ bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+ t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
+ &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
+ if (res) return res;
+ }
+ } else if (b->comp_type == COMP_INTER_NONE) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
+ const enum Filter2d filter_2d = b->filter2d;
+
+ if (imin(bw4, bh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ if (res) return res;
+ } else {
+ res = mc(t, dst, NULL, f->cur.stride[0],
+ bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
+ if (res) return res;
+ if (b->motion_mode == MM_OBMC) {
+ res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
+ if (res) return res;
+ }
+ }
+ if (b->interintra_type) {
+ pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
+ enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
+ SMOOTH_PRED : b->interintra_mode;
+ pixel *const tmp = bitfn(t->scratch.interintra);
+ int angle = 0;
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[0];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
+ t->by, t->by > ts->tiling.row_start,
+ ts->tiling.col_end, ts->tiling.row_end,
+ 0, dst, f->cur.stride[0], top_sb_edge,
+ m, &angle, bw4, bh4, 0, tl_edge
+ HIGHBD_CALL_SUFFIX);
+ dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
+ tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
+ HIGHBD_CALL_SUFFIX);
+ const uint8_t *const ii_mask =
+ b->interintra_type == INTER_INTRA_BLEND ?
+ dav1d_ii_masks[bs][0][b->interintra_mode] :
+ dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+ dsp->mc.blend(dst, f->cur.stride[0], tmp,
+ bw4 * 4, bh4 * 4, ii_mask);
+ }
+
+ if (!has_chroma) goto skip_inter_chroma_pred;
+
+ // sub8x8 derivation
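+        // For 4-wide or 4-high luma blocks with subsampled chroma, a single
+        // chroma 8x8 also covers the neighbouring luma block(s) above/left.
+        // The checks below keep is_sub8x8 set only when those neighbours are
+        // inter (ref > 0); their MVs are then reused for that part of the
+        // chroma block further down.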
+ int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+ refmvs_block *const *r;
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ r = &t->rt.r[(t->by & 31) + 5];
+ if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
+ if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
+ if (bw4 == 1 && bh4 == ss_ver)
+ is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
+ }
+
+ // chroma prediction
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ ptrdiff_t h_off = 0, v_off = 0;
+ if (bw4 == 1 && bh4 == ss_ver) {
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+ NULL, f->cur.stride[1],
+ bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
+ r[-1][t->bx - 1].mv.mv[0],
+ &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
+ r[-1][t->bx - 1].ref.ref[0] - 1,
+ t->frame_thread.pass != 2 ? t->tl_4x4_filter :
+ f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+ if (res) return res;
+ }
+ v_off = 2 * PXSTRIDE(f->cur.stride[1]);
+ h_off = 2;
+ }
+ if (bw4 == 1) {
+ const enum Filter2d left_filter_2d =
+ dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
+ f->cur.stride[1], bw4, bh4, t->bx - 1,
+ t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
+ &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
+ r[0][t->bx - 1].ref.ref[0] - 1,
+ t->frame_thread.pass != 2 ? left_filter_2d :
+ f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+ if (res) return res;
+ }
+ h_off = 2;
+ }
+ if (bh4 == ss_ver) {
+ const enum Filter2d top_filter_2d =
+ dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
+ f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
+ 1 + pl, r[-1][t->bx].mv.mv[0],
+ &f->refp[r[-1][t->bx].ref.ref[0] - 1],
+ r[-1][t->bx].ref.ref[0] - 1,
+ t->frame_thread.pass != 2 ? top_filter_2d :
+ f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+ if (res) return res;
+ }
+ v_off = 2 * PXSTRIDE(f->cur.stride[1]);
+ }
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
+ bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
+ refp, b->ref[0], filter_2d);
+ if (res) return res;
+ }
+ } else {
+ if (imin(cbw4, cbh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ for (int pl = 0; pl < 2; pl++) {
+ res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
+ f->cur.stride[1], b_dim, 1 + pl, refp,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ if (res) return res;
+ }
+ } else {
+ for (int pl = 0; pl < 2; pl++) {
+ res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+ NULL, f->cur.stride[1],
+ bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+ t->bx & ~ss_hor, t->by & ~ss_ver,
+ 1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
+ if (res) return res;
+ if (b->motion_mode == MM_OBMC) {
+ res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
+ f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+ if (res) return res;
+ }
+ }
+ }
+ if (b->interintra_type) {
+ // FIXME for 8x32 with 4:2:2 subsampling, this probably does
+ // the wrong thing since it will select 4x16, not 4x32, as a
+ // transform size...
+ const uint8_t *const ii_mask =
+ b->interintra_type == INTER_INTRA_BLEND ?
+ dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
+ dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
+
+ for (int pl = 0; pl < 2; pl++) {
+ pixel *const tmp = bitfn(t->scratch.interintra);
+ pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
+ enum IntraPredMode m =
+ b->interintra_mode == II_SMOOTH_PRED ?
+ SMOOTH_PRED : b->interintra_mode;
+ int angle = 0;
+ pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[pl + 1];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
+ (t->bx >> ss_hor) >
+ (ts->tiling.col_start >> ss_hor),
+ t->by >> ss_ver,
+ (t->by >> ss_ver) >
+ (ts->tiling.row_start >> ss_ver),
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ 0, uvdst, f->cur.stride[1],
+ top_sb_edge, m,
+ &angle, cbw4, cbh4, 0, tl_edge
+ HIGHBD_CALL_SUFFIX);
+ dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
+ tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
+ HIGHBD_CALL_SUFFIX);
+ dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
+ cbw4 * 4, cbh4 * 4, ii_mask);
+ }
+ }
+ }
+
+ skip_inter_chroma_pred: {}
+ t->tl_4x4_filter = filter_2d;
+ } else {
+ const enum Filter2d filter_2d = b->filter2d;
+ // Maximum super block size is 128x128
+ int16_t (*tmp)[128 * 128] = t->scratch.compinter;
+ int jnt_weight;
+ uint8_t *const seg_mask = t->scratch.seg_mask;
+ const uint8_t *mask;
+
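+        // Compound prediction: each reference is first predicted into a 16-bit
+        // intermediate buffer (tmp[0]/tmp[1]); the two are then combined below
+        // by plain or weighted averaging, or blended with a segmentation
+        // (COMP_INTER_SEG) or wedge-shaped mask.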
+ for (int i = 0; i < 2; i++) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+
+ if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
+ res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ if (res) return res;
+ } else {
+ res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
+ b->mv[i], refp, b->ref[i], filter_2d);
+ if (res) return res;
+ }
+ }
+ switch (b->comp_type) {
+ case COMP_INTER_AVG:
+ dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
+ bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_WEIGHTED_AVG:
+ jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
+ dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
+ bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_SEG:
+ dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4, bh4 * 4, seg_mask,
+ b->mask_sign HIGHBD_CALL_SUFFIX);
+ mask = seg_mask;
+ break;
+ case COMP_INTER_WEDGE:
+ mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+ dsp->mc.mask(dst, f->cur.stride[0],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
+ if (has_chroma)
+ mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
+ break;
+ }
+
+ // chroma
+ if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+ for (int i = 0; i < 2; i++) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+ if (b->inter_mode == GLOBALMV_GLOBALMV &&
+ imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
+ {
+ res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
+ b_dim, 1 + pl,
+ refp, &f->frame_hdr->gmv[b->ref[i]]);
+ if (res) return res;
+ } else {
+ res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
+ 1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
+ if (res) return res;
+ }
+ }
+ pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
+ switch (b->comp_type) {
+ case COMP_INTER_AVG:
+ dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
+ HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_WEIGHTED_AVG:
+ dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
+ HIGHBD_CALL_SUFFIX);
+ break;
+ case COMP_INTER_WEDGE:
+ case COMP_INTER_SEG:
+ dsp->mc.mask(uvdst, f->cur.stride[1],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
+ HIGHBD_CALL_SUFFIX);
+ break;
+ }
+ }
+ }
+
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
+ if (has_chroma) {
+ hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
+ cbw4 * 4, cbh4 * 4, "u-pred");
+ hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
+ cbw4 * 4, cbh4 * 4, "v-pred");
+ }
+ }
+
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+ if (b->skip) {
+ // reset coef contexts
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir lcoef, off, mul * 0x40)
+ case_set(bh4, l., 1, by4);
+ case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+ if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+ rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+ case_set(cbh4, l., 1, cby4);
+ case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+ }
+ return 0;
+ }
+
+ const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
+ const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+ const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
+
+ for (int init_y = 0; init_y < bh4; init_y += 16) {
+ for (int init_x = 0; init_x < bw4; init_x += 16) {
+ // coefficient coding & inverse transforms
+ int y_off = !!init_y, y;
+ dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
+ for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
+ y += ytx->h, y_off++)
+ {
+ int x, x_off = !!init_x;
+ for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
+ x += ytx->w, x_off++)
+ {
+ read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
+ x_off, y_off, &dst[x * 4]);
+ t->bx += ytx->w;
+ }
+ dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
+ t->bx -= x;
+ t->by += ytx->h;
+ }
+ dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
+ t->by -= y;
+
+ // chroma coefs and inverse transform
+ if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+ pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
+ (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
+ for (y = init_y >> ss_ver, t->by += init_y;
+ y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
+ {
+ int x;
+ for (x = init_x >> ss_hor, t->bx += init_x;
+ x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
+ {
+ coef *cf;
+ int eob;
+ enum TxfmType txtp;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ cf = ts->frame_thread[p].cf;
+ ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
+ const struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ eob = cbi->eob[1 + pl];
+ txtp = cbi->txtp[1 + pl];
+ } else {
+ uint8_t cf_ctx;
+ cf = bitfn(t->cf);
+ txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+ bx4 + (x << ss_hor)];
+ eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y],
+ b->uvtx, bs, b, 0, 1 + pl,
+ cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+ memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+ case_set_upto16_with_default( \
+ imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
+ l., 1, cby4 + y);
+ case_set_upto16_with_default( \
+ imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+ a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->uvtx]
+ [txtp](&uvdst[4 * x],
+ f->cur.stride[1],
+ cf, eob HIGHBD_CALL_SUFFIX);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(&uvdst[4 * x], f->cur.stride[1],
+ uvtx->w * 4, uvtx->h * 4, "recon");
+ }
+ t->bx += uvtx->w << ss_hor;
+ }
+ uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
+ t->bx -= x << ss_hor;
+ t->by += uvtx->h << ss_ver;
+ }
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+ return 0;
+}
+
+void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
+ if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
+ (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
+ {
+ return;
+ }
+ const int y = sby * f->sb_step * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+ bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
+ f->lf.start_of_tile_row[sby]);
+}
+
+void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
+ const int y = sby * f->sb_step * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+ if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
+ (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
+ {
+ bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
+ }
+ if (f->seq_hdr->cdef || f->lf.restore_planes) {
+ // Store loop filtered pixels required by CDEF / LR
+ bytefn(dav1d_copy_lpf)(f, p, sby);
+ }
+}
+
+void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
+ const Dav1dFrameContext *const f = tc->f;
+ if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
+ const int sbsz = f->sb_step;
+ const int y = sby * sbsz * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
+ Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+ const int start = sby * sbsz;
+ if (sby) {
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *p_up[3] = {
+ p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
+ p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ };
+ bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
+ }
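+    // All but the last two block rows of this sbrow are filtered here; those
+    // rows are picked up by the p_up pass above when this function runs for
+    // the next sbrow (the final sbrow, having no successor, is filtered fully).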
+ const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
+ const int end = imin(start + n_blks, f->bh);
+ bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
+}
+
+void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
+ const int sbsz = f->sb_step;
+ const int y = sby * sbsz * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ pixel *const sr_p[3] = {
+ f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
+ f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
+ f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
+ };
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
+ const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int h_start = 8 * !!sby >> ss_ver;
+ const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
+ pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
+ const ptrdiff_t src_stride = f->cur.stride[!!pl];
+ const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
+ const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
+ const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
+ const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
+
+ f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
+ imin(img_h, h_end) + h_start, src_w,
+ f->resize_step[!!pl], f->resize_start[!!pl]
+ HIGHBD_CALL_SUFFIX);
+ }
+}
+
+void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
+ if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
+ const int y = sby * f->sb_step * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ pixel *const sr_p[3] = {
+ f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
+ f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
+ f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
+ };
+ bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
+}
+
+void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
+ bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
+ bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
+ if (f->seq_hdr->cdef)
+ bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
+ bytefn(dav1d_filter_sbrow_resize)(f, sby);
+ if (f->lf.restore_planes)
+ bytefn(dav1d_filter_sbrow_lr)(f, sby);
+}
+
+void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+ const int sby = t->by >> f->sb_shift;
+ const int sby_off = f->sb128w * 128 * sby;
+ const int x_off = ts->tiling.col_start;
+
+ const pixel *const y =
+ ((const pixel *) f->cur.data[0]) + x_off * 4 +
+ ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
+ pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
+ 4 * (ts->tiling.col_end - x_off));
+
+ if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+
+ const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
+ (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
+ for (int pl = 1; pl <= 2; pl++)
+ pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
+ &((const pixel *) f->cur.data[pl])[uv_off],
+ 4 * (ts->tiling.col_end - x_off) >> ss_hor);
+ }
+}
diff --git a/third_party/dav1d/src/ref.c b/third_party/dav1d/src/ref.c
new file mode 100644
index 0000000000..46462b4c80
--- /dev/null
+++ b/third_party/dav1d/src/ref.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/ref.h"
+
+static void default_free_callback(const uint8_t *const data, void *const user_data) {
+ assert(data == user_data);
+ dav1d_free_aligned(user_data);
+}
+
+Dav1dRef *dav1d_ref_create(size_t size) {
+ size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+ uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64);
+ if (!data) return NULL;
+
+ Dav1dRef *const res = (Dav1dRef*)(data + size);
+ res->const_data = res->user_data = res->data = data;
+ atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 0;
+ res->free_callback = default_free_callback;
+
+ return res;
+}
+
+static void pool_free_callback(const uint8_t *const data, void *const user_data) {
+ dav1d_mem_pool_push((Dav1dMemPool*)data, user_data);
+}
+
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) {
+ size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+ Dav1dMemPoolBuffer *const buf =
+ dav1d_mem_pool_pop(pool, size + sizeof(Dav1dRef));
+ if (!buf) return NULL;
+
+ Dav1dRef *const res = &((Dav1dRef*)buf)[-1];
+ res->data = buf->data;
+ res->const_data = pool;
+ atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 0;
+ res->free_callback = pool_free_callback;
+ res->user_data = buf;
+
+ return res;
+}
+
+Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
+ void (*free_callback)(const uint8_t *data, void *user_data),
+ void *const user_data)
+{
+ Dav1dRef *res = malloc(sizeof(Dav1dRef));
+ if (!res) return NULL;
+
+ res->data = NULL;
+ res->const_data = ptr;
+ atomic_init(&res->ref_cnt, 1);
+ res->free_ref = 1;
+ res->free_callback = free_callback;
+ res->user_data = user_data;
+
+ return res;
+}
+
+void dav1d_ref_dec(Dav1dRef **const pref) {
+ assert(pref != NULL);
+
+ Dav1dRef *const ref = *pref;
+ if (!ref) return;
+
+ *pref = NULL;
+ if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
+ const int free_ref = ref->free_ref;
+ ref->free_callback(ref->const_data, ref->user_data);
+ if (free_ref) free(ref);
+ }
+}
+
+int dav1d_ref_is_writable(Dav1dRef *const ref) {
+ return atomic_load(&ref->ref_cnt) == 1 && ref->data;
+}
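+
+/* Typical lifecycle of the helpers above (illustrative sketch only; size and
+ * variable names are arbitrary):
+ *
+ *     Dav1dRef *a = dav1d_ref_create(4096); // ref_cnt == 1, data 64-byte aligned
+ *     Dav1dRef *b = a;
+ *     dav1d_ref_inc(b);                     // second holder: ref_cnt == 2
+ *     // dav1d_ref_is_writable(a) == 0 while more than one reference exists
+ *     dav1d_ref_dec(&a);                    // ref_cnt -> 1, a is set to NULL
+ *     dav1d_ref_dec(&b);                    // ref_cnt -> 0, free callback runs
+ *
+ * dav1d_ref_inc() is declared in ref.h; dav1d_ref_create() returns NULL on
+ * allocation failure. */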
diff --git a/third_party/dav1d/src/ref.h b/third_party/dav1d/src/ref.h
new file mode 100644
index 0000000000..ec070a0a9a
--- /dev/null
+++ b/third_party/dav1d/src/ref.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_REF_H
+#define DAV1D_SRC_REF_H
+
+#include "dav1d/dav1d.h"
+
+#include "src/mem.h"
+#include "src/thread.h"
+
+#include <stdatomic.h>
+#include <stddef.h>
+
+struct Dav1dRef {
+ void *data;
+ const void *const_data;
+ atomic_int ref_cnt;
+ int free_ref;
+ void (*free_callback)(const uint8_t *data, void *user_data);
+ void *user_data;
+};
+
+Dav1dRef *dav1d_ref_create(size_t size);
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
+Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr,
+ void (*free_callback)(const uint8_t *data, void *user_data),
+ void *user_data);
+void dav1d_ref_dec(Dav1dRef **ref);
+int dav1d_ref_is_writable(Dav1dRef *ref);
+
+static inline void dav1d_ref_inc(Dav1dRef *const ref) {
+ atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed);
+}
+
+#endif /* DAV1D_SRC_REF_H */
diff --git a/third_party/dav1d/src/refmvs.c b/third_party/dav1d/src/refmvs.c
new file mode 100644
index 0000000000..5398d396d1
--- /dev/null
+++ b/third_party/dav1d/src/refmvs.c
@@ -0,0 +1,940 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "dav1d/common.h"
+
+#include "common/intops.h"
+
+#include "src/env.h"
+#include "src/mem.h"
+#include "src/refmvs.h"
+
+static void add_spatial_candidate(refmvs_candidate *const mvstack, int *const cnt,
+ const int weight, const refmvs_block *const b,
+ const union refmvs_refpair ref, const mv gmv[2],
+ int *const have_newmv_match,
+ int *const have_refmv_match)
+{
+ if (b->mv.mv[0].n == INVALID_MV) return; // intra block, no intrabc
+
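+    // b->mf bit 0 marks a GLOBALMV block: its stored MV is replaced below by
+    // the global MV whenever the caller passed a valid gmv[]. Bit 1 marks a
+    // NEWMV block and feeds have_newmv_match.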
+ if (ref.ref[1] == -1) {
+ for (int n = 0; n < 2; n++) {
+ if (b->ref.ref[n] == ref.ref[0]) {
+ const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ?
+ gmv[0] : b->mv.mv[n];
+
+ *have_refmv_match = 1;
+ *have_newmv_match |= b->mf >> 1;
+
+ const int last = *cnt;
+ for (int m = 0; m < last; m++)
+ if (mvstack[m].mv.mv[0].n == cand_mv.n) {
+ mvstack[m].weight += weight;
+ return;
+ }
+
+ if (last < 8) {
+ mvstack[last].mv.mv[0] = cand_mv;
+ mvstack[last].weight = weight;
+ *cnt = last + 1;
+ }
+ return;
+ }
+ }
+ } else if (b->ref.pair == ref.pair) {
+ const refmvs_mvpair cand_mv = { .mv = {
+ [0] = ((b->mf & 1) && gmv[0].n != INVALID_MV) ? gmv[0] : b->mv.mv[0],
+ [1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1],
+ }};
+
+ *have_refmv_match = 1;
+ *have_newmv_match |= b->mf >> 1;
+
+ const int last = *cnt;
+ for (int n = 0; n < last; n++)
+ if (mvstack[n].mv.n == cand_mv.n) {
+ mvstack[n].weight += weight;
+ return;
+ }
+
+ if (last < 8) {
+ mvstack[last].mv = cand_mv;
+ mvstack[last].weight = weight;
+ *cnt = last + 1;
+ }
+ }
+}
+
+static int scan_row(refmvs_candidate *const mvstack, int *const cnt,
+ const union refmvs_refpair ref, const mv gmv[2],
+ const refmvs_block *b, const int bw4, const int w4,
+ const int max_rows, const int step,
+ int *const have_newmv_match, int *const have_refmv_match)
+{
+ const refmvs_block *cand_b = b;
+ const enum BlockSize first_cand_bs = cand_b->bs;
+ const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
+ int cand_bw4 = first_cand_b_dim[0];
+ int len = imax(step, imin(bw4, cand_bw4));
+
+ if (bw4 <= cand_bw4) {
+ // FIXME weight can be higher for odd blocks (bx4 & 1), but then the
+ // position of the first block has to be odd already, i.e. not just
+ // for row_offset=-3/-5
+ // FIXME why can this not be cand_bw4?
+ const int weight = bw4 == 1 ? 2 :
+ imax(2, imin(2 * max_rows, first_cand_b_dim[1]));
+ add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ return weight >> 1;
+ }
+
+ for (int x = 0;;) {
+ // FIXME if we overhang above, we could fill a bitmask so we don't have
+ // to repeat the add_spatial_candidate() for the next row, but just increase
+ // the weight here
+ add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ x += len;
+ if (x >= w4) return 1;
+ cand_b = &b[x];
+ cand_bw4 = dav1d_block_dimensions[cand_b->bs][0];
+ assert(cand_bw4 < bw4);
+ len = imax(step, cand_bw4);
+ }
+}
+
+static int scan_col(refmvs_candidate *const mvstack, int *const cnt,
+ const union refmvs_refpair ref, const mv gmv[2],
+ /*const*/ refmvs_block *const *b, const int bh4, const int h4,
+ const int bx4, const int max_cols, const int step,
+ int *const have_newmv_match, int *const have_refmv_match)
+{
+ const refmvs_block *cand_b = &b[0][bx4];
+ const enum BlockSize first_cand_bs = cand_b->bs;
+ const uint8_t *const first_cand_b_dim = dav1d_block_dimensions[first_cand_bs];
+ int cand_bh4 = first_cand_b_dim[1];
+ int len = imax(step, imin(bh4, cand_bh4));
+
+ if (bh4 <= cand_bh4) {
+ // FIXME weight can be higher for odd blocks (by4 & 1), but then the
+ // position of the first block has to be odd already, i.e. not just
+ // for col_offset=-3/-5
+ // FIXME why can this not be cand_bh4?
+ const int weight = bh4 == 1 ? 2 :
+ imax(2, imin(2 * max_cols, first_cand_b_dim[0]));
+ add_spatial_candidate(mvstack, cnt, len * weight, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ return weight >> 1;
+ }
+
+ for (int y = 0;;) {
+ // FIXME if we overhang above, we could fill a bitmask so we don't have
+ // to repeat the add_spatial_candidate() for the next row, but just increase
+ // the weight here
+ add_spatial_candidate(mvstack, cnt, len * 2, cand_b, ref, gmv,
+ have_newmv_match, have_refmv_match);
+ y += len;
+ if (y >= h4) return 1;
+ cand_b = &b[y][bx4];
+ cand_bh4 = dav1d_block_dimensions[cand_b->bs][1];
+ assert(cand_bh4 < bh4);
+ len = imax(step, cand_bh4);
+ }
+}
+
+static inline union mv mv_projection(const union mv mv, const int num, const int den) {
+ static const uint16_t div_mult[32] = {
+ 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
+ 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092,
+ 1024, 963, 910, 862, 819, 780, 744, 712,
+ 682, 655, 630, 606, 585, 564, 546, 528
+ };
+ assert(den > 0 && den < 32);
+ assert(num > -32 && num < 32);
+ const int frac = num * div_mult[den];
+ const int y = mv.y * frac, x = mv.x * frac;
+ // Round and clip according to AV1 spec section 7.9.3
+ return (union mv) { // 0x3fff == (1 << 14) - 1
+ .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
+ .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff)
+ };
+}
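+
+/* Worked example of the Q14 fixed-point scaling above (values are arbitrary):
+ * projecting by num/den = 2/3 uses frac = 2 * div_mult[3] = 2 * 5461 = 10922
+ * (div_mult[den] ~= 16384 / den). For mv = { .y = 24, .x = -64 } in 1/8-pel
+ * units:
+ *   y: (24 * 10922 + 8192 + 0) >> 14 = 16    (exact: 24 * 2/3)
+ *   x: (-64 * 10922 + 8192 - 1) >> 14 = -43  (nearest to -64 * 2/3 = -42.67)
+ * Both stay well inside the +/-0x3fff clip range. */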
+
+static void add_temporal_candidate(const refmvs_frame *const rf,
+ refmvs_candidate *const mvstack, int *const cnt,
+ const refmvs_temporal_block *const rb,
+ const union refmvs_refpair ref, int *const globalmv_ctx,
+ const union mv gmv[])
+{
+ if (rb->mv.n == INVALID_MV) return;
+
+ union mv mv = mv_projection(rb->mv, rf->pocdiff[ref.ref[0] - 1], rb->ref);
+ fix_mv_precision(rf->frm_hdr, &mv);
+
+ const int last = *cnt;
+ if (ref.ref[1] == -1) {
+ if (globalmv_ctx)
+ *globalmv_ctx = (abs(mv.x - gmv[0].x) | abs(mv.y - gmv[0].y)) >= 16;
+
+ for (int n = 0; n < last; n++)
+ if (mvstack[n].mv.mv[0].n == mv.n) {
+ mvstack[n].weight += 2;
+ return;
+ }
+ if (last < 8) {
+ mvstack[last].mv.mv[0] = mv;
+ mvstack[last].weight = 2;
+ *cnt = last + 1;
+ }
+ } else {
+ refmvs_mvpair mvp = { .mv = {
+ [0] = mv,
+ [1] = mv_projection(rb->mv, rf->pocdiff[ref.ref[1] - 1], rb->ref),
+ }};
+ fix_mv_precision(rf->frm_hdr, &mvp.mv[1]);
+
+ for (int n = 0; n < last; n++)
+ if (mvstack[n].mv.n == mvp.n) {
+ mvstack[n].weight += 2;
+ return;
+ }
+ if (last < 8) {
+ mvstack[last].mv = mvp;
+ mvstack[last].weight = 2;
+ *cnt = last + 1;
+ }
+ }
+}
+
+static void add_compound_extended_candidate(refmvs_candidate *const same,
+ int *const same_count,
+ const refmvs_block *const cand_b,
+ const int sign0, const int sign1,
+ const union refmvs_refpair ref,
+ const uint8_t *const sign_bias)
+{
+ refmvs_candidate *const diff = &same[2];
+ int *const diff_count = &same_count[2];
+
+ for (int n = 0; n < 2; n++) {
+ const int cand_ref = cand_b->ref.ref[n];
+
+ if (cand_ref <= 0) break;
+
+ mv cand_mv = cand_b->mv.mv[n];
+ if (cand_ref == ref.ref[0]) {
+ if (same_count[0] < 2)
+ same[same_count[0]++].mv.mv[0] = cand_mv;
+ if (diff_count[1] < 2) {
+ if (sign1 ^ sign_bias[cand_ref - 1]) {
+ cand_mv.y = -cand_mv.y;
+ cand_mv.x = -cand_mv.x;
+ }
+ diff[diff_count[1]++].mv.mv[1] = cand_mv;
+ }
+ } else if (cand_ref == ref.ref[1]) {
+ if (same_count[1] < 2)
+ same[same_count[1]++].mv.mv[1] = cand_mv;
+ if (diff_count[0] < 2) {
+ if (sign0 ^ sign_bias[cand_ref - 1]) {
+ cand_mv.y = -cand_mv.y;
+ cand_mv.x = -cand_mv.x;
+ }
+ diff[diff_count[0]++].mv.mv[0] = cand_mv;
+ }
+ } else {
+ mv i_cand_mv = (union mv) {
+ .x = -cand_mv.x,
+ .y = -cand_mv.y
+ };
+
+ if (diff_count[0] < 2) {
+ diff[diff_count[0]++].mv.mv[0] =
+ sign0 ^ sign_bias[cand_ref - 1] ?
+ i_cand_mv : cand_mv;
+ }
+
+ if (diff_count[1] < 2) {
+ diff[diff_count[1]++].mv.mv[1] =
+ sign1 ^ sign_bias[cand_ref - 1] ?
+ i_cand_mv : cand_mv;
+ }
+ }
+ }
+}
+
+static void add_single_extended_candidate(refmvs_candidate mvstack[8], int *const cnt,
+ const refmvs_block *const cand_b,
+ const int sign, const uint8_t *const sign_bias)
+{
+ for (int n = 0; n < 2; n++) {
+ const int cand_ref = cand_b->ref.ref[n];
+
+ if (cand_ref <= 0) break;
+ // we need to continue even if cand_ref == ref.ref[0], since
+ // the candidate could have been added as a globalmv variant,
+ // which changes the value
+ // FIXME if scan_{row,col}() returned a mask for the nearest
+ // edge, we could skip the appropriate ones here
+
+ mv cand_mv = cand_b->mv.mv[n];
+ if (sign ^ sign_bias[cand_ref - 1]) {
+ cand_mv.y = -cand_mv.y;
+ cand_mv.x = -cand_mv.x;
+ }
+
+ int m;
+ const int last = *cnt;
+ for (m = 0; m < last; m++)
+ if (cand_mv.n == mvstack[m].mv.mv[0].n)
+ break;
+ if (m == last) {
+ mvstack[m].mv.mv[0] = cand_mv;
+ mvstack[m].weight = 2; // "minimal"
+ *cnt = last + 1;
+ }
+ }
+}
+
+/*
+ * refmvs_frame allocates memory for one sbrow (32 blocks high, whole frame
+ * wide) of 4x4-resolution refmvs_block entries for spatial MV referencing.
+ * mvrefs_tile[] keeps a list of 35 (32 + 3 above) pointers into this memory,
+ * and each sbrow, the bottom entries (y=27/29/31) are exchanged with the top
+ * (-5/-3/-1) pointers by calling dav1d_refmvs_tile_sbrow_init() at the start
+ * of each tile/sbrow.
+ *
+ * For temporal MV referencing, we call dav1d_refmvs_save_tmvs() at the end of
+ * each tile/sbrow (when tile column threading is enabled), or at the start of
+ * each interleaved sbrow (i.e. once for all tile columns together, when tile
+ * column threading is disabled). This will copy the 4x4-resolution spatial MVs
+ * into 8x8-resolution refmvs_temporal_block structures. Then, for subsequent
+ * frames, at the start of each tile/sbrow (when tile column threading is
+ * enabled) or at the start of each interleaved sbrow (when tile column
+ * threading is disabled), we call load_tmvs(), which will project the MVs to
+ * their respective position in the current frame.
+ */
+
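+// Indexing example: for a block at row by4, the row of 4x4 entries directly
+// above it is rt->r[(by4 & 31) - 1 + 5] (see the scan_row() calls below); the
+// "+ 5" leaves room for the rows 1/3/5 units above the superblock top, which
+// dav1d_refmvs_tile_sbrow_init() keeps in the slots preceding the in-sb rows.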
+void dav1d_refmvs_find(const refmvs_tile *const rt,
+ refmvs_candidate mvstack[8], int *const cnt,
+ int *const ctx,
+ const union refmvs_refpair ref, const enum BlockSize bs,
+ const enum EdgeFlags edge_flags,
+ const int by4, const int bx4)
+{
+ const refmvs_frame *const rf = rt->rf;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], w4 = imin(imin(bw4, 16), rt->tile_col.end - bx4);
+ const int bh4 = b_dim[1], h4 = imin(imin(bh4, 16), rt->tile_row.end - by4);
+ mv gmv[2], tgmv[2];
+
+ *cnt = 0;
+ assert(ref.ref[0] >= 0 && ref.ref[0] <= 8 &&
+ ref.ref[1] >= -1 && ref.ref[1] <= 8);
+ if (ref.ref[0] > 0) {
+ tgmv[0] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[0] - 1],
+ bx4, by4, bw4, bh4, rf->frm_hdr);
+ gmv[0] = rf->frm_hdr->gmv[ref.ref[0] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+ tgmv[0] : (mv) { .n = INVALID_MV };
+ } else {
+ tgmv[0] = (mv) { .n = 0 };
+ gmv[0] = (mv) { .n = INVALID_MV };
+ }
+ if (ref.ref[1] > 0) {
+ tgmv[1] = get_gmv_2d(&rf->frm_hdr->gmv[ref.ref[1] - 1],
+ bx4, by4, bw4, bh4, rf->frm_hdr);
+ gmv[1] = rf->frm_hdr->gmv[ref.ref[1] - 1].type > DAV1D_WM_TYPE_TRANSLATION ?
+ tgmv[1] : (mv) { .n = INVALID_MV };
+ }
+
+ // top
+ int have_newmv = 0, have_col_mvs = 0, have_row_mvs = 0;
+ unsigned max_rows = 0, n_rows = ~0;
+ const refmvs_block *b_top;
+ if (by4 > rt->tile_row.start) {
+ max_rows = imin((by4 - rt->tile_row.start + 1) >> 1, 2 + (bh4 > 1));
+ b_top = &rt->r[(by4 & 31) - 1 + 5][bx4];
+ n_rows = scan_row(mvstack, cnt, ref, gmv, b_top,
+ bw4, w4, max_rows, bw4 >= 16 ? 4 : 1,
+ &have_newmv, &have_row_mvs);
+ }
+
+ // left
+ unsigned max_cols = 0, n_cols = ~0U;
+ refmvs_block *const *b_left;
+ if (bx4 > rt->tile_col.start) {
+ max_cols = imin((bx4 - rt->tile_col.start + 1) >> 1, 2 + (bw4 > 1));
+ b_left = &rt->r[(by4 & 31) + 5];
+ n_cols = scan_col(mvstack, cnt, ref, gmv, b_left,
+ bh4, h4, bx4 - 1, max_cols, bh4 >= 16 ? 4 : 1,
+ &have_newmv, &have_col_mvs);
+ }
+
+ // top/right
+ if (n_rows != ~0U && edge_flags & EDGE_I444_TOP_HAS_RIGHT &&
+ imax(bw4, bh4) <= 16 && bw4 + bx4 < rt->tile_col.end)
+ {
+ add_spatial_candidate(mvstack, cnt, 4, &b_top[bw4], ref, gmv,
+ &have_newmv, &have_row_mvs);
+ }
+
+ const int nearest_match = have_col_mvs + have_row_mvs;
+ const int nearest_cnt = *cnt;
+ for (int n = 0; n < nearest_cnt; n++)
+ mvstack[n].weight += 640;
+
+ // temporal
+ int globalmv_ctx = rf->frm_hdr->use_ref_frame_mvs;
+ if (rf->use_ref_frame_mvs) {
+ const ptrdiff_t stride = rf->rp_stride;
+ const int by8 = by4 >> 1, bx8 = bx4 >> 1;
+ const refmvs_temporal_block *const rbi = &rt->rp_proj[(by8 & 15) * stride + bx8];
+ const refmvs_temporal_block *rb = rbi;
+ const int step_h = bw4 >= 16 ? 2 : 1, step_v = bh4 >= 16 ? 2 : 1;
+ const int w8 = imin((w4 + 1) >> 1, 8), h8 = imin((h4 + 1) >> 1, 8);
+ for (int y = 0; y < h8; y += step_v) {
+ for (int x = 0; x < w8; x+= step_h) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[x], ref,
+ !(x | y) ? &globalmv_ctx : NULL, tgmv);
+ }
+ rb += stride * step_v;
+ }
+ if (imin(bw4, bh4) >= 2 && imax(bw4, bh4) < 16) {
+ const int bh8 = bh4 >> 1, bw8 = bw4 >> 1;
+ rb = &rbi[bh8 * stride];
+ const int has_bottom = by8 + bh8 < imin(rt->tile_row.end >> 1,
+ (by8 & ~7) + 8);
+ if (has_bottom && bx8 - 1 >= imax(rt->tile_col.start >> 1, bx8 & ~7)) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[-1], ref,
+ NULL, NULL);
+ }
+ if (bx8 + bw8 < imin(rt->tile_col.end >> 1, (bx8 & ~7) + 8)) {
+ if (has_bottom) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[bw8], ref,
+ NULL, NULL);
+ }
+ if (by8 + bh8 - 1 < imin(rt->tile_row.end >> 1, (by8 & ~7) + 8)) {
+ add_temporal_candidate(rf, mvstack, cnt, &rb[bw8 - stride],
+ ref, NULL, NULL);
+ }
+ }
+ }
+ }
+ assert(*cnt <= 8);
+
+ // top/left (which, confusingly, is part of "secondary" references)
+ int have_dummy_newmv_match;
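+    // Since ~0U (the "not scanned" marker) absorbs the bitwise OR, this is
+    // only true when both an above row and a left column were scanned, so the
+    // top-left neighbour b_top[-1] is valid.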
+ if ((n_rows | n_cols) != ~0U) {
+ add_spatial_candidate(mvstack, cnt, 4, &b_top[-1], ref, gmv,
+ &have_dummy_newmv_match, &have_row_mvs);
+ }
+
+ // "secondary" (non-direct neighbour) top & left edges
+ // what is different about secondary is that everything is now in 8x8 resolution
+ for (int n = 2; n <= 3; n++) {
+ if ((unsigned) n > n_rows && (unsigned) n <= max_rows) {
+ n_rows += scan_row(mvstack, cnt, ref, gmv,
+ &rt->r[(((by4 & 31) - 2 * n + 1) | 1) + 5][bx4 | 1],
+ bw4, w4, 1 + max_rows - n, bw4 >= 16 ? 4 : 2,
+ &have_dummy_newmv_match, &have_row_mvs);
+ }
+
+ if ((unsigned) n > n_cols && (unsigned) n <= max_cols) {
+ n_cols += scan_col(mvstack, cnt, ref, gmv, &rt->r[((by4 & 31) | 1) + 5],
+ bh4, h4, (bx4 - n * 2 + 1) | 1,
+ 1 + max_cols - n, bh4 >= 16 ? 4 : 2,
+ &have_dummy_newmv_match, &have_col_mvs);
+ }
+ }
+ assert(*cnt <= 8);
+
+ const int ref_match_count = have_col_mvs + have_row_mvs;
+
+ // context build-up
+ int refmv_ctx, newmv_ctx;
+ switch (nearest_match) {
+ case 0:
+ refmv_ctx = imin(2, ref_match_count);
+ newmv_ctx = ref_match_count > 0;
+ break;
+ case 1:
+ refmv_ctx = imin(ref_match_count * 3, 4);
+ newmv_ctx = 3 - have_newmv;
+ break;
+ case 2:
+ refmv_ctx = 5;
+ newmv_ctx = 5 - have_newmv;
+ break;
+ }
+
+ // sorting (nearest, then "secondary")
+ int len = nearest_cnt;
+ while (len) {
+ int last = 0;
+ for (int n = 1; n < len; n++) {
+ if (mvstack[n - 1].weight < mvstack[n].weight) {
+#define EXCHANGE(a, b) do { refmvs_candidate tmp = a; a = b; b = tmp; } while (0)
+ EXCHANGE(mvstack[n - 1], mvstack[n]);
+ last = n;
+ }
+ }
+ len = last;
+ }
+ len = *cnt;
+ while (len > nearest_cnt) {
+ int last = nearest_cnt;
+ for (int n = nearest_cnt + 1; n < len; n++) {
+ if (mvstack[n - 1].weight < mvstack[n].weight) {
+ EXCHANGE(mvstack[n - 1], mvstack[n]);
+#undef EXCHANGE
+ last = n;
+ }
+ }
+ len = last;
+ }
+
+ if (ref.ref[1] > 0) {
+ if (*cnt < 2) {
+ const int sign0 = rf->sign_bias[ref.ref[0] - 1];
+ const int sign1 = rf->sign_bias[ref.ref[1] - 1];
+ const int sz4 = imin(w4, h4);
+ refmvs_candidate *const same = &mvstack[*cnt];
+ int same_count[4] = { 0 };
+
+ // non-self references in top
+ if (n_rows != ~0U) for (int x = 0; x < sz4;) {
+ const refmvs_block *const cand_b = &b_top[x];
+ add_compound_extended_candidate(same, same_count, cand_b,
+ sign0, sign1, ref, rf->sign_bias);
+ x += dav1d_block_dimensions[cand_b->bs][0];
+ }
+
+ // non-self references in left
+ if (n_cols != ~0U) for (int y = 0; y < sz4;) {
+ const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+ add_compound_extended_candidate(same, same_count, cand_b,
+ sign0, sign1, ref, rf->sign_bias);
+ y += dav1d_block_dimensions[cand_b->bs][1];
+ }
+
+ refmvs_candidate *const diff = &same[2];
+ const int *const diff_count = &same_count[2];
+
+ // merge together
+ for (int n = 0; n < 2; n++) {
+ int m = same_count[n];
+
+ if (m >= 2) continue;
+
+ const int l = diff_count[n];
+ if (l) {
+ same[m].mv.mv[n] = diff[0].mv.mv[n];
+ if (++m == 2) continue;
+ if (l == 2) {
+ same[1].mv.mv[n] = diff[1].mv.mv[n];
+ continue;
+ }
+ }
+ do {
+ same[m].mv.mv[n] = tgmv[n];
+ } while (++m < 2);
+ }
+
+ // if the first extended was the same as the non-extended one,
+ // then replace it with the second extended one
+ int n = *cnt;
+ if (n == 1 && mvstack[0].mv.n == same[0].mv.n)
+ mvstack[1].mv = mvstack[2].mv;
+ do {
+ mvstack[n].weight = 2;
+ } while (++n < 2);
+ *cnt = 2;
+ }
+
+ // clamping
+ const int left = -(bx4 + bw4 + 4) * 4 * 8;
+ const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+ const int top = -(by4 + bh4 + 4) * 4 * 8;
+ const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+ const int n_refmvs = *cnt;
+ int n = 0;
+ do {
+ mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
+ mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
+ mvstack[n].mv.mv[1].x = iclip(mvstack[n].mv.mv[1].x, left, right);
+ mvstack[n].mv.mv[1].y = iclip(mvstack[n].mv.mv[1].y, top, bottom);
+ } while (++n < n_refmvs);
+
+ switch (refmv_ctx >> 1) {
+ case 0:
+ *ctx = imin(newmv_ctx, 1);
+ break;
+ case 1:
+ *ctx = 1 + imin(newmv_ctx, 3);
+ break;
+ case 2:
+ *ctx = iclip(3 + newmv_ctx, 4, 7);
+ break;
+ }
+
+ return;
+ } else if (*cnt < 2 && ref.ref[0] > 0) {
+ const int sign = rf->sign_bias[ref.ref[0] - 1];
+ const int sz4 = imin(w4, h4);
+
+ // non-self references in top
+ if (n_rows != ~0U) for (int x = 0; x < sz4 && *cnt < 2;) {
+ const refmvs_block *const cand_b = &b_top[x];
+ add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+ x += dav1d_block_dimensions[cand_b->bs][0];
+ }
+
+ // non-self references in left
+ if (n_cols != ~0U) for (int y = 0; y < sz4 && *cnt < 2;) {
+ const refmvs_block *const cand_b = &b_left[y][bx4 - 1];
+ add_single_extended_candidate(mvstack, cnt, cand_b, sign, rf->sign_bias);
+ y += dav1d_block_dimensions[cand_b->bs][1];
+ }
+ }
+ assert(*cnt <= 8);
+
+ // clamping
+ int n_refmvs = *cnt;
+ if (n_refmvs) {
+ const int left = -(bx4 + bw4 + 4) * 4 * 8;
+ const int right = (rf->iw4 - bx4 + 4) * 4 * 8;
+ const int top = -(by4 + bh4 + 4) * 4 * 8;
+ const int bottom = (rf->ih4 - by4 + 4) * 4 * 8;
+
+ int n = 0;
+ do {
+ mvstack[n].mv.mv[0].x = iclip(mvstack[n].mv.mv[0].x, left, right);
+ mvstack[n].mv.mv[0].y = iclip(mvstack[n].mv.mv[0].y, top, bottom);
+ } while (++n < n_refmvs);
+ }
+
+ for (int n = *cnt; n < 2; n++)
+ mvstack[n].mv.mv[0] = tgmv[0];
+
+ *ctx = (refmv_ctx << 4) | (globalmv_ctx << 3) | newmv_ctx;
+}
+
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf,
+ const int tile_col_start4, const int tile_col_end4,
+ const int tile_row_start4, const int tile_row_end4,
+ const int sby, int tile_row_idx, const int pass)
+{
+ if (rf->n_tile_threads == 1) tile_row_idx = 0;
+ rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
+ const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1;
+ const ptrdiff_t pass_off = (uses_2pass && pass == 2) ?
+ 35 * rf->r_stride * rf->n_tile_rows : 0;
+ refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off];
+ const int sbsz = rf->sbsz;
+ const int off = (sbsz * sby) & 16;
+ for (int i = 0; i < sbsz; i++, r += rf->r_stride)
+ rt->r[off + 5 + i] = r;
+ rt->r[off + 0] = r;
+ r += rf->r_stride;
+ rt->r[off + 1] = NULL;
+ rt->r[off + 2] = r;
+ r += rf->r_stride;
+ rt->r[off + 3] = NULL;
+ rt->r[off + 4] = r;
+ if (sby & 1) {
+#define EXCHANGE(a, b) do { void *const tmp = a; a = b; b = tmp; } while (0)
+ EXCHANGE(rt->r[off + 0], rt->r[off + sbsz + 0]);
+ EXCHANGE(rt->r[off + 2], rt->r[off + sbsz + 2]);
+ EXCHANGE(rt->r[off + 4], rt->r[off + sbsz + 4]);
+#undef EXCHANGE
+ }
+
+ rt->rf = rf;
+ rt->tile_row.start = tile_row_start4;
+ rt->tile_row.end = imin(tile_row_end4, rf->ih4);
+ rt->tile_col.start = tile_col_start4;
+ rt->tile_col.end = imin(tile_col_end4, rf->iw4);
+}
+
+static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
+ const int col_start8, const int col_end8,
+ const int row_start8, int row_end8)
+{
+ if (rf->n_tile_threads == 1) tile_row_idx = 0;
+ assert(row_start8 >= 0);
+ assert((unsigned) (row_end8 - row_start8) <= 16U);
+ row_end8 = imin(row_end8, rf->ih8);
+ const int col_start8i = imax(col_start8 - 8, 0);
+ const int col_end8i = imin(col_end8 + 8, rf->iw8);
+
+ const ptrdiff_t stride = rf->rp_stride;
+ refmvs_temporal_block *rp_proj =
+ &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride];
+ for (int y = row_start8; y < row_end8; y++) {
+ for (int x = col_start8; x < col_end8; x++)
+ rp_proj[x].mv.n = INVALID_MV;
+ rp_proj += stride;
+ }
+
+ rp_proj = &rf->rp_proj[16 * stride * tile_row_idx];
+ for (int n = 0; n < rf->n_mfmvs; n++) {
+ const int ref2cur = rf->mfmv_ref2cur[n];
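+        // INT_MIN marks a reference whose POC distance to the current frame
+        // exceeded 31; it was disabled in dav1d_refmvs_init_frame() below.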
+ if (ref2cur == INT_MIN) continue;
+
+ const int ref = rf->mfmv_ref[n];
+ const int ref_sign = ref - 4;
+ const refmvs_temporal_block *r = &rf->rp_ref[ref][row_start8 * stride];
+ for (int y = row_start8; y < row_end8; y++) {
+ const int y_sb_align = y & ~7;
+ const int y_proj_start = imax(y_sb_align, row_start8);
+ const int y_proj_end = imin(y_sb_align + 8, row_end8);
+ for (int x = col_start8i; x < col_end8i; x++) {
+ const refmvs_temporal_block *rb = &r[x];
+ const int b_ref = rb->ref;
+ if (!b_ref) continue;
+ const int ref2ref = rf->mfmv_ref2ref[n][b_ref - 1];
+ if (!ref2ref) continue;
+ const mv b_mv = rb->mv;
+ const mv offset = mv_projection(b_mv, ref2cur, ref2ref);
+ int pos_x = x + apply_sign(abs(offset.x) >> 6,
+ offset.x ^ ref_sign);
+ const int pos_y = y + apply_sign(abs(offset.y) >> 6,
+ offset.y ^ ref_sign);
+ if (pos_y >= y_proj_start && pos_y < y_proj_end) {
+ const ptrdiff_t pos = (pos_y & 15) * stride;
+ for (;;) {
+ const int x_sb_align = x & ~7;
+ if (pos_x >= imax(x_sb_align - 8, col_start8) &&
+ pos_x < imin(x_sb_align + 16, col_end8))
+ {
+ rp_proj[pos + pos_x].mv = rb->mv;
+ rp_proj[pos + pos_x].ref = ref2ref;
+ }
+ if (++x >= col_end8i) break;
+ rb++;
+ if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+ pos_x++;
+ }
+ } else {
+ for (;;) {
+ if (++x >= col_end8i) break;
+ rb++;
+ if (rb->ref != b_ref || rb->mv.n != b_mv.n) break;
+ }
+ }
+ x--;
+ }
+ r += stride;
+ }
+ }
+}
+
+static void save_tmvs_c(refmvs_temporal_block *rp, const ptrdiff_t stride,
+ refmvs_block *const *const rr,
+ const uint8_t *const ref_sign,
+ const int col_end8, const int row_end8,
+ const int col_start8, const int row_start8)
+{
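+    // Each 8x8 temporal block inherits the MV of a single 4x4 spatial block
+    // (row 2*y, column 2*x + 1 in 4x4 units), preferring the second reference
+    // of a compound block when its sign bias is set, and storing ref = 0 when
+    // no suitable (small enough) MV is available.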
+ for (int y = row_start8; y < row_end8; y++) {
+ const refmvs_block *const b = rr[(y & 15) * 2];
+
+ for (int x = col_start8; x < col_end8;) {
+ const refmvs_block *const cand_b = &b[x * 2 + 1];
+ const int bw8 = (dav1d_block_dimensions[cand_b->bs][0] + 1) >> 1;
+
+ if (cand_b->ref.ref[1] > 0 && ref_sign[cand_b->ref.ref[1] - 1] &&
+ (abs(cand_b->mv.mv[1].y) | abs(cand_b->mv.mv[1].x)) < 4096)
+ {
+ for (int n = 0; n < bw8; n++, x++)
+ rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[1],
+ .ref = cand_b->ref.ref[1] };
+ } else if (cand_b->ref.ref[0] > 0 && ref_sign[cand_b->ref.ref[0] - 1] &&
+ (abs(cand_b->mv.mv[0].y) | abs(cand_b->mv.mv[0].x)) < 4096)
+ {
+ for (int n = 0; n < bw8; n++, x++)
+ rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[0],
+ .ref = cand_b->ref.ref[0] };
+ } else {
+ for (int n = 0; n < bw8; n++, x++) {
+ rp[x].mv.n = 0;
+ rp[x].ref = 0; // "invalid"
+ }
+ }
+ }
+ rp += stride;
+ }
+}
+
+int dav1d_refmvs_init_frame(refmvs_frame *const rf,
+ const Dav1dSequenceHeader *const seq_hdr,
+ const Dav1dFrameHeader *const frm_hdr,
+ const unsigned ref_poc[7],
+ refmvs_temporal_block *const rp,
+ const unsigned ref_ref_poc[7][7],
+ /*const*/ refmvs_temporal_block *const rp_ref[7],
+ const int n_tile_threads, const int n_frame_threads)
+{
+ rf->sbsz = 16 << seq_hdr->sb128;
+ rf->frm_hdr = frm_hdr;
+ rf->iw8 = (frm_hdr->width[0] + 7) >> 3;
+ rf->ih8 = (frm_hdr->height + 7) >> 3;
+ rf->iw4 = rf->iw8 << 1;
+ rf->ih4 = rf->ih8 << 1;
+
+ const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
+ const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
+ if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
+ if (rf->r) dav1d_freep_aligned(&rf->r);
+ const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
+ rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
+ if (!rf->r) return DAV1D_ERR(ENOMEM);
+ rf->r_stride = r_stride;
+ }
+
+ const ptrdiff_t rp_stride = r_stride >> 1;
+ if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
+ if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
+ rf->rp_proj = dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
+ if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
+ rf->rp_stride = rp_stride;
+ }
+ rf->n_tile_rows = n_tile_rows;
+ rf->n_tile_threads = n_tile_threads;
+ rf->n_frame_threads = n_frame_threads;
+ rf->rp = rp;
+ rf->rp_ref = rp_ref;
+ const unsigned poc = frm_hdr->frame_offset;
+ for (int i = 0; i < 7; i++) {
+ const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
+ ref_poc[i], poc);
+ rf->sign_bias[i] = poc_diff > 0;
+ rf->mfmv_sign[i] = poc_diff < 0;
+ rf->pocdiff[i] = iclip(get_poc_diff(seq_hdr->order_hint_n_bits,
+ poc, ref_poc[i]), -31, 31);
+ }
+
+ // temporal MV setup
+ rf->n_mfmvs = 0;
+ if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) {
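+        // Select up to three references whose temporal MVs get projected:
+        // LAST (unless its ALTREF is our GOLDEN), then the future references
+        // BWDREF and ALTREF2, with ALTREF and finally LAST2 as fallbacks.
+        // The cap is 3 when LAST is used, otherwise 2.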
+ int total = 2;
+ if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) {
+ rf->mfmv_ref[rf->n_mfmvs++] = 0; // last
+ total = 3;
+ }
+ if (rp_ref[4] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[4],
+ frm_hdr->frame_offset) > 0)
+ {
+ rf->mfmv_ref[rf->n_mfmvs++] = 4; // bwd
+ }
+ if (rp_ref[5] && get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[5],
+ frm_hdr->frame_offset) > 0)
+ {
+ rf->mfmv_ref[rf->n_mfmvs++] = 5; // altref2
+ }
+ if (rf->n_mfmvs < total && rp_ref[6] &&
+ get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[6],
+ frm_hdr->frame_offset) > 0)
+ {
+ rf->mfmv_ref[rf->n_mfmvs++] = 6; // altref
+ }
+ if (rf->n_mfmvs < total && rp_ref[1])
+ rf->mfmv_ref[rf->n_mfmvs++] = 1; // last2
+
+ for (int n = 0; n < rf->n_mfmvs; n++) {
+ const unsigned rpoc = ref_poc[rf->mfmv_ref[n]];
+ const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits,
+ rpoc, frm_hdr->frame_offset);
+ if (abs(diff1) > 31) {
+ rf->mfmv_ref2cur[n] = INT_MIN;
+ } else {
+ rf->mfmv_ref2cur[n] = rf->mfmv_ref[n] < 4 ? -diff1 : diff1;
+ for (int m = 0; m < 7; m++) {
+ const unsigned rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m];
+ const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits,
+ rpoc, rrpoc);
+ // unsigned comparison also catches the < 0 case
+ rf->mfmv_ref2ref[n][m] = (unsigned) diff2 > 31U ? 0 : diff2;
+ }
+ }
+ }
+ }
+ rf->use_ref_frame_mvs = rf->n_mfmvs > 0;
+
+ return 0;
+}
+
+void dav1d_refmvs_init(refmvs_frame *const rf) {
+ rf->r = NULL;
+ rf->r_stride = 0;
+ rf->rp_proj = NULL;
+ rf->rp_stride = 0;
+}
+
+void dav1d_refmvs_clear(refmvs_frame *const rf) {
+ if (rf->r) dav1d_freep_aligned(&rf->r);
+ if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
+}
+
+static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
+ const int bx4, const int bw4, int bh4)
+{
+ do {
+ refmvs_block *const r = *rr++ + bx4;
+ for (int x = 0; x < bw4; x++)
+ r[x] = *rmv;
+ } while (--bh4);
+}
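splat_mv_c simply copies one block's mv/ref data into every 4x4 unit the block covers, one row pointer at a time, so later blocks can read it back per 4x4 position. A self-contained usage sketch with made-up values (illustrative only; the real caller passes row pointers from the tile state):

// Illustrative only (hypothetical values): splat one block's motion data into
// a 4-wide, 2-high region of 4x4 units starting at column bx4 == 3.
static void splat_mv_example(void) {
    refmvs_block row0[64], row1[64];
    refmvs_block *rr[2] = { row0, row1 };  // one pointer per 4x4 row, as splat_mv expects
    refmvs_block rmv = { 0 };
    rmv.mv.mv[0].x = 16;                   // example motion vector (1/8-pel units)
    rmv.mv.mv[0].y = -8;
    rmv.ref.ref[0] = 1;                    // e.g. LAST_FRAME
    rmv.ref.ref[1] = -1;                   // no second reference
    splat_mv_c(rr, &rmv, 3 /*bx4*/, 4 /*bw4*/, 2 /*bh4*/);
    // row0[3..6] and row1[3..6] now all hold rmv.
}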
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/refmvs.h"
+#elif ARCH_X86
+#include "src/x86/refmvs.h"
+#endif
+#endif
+
+COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
+{
+ c->load_tmvs = load_tmvs_c;
+ c->save_tmvs = save_tmvs_c;
+ c->splat_mv = splat_mv_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+ refmvs_dsp_init_arm(c);
+#elif ARCH_X86
+ refmvs_dsp_init_x86(c);
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/refmvs.h b/third_party/dav1d/src/refmvs.h
new file mode 100644
index 0000000000..70dc9678dd
--- /dev/null
+++ b/third_party/dav1d/src/refmvs.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_REF_MVS_H
+#define DAV1D_SRC_REF_MVS_H
+
+#include <stdint.h>
+
+#include "dav1d/headers.h"
+
+#include "common/intops.h"
+
+#include "src/intra_edge.h"
+#include "src/tables.h"
+
+#define INVALID_MV 0x80008000
+
+PACKED(typedef struct refmvs_temporal_block {
+ mv mv;
+ int8_t ref;
+}) refmvs_temporal_block;
+
+typedef union refmvs_refpair {
+ int8_t ref[2]; // ref[0] == 0 means the block is intra; ref[1] == -1 means it is not compound
+ uint16_t pair;
+} refmvs_refpair;
+
+typedef union refmvs_mvpair {
+ mv mv[2];
+ uint64_t n;
+} refmvs_mvpair;
+
+PACKED(typedef struct refmvs_block {
+ refmvs_mvpair mv;
+ refmvs_refpair ref;
+ uint8_t bs, mf; // mf: 1 = globalmv+affine, 2 = newmv
+}) ALIGN(refmvs_block, 4);
+
+typedef struct refmvs_frame {
+ const Dav1dFrameHeader *frm_hdr;
+ int iw4, ih4, iw8, ih8;
+ int sbsz;
+ int use_ref_frame_mvs;
+ uint8_t sign_bias[7], mfmv_sign[7];
+ int8_t pocdiff[7];
+ uint8_t mfmv_ref[3];
+ int mfmv_ref2cur[3];
+ int mfmv_ref2ref[3][7];
+ int n_mfmvs;
+
+ refmvs_temporal_block *rp;
+ /*const*/ refmvs_temporal_block *const *rp_ref;
+ refmvs_temporal_block *rp_proj;
+ ptrdiff_t rp_stride;
+
+ refmvs_block *r; // 35 x r_stride memory
+ ptrdiff_t r_stride;
+ int n_tile_rows, n_tile_threads, n_frame_threads;
+} refmvs_frame;
+
+typedef struct refmvs_tile {
+ const refmvs_frame *rf;
+ refmvs_block *r[32 + 5];
+ refmvs_temporal_block *rp_proj;
+ struct {
+ int start, end;
+ } tile_col, tile_row;
+} refmvs_tile;
+
+typedef struct refmvs_candidate {
+ refmvs_mvpair mv;
+ int weight;
+} refmvs_candidate;
+
+// initialize temporal MVs; this can be done in any configuration, e.g. one
+// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or
+// it can just be for the whole frame's sbrow, where col_{start,end}8 are the
+// frame boundaries. row_{start,end}8 are the superblock row boundaries.
+#define decl_load_tmvs_fn(name) \
+void (name)(const refmvs_frame *rf, int tile_row_idx, \
+ int col_start8, int col_end8, int row_start8, int row_end8)
+typedef decl_load_tmvs_fn(*load_tmvs_fn);
+
+#define decl_save_tmvs_fn(name) \
+void (name)(refmvs_temporal_block *rp, const ptrdiff_t stride, \
+ refmvs_block *const *const rr, const uint8_t *const ref_sign, \
+ int col_end8, int row_end8, int col_start8, int row_start8)
+typedef decl_save_tmvs_fn(*save_tmvs_fn);
+
+#define decl_splat_mv_fn(name) \
+void (name)(refmvs_block **rr, const refmvs_block *rmv, int bx4, int bw4, int bh4)
+typedef decl_splat_mv_fn(*splat_mv_fn);
+
+typedef struct Dav1dRefmvsDSPContext {
+ load_tmvs_fn load_tmvs;
+ save_tmvs_fn save_tmvs;
+ splat_mv_fn splat_mv;
+} Dav1dRefmvsDSPContext;
+
+// call once per frame thread
+void dav1d_refmvs_init(refmvs_frame *rf);
+void dav1d_refmvs_clear(refmvs_frame *rf);
+
+// call once per frame
+int dav1d_refmvs_init_frame(refmvs_frame *rf,
+ const Dav1dSequenceHeader *seq_hdr,
+ const Dav1dFrameHeader *frm_hdr,
+ const unsigned ref_poc[7],
+ refmvs_temporal_block *rp,
+ const unsigned ref_ref_poc[7][7],
+ /*const*/ refmvs_temporal_block *const rp_ref[7],
+ int n_tile_threads, int n_frame_threads);
+
+// cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors
+// into buffers for use in future frames' temporal MV prediction
+static inline void dav1d_refmvs_save_tmvs(const Dav1dRefmvsDSPContext *const dsp,
+ const refmvs_tile *const rt,
+ const int col_start8, int col_end8,
+ const int row_start8, int row_end8)
+{
+ const refmvs_frame *const rf = rt->rf;
+
+ assert(row_start8 >= 0);
+ assert((unsigned) (row_end8 - row_start8) <= 16U);
+ row_end8 = imin(row_end8, rf->ih8);
+ col_end8 = imin(col_end8, rf->iw8);
+
+ const ptrdiff_t stride = rf->rp_stride;
+ const uint8_t *const ref_sign = rf->mfmv_sign;
+ refmvs_temporal_block *rp = &rf->rp[row_start8 * stride];
+
+ dsp->save_tmvs(rp, stride, rt->r + 6, ref_sign,
+ col_end8, row_end8, col_start8, row_start8);
+}
+
+// initialize tile boundaries and refmvs_block pointers for one tile/sbrow
+void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf,
+ int tile_col_start4, int tile_col_end4,
+ int tile_row_start4, int tile_row_end4,
+ int sby, int tile_row_idx, int pass);
+
+// call for each block
+void dav1d_refmvs_find(const refmvs_tile *rt,
+ refmvs_candidate mvstack[8], int *cnt,
+ int *ctx, const refmvs_refpair ref, enum BlockSize bs,
+ enum EdgeFlags edge_flags, int by4, int bx4);
+
+void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
+void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
+void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);
+
+#endif /* DAV1D_SRC_REF_MVS_H */
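Pulling the comments in this header together, a frame thread drives the refmvs code in roughly the order sketched below. This is a hedged sketch of the call cadence only; the per-frame and per-tile calls are left as comments because their arguments come from frame headers and tile state not visible in this header, and it is not dav1d's actual decoder code.

// Rough lifecycle sketch assembled from the comments above.
static void refmvs_lifecycle_sketch(refmvs_frame *const rf) {
    dav1d_refmvs_init(rf);              // once per frame thread
    // for each frame:
    //     dav1d_refmvs_init_frame(rf, seq_hdr, frm_hdr, ref_poc, rp,
    //                             ref_ref_poc, rp_ref, n_tile_threads, n_frame_threads);
    //     for each tile/sbrow:
    //         dav1d_refmvs_tile_sbrow_init(&rt, rf, col_start4, col_end4,
    //                                      row_start4, row_end4, sby, tile_row_idx, pass);
    //         dsp->load_tmvs(rf, tile_row_idx, col_start8, col_end8, row_start8, row_end8);
    //         for each block:
    //             dav1d_refmvs_find(&rt, mvstack, &cnt, &ctx, ref, bs, edge_flags, by4, bx4);
    //             dsp->splat_mv(/* row pointers into rt.r for by4 */, &rmv, bx4, bw4, bh4);
    //         dav1d_refmvs_save_tmvs(dsp, &rt, col_start8, col_end8, row_start8, row_end8);
    dav1d_refmvs_clear(rf);             // when the frame thread is torn down
}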
diff --git a/third_party/dav1d/src/scan.c b/third_party/dav1d/src/scan.c
new file mode 100644
index 0000000000..5261ccd3d1
--- /dev/null
+++ b/third_party/dav1d/src/scan.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/attributes.h"
+#include "src/scan.h"
+
+static const uint16_t ALIGN(scan_4x4[], 32) = {
+ 0, 4, 1, 2,
+ 5, 8, 12, 9,
+ 6, 3, 7, 10,
+ 13, 14, 11, 15,
+};
+
+static const uint16_t ALIGN(scan_4x8[], 32) = {
+ 0, 8, 1, 16,
+ 9, 2, 24, 17,
+ 10, 3, 25, 18,
+ 11, 4, 26, 19,
+ 12, 5, 27, 20,
+ 13, 6, 28, 21,
+ 14, 7, 29, 22,
+ 15, 30, 23, 31,
+};
+
+static const uint16_t ALIGN(scan_4x16[], 32) = {
+ 0, 16, 1, 32,
+ 17, 2, 48, 33,
+ 18, 3, 49, 34,
+ 19, 4, 50, 35,
+ 20, 5, 51, 36,
+ 21, 6, 52, 37,
+ 22, 7, 53, 38,
+ 23, 8, 54, 39,
+ 24, 9, 55, 40,
+ 25, 10, 56, 41,
+ 26, 11, 57, 42,
+ 27, 12, 58, 43,
+ 28, 13, 59, 44,
+ 29, 14, 60, 45,
+ 30, 15, 61, 46,
+ 31, 62, 47, 63,
+};
+
+static const uint16_t ALIGN(scan_8x4[], 32) = {
+ 0, 1, 4, 2, 5, 8, 3, 6,
+ 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22,
+ 25, 28, 23, 26, 29, 27, 30, 31,
+};
+
+static const uint16_t ALIGN(scan_8x8[], 32) = {
+ 0, 8, 1, 2, 9, 16, 24, 17,
+ 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20,
+ 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36,
+ 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53,
+ 46, 39, 47, 54, 61, 62, 55, 63,
+};
+
+static const uint16_t ALIGN(scan_8x16[], 32) = {
+ 0, 16, 1, 32, 17, 2, 48, 33,
+ 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66,
+ 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68,
+ 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70,
+ 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72,
+ 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74,
+ 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76,
+ 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109,
+ 94, 79, 125, 110, 95, 126, 111, 127,
+};
+
+static const uint16_t ALIGN(scan_8x32[], 32) = {
+ 0, 32, 1, 64, 33, 2, 96, 65,
+ 34, 3, 128, 97, 66, 35, 4, 160,
+ 129, 98, 67, 36, 5, 192, 161, 130,
+ 99, 68, 37, 6, 224, 193, 162, 131,
+ 100, 69, 38, 7, 225, 194, 163, 132,
+ 101, 70, 39, 8, 226, 195, 164, 133,
+ 102, 71, 40, 9, 227, 196, 165, 134,
+ 103, 72, 41, 10, 228, 197, 166, 135,
+ 104, 73, 42, 11, 229, 198, 167, 136,
+ 105, 74, 43, 12, 230, 199, 168, 137,
+ 106, 75, 44, 13, 231, 200, 169, 138,
+ 107, 76, 45, 14, 232, 201, 170, 139,
+ 108, 77, 46, 15, 233, 202, 171, 140,
+ 109, 78, 47, 16, 234, 203, 172, 141,
+ 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143,
+ 112, 81, 50, 19, 237, 206, 175, 144,
+ 113, 82, 51, 20, 238, 207, 176, 145,
+ 114, 83, 52, 21, 239, 208, 177, 146,
+ 115, 84, 53, 22, 240, 209, 178, 147,
+ 116, 85, 54, 23, 241, 210, 179, 148,
+ 117, 86, 55, 24, 242, 211, 180, 149,
+ 118, 87, 56, 25, 243, 212, 181, 150,
+ 119, 88, 57, 26, 244, 213, 182, 151,
+ 120, 89, 58, 27, 245, 214, 183, 152,
+ 121, 90, 59, 28, 246, 215, 184, 153,
+ 122, 91, 60, 29, 247, 216, 185, 154,
+ 123, 92, 61, 30, 248, 217, 186, 155,
+ 124, 93, 62, 31, 249, 218, 187, 156,
+ 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221,
+ 190, 159, 253, 222, 191, 254, 223, 255,
+};
+
+static const uint16_t ALIGN(scan_16x4[], 32) = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+static const uint16_t ALIGN(scan_16x8[], 32) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
+ 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
+ 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
+ 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
+ 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76,
+ 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
+ 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
+ 122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
+};
+
+static const uint16_t ALIGN(scan_16x16[], 32) = {
+ 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
+ 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
+ 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
+ 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70, 85, 100, 115, 130,
+ 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, 12, 27,
+ 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 208, 193, 178, 163, 148,
+ 133, 118, 103, 88, 73, 58, 43, 28, 13, 14, 29, 44, 59, 74, 89, 104,
+ 119, 134, 149, 164, 179, 194, 209, 224, 240, 225, 210, 195, 180, 165, 150, 135,
+ 120, 105, 90, 75, 60, 45, 30, 15, 31, 46, 61, 76, 91, 106, 121, 136,
+ 151, 166, 181, 196, 211, 226, 241, 242, 227, 212, 197, 182, 167, 152, 137, 122,
+ 107, 92, 77, 62, 47, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213,
+ 228, 243, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110,
+ 125, 140, 155, 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156,
+ 141, 126, 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203,
+ 188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
+ 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
+};
+
+static const uint16_t ALIGN(scan_16x32[], 32) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
+ 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
+ 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
+ 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72,
+ 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353,
+ 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, 323, 292,
+ 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, 386, 355, 324, 293, 262,
+ 231, 200, 169, 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263,
+ 232, 201, 170, 139, 108, 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264,
+ 233, 202, 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 296, 265,
+ 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266,
+ 235, 204, 173, 142, 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267,
+ 236, 205, 174, 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268,
+ 237, 206, 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269,
+ 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, 271,
+ 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, 303, 272,
+ 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273,
+ 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, 305, 274,
+ 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, 399, 368, 337, 306, 275,
+ 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, 431, 400, 369, 338, 307, 276,
+ 245, 214, 183, 152, 121, 90, 59, 28, 494, 463, 432, 401, 370, 339, 308, 277,
+ 246, 215, 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, 278,
+ 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, 372, 341, 310, 279,
+ 248, 217, 186, 155, 124, 93, 62, 31, 497, 466, 435, 404, 373, 342, 311, 280,
+ 249, 218, 187, 156, 125, 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250,
+ 219, 188, 157, 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189,
+ 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470,
+ 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316,
+ 285, 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
+ 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
+};
+
+static const uint16_t ALIGN(scan_32x8[], 32) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
+ 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
+ 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
+ 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, 123, 130, 137, 144, 103, 110, 117, 124,
+ 131, 138, 145, 152, 111, 118, 125, 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, 141, 148, 155, 162, 169, 176, 135, 142, 149, 156,
+ 163, 170, 177, 184, 143, 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188,
+ 195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
+ 227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
+};
+
+static const uint16_t ALIGN(scan_32x16[], 32) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
+ 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
+ 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
+ 88, 103, 118, 133, 148, 163, 178, 193, 208, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 15, 30, 45, 60, 75, 90, 105, 120,
+ 135, 150, 165, 180, 195, 210, 225, 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 256, 47, 62, 77, 92, 107, 122, 137, 152,
+ 167, 182, 197, 212, 227, 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184,
+ 199, 214, 229, 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216,
+ 231, 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, 248,
+ 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, 250, 265, 280,
+ 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, 252, 267, 282, 297, 312,
+ 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, 254, 269, 284, 299, 314, 329, 344,
+ 359, 374, 389, 404, 419, 434, 449, 464, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, 480, 271, 286, 301, 316, 331, 346, 361, 376,
+ 391, 406, 421, 436, 451, 466, 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423,
+ 438, 453, 468, 483, 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366,
+ 381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
+};
+
+static const uint16_t ALIGN(scan_32x32[], 32) = {
+ 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
+ 100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
+ 289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
+ 261, 230, 199, 168, 137, 106, 75, 44, 13, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, 480, 449, 418, 387, 356, 325, 294, 263,
+ 232, 201, 170, 139, 108, 77, 46, 15, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 544, 513, 482, 451, 420, 389, 358,
+ 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, 17, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, 390, 421, 452, 483, 514, 545, 576, 608, 577,
+ 546, 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, 19, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, 361, 392, 423,
+ 454, 485, 516, 547, 578, 609, 640, 672, 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, 145, 114, 83, 52, 21, 22, 53, 84,
+ 115, 146, 177, 208, 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, 642, 673, 704, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395,
+ 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 520, 551, 582, 613,
+ 644, 675, 706, 737, 768, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 26,
+ 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, 832, 864, 833, 802, 771, 740, 709,
+ 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307,
+ 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, 741, 772, 803, 834, 865, 896, 928, 897, 866, 835, 804, 773, 742, 711, 680, 649, 618, 587, 556,
+ 525, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464,
+ 495, 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, 929, 960, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527,
+ 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528,
+ 559, 590, 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, 653, 622, 591, 560, 529, 498,
+ 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, 95, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 530, 561, 592, 623, 654, 685,
+ 716, 747, 778, 809, 840, 871, 902, 933, 964, 995, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345,
+ 314, 283, 252, 221, 190, 159, 191, 222, 253, 284, 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, 718, 749, 780, 811, 842, 873, 904, 935, 966,
+ 997, 998, 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 255, 286, 317, 348, 379,
+ 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 720, 751, 782, 813, 844, 875, 906, 937, 968, 999, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659,
+ 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 319, 350, 381, 412, 443, 474, 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, 908,
+ 939, 970, 1001, 1002, 971, 940, 909, 878, 847, 816, 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, 382, 351, 383, 414, 445, 476, 507, 538, 569,
+ 600, 631, 662, 693, 724, 755, 786, 817, 848, 879, 910, 941, 972, 1003, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477,
+ 446, 415, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, 788, 819, 850, 881, 912, 943, 974, 1005, 1006, 975, 944, 913, 882, 851, 820, 789, 758, 727, 696,
+ 665, 634, 603, 572, 541, 510, 479, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883, 914, 945, 976, 1007, 1008, 977, 946, 915, 884, 853, 822, 791,
+ 760, 729, 698, 667, 636, 605, 574, 543, 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, 947, 978, 1009, 1010, 979, 948, 917, 886, 855, 824, 793, 762,
+ 731, 700, 669, 638, 607, 639, 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 703, 734,
+ 765, 796, 827, 858, 889, 920, 951, 982, 1013, 1014, 983, 952, 921, 890, 859, 828, 797, 766, 735, 767, 798, 829, 860, 891, 922, 953, 984, 1015, 1016, 985, 954, 923,
+ 892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
+};
+
+const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = {
+ [ TX_4X4 ] = scan_4x4,
+ [ TX_8X8 ] = scan_8x8,
+ [ TX_16X16] = scan_16x16,
+ [ TX_32X32] = scan_32x32,
+ [ TX_64X64] = scan_32x32,
+ [RTX_4X8 ] = scan_4x8,
+ [RTX_8X4 ] = scan_8x4,
+ [RTX_8X16 ] = scan_8x16,
+ [RTX_16X8 ] = scan_16x8,
+ [RTX_16X32] = scan_16x32,
+ [RTX_32X16] = scan_32x16,
+ [RTX_32X64] = scan_32x32,
+ [RTX_64X32] = scan_32x32,
+ [RTX_4X16 ] = scan_4x16,
+ [RTX_16X4 ] = scan_16x4,
+ [RTX_8X32 ] = scan_8x32,
+ [RTX_32X8 ] = scan_32x8,
+ [RTX_16X64] = scan_16x32,
+ [RTX_64X16] = scan_32x16,
+};
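dav1d_scans maps every rectangular transform size to one of the tables above; the 64-point dimensions reuse the 32-point tables because AV1 zeroes all coefficients beyond 32 in either direction. Each table entry is the position, in dav1d's coefficient layout for that transform, of the coefficient visited at that scan step. Below is a minimal sketch of consuming such a table; it is not the decoder's actual coefficient loop, and `eob` here is simply the index of the last nonzero coefficient.

// Illustrative sketch only: gather the first eob+1 coefficients of a 4x8
// transform block in scan order. `coefs` is assumed to use the layout the
// scan table indexes into.
static void gather_in_scan_order(const int16_t *const coefs, const int eob,
                                 int16_t *const out)
{
    const uint16_t *const scan = dav1d_scans[RTX_4X8];
    for (int i = 0; i <= eob; i++)
        out[i] = coefs[scan[i]];
}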
diff --git a/third_party/dav1d/src/scan.h b/third_party/dav1d/src/scan.h
new file mode 100644
index 0000000000..09df988779
--- /dev/null
+++ b/third_party/dav1d/src/scan.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_SCAN_H
+#define DAV1D_SRC_SCAN_H
+
+#include <stdint.h>
+
+#include "src/levels.h"
+
+EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
+
+#endif /* DAV1D_SRC_SCAN_H */
diff --git a/third_party/dav1d/src/tables.c b/third_party/dav1d/src/tables.c
new file mode 100644
index 0000000000..9752f15c40
--- /dev/null
+++ b/third_party/dav1d/src/tables.c
@@ -0,0 +1,1013 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+#include "src/levels.h"
+#include "src/tables.h"
+
+const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS] = {
+ {
+ // partitions:
+ // none, h, v, split, tts, tbs, tls, trs, h4, v4
+ { 0x00, 0x00, 0x10, -1, 0x00, 0x10, 0x10, 0x10, -1, -1 }, // bl128
+ { 0x10, 0x10, 0x18, -1, 0x10, 0x18, 0x18, 0x18, 0x10, 0x1c }, // bl64
+ { 0x18, 0x18, 0x1c, -1, 0x18, 0x1c, 0x1c, 0x1c, 0x18, 0x1e }, // bl32
+ { 0x1c, 0x1c, 0x1e, -1, 0x1c, 0x1e, 0x1e, 0x1e, 0x1c, 0x1f }, // bl16
+ { 0x1e, 0x1e, 0x1f, 0x1f, -1, -1, -1, -1, -1, -1 }, // bl8
+ }, {
+ { 0x00, 0x10, 0x00, -1, 0x10, 0x10, 0x00, 0x10, -1, -1 }, // bl128
+ { 0x10, 0x18, 0x10, -1, 0x18, 0x18, 0x10, 0x18, 0x1c, 0x10 }, // bl64
+ { 0x18, 0x1c, 0x18, -1, 0x1c, 0x1c, 0x18, 0x1c, 0x1e, 0x18 }, // bl32
+ { 0x1c, 0x1e, 0x1c, -1, 0x1e, 0x1e, 0x1c, 0x1e, 0x1f, 0x1c }, // bl16
+ { 0x1e, 0x1f, 0x1e, 0x1f, -1, -1, -1, -1, -1, -1 }, // bl8
+ }
+};
+
+const uint8_t /* enum BlockSize */
+ dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2] =
+{
+ [BL_128X128] = {
+ [PARTITION_NONE] = { BS_128x128 },
+ [PARTITION_H] = { BS_128x64 },
+ [PARTITION_V] = { BS_64x128 },
+ [PARTITION_T_TOP_SPLIT] = { BS_64x64, BS_128x64 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_128x64, BS_64x64 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_64x64, BS_64x128 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_64x128, BS_64x64 },
+ }, [BL_64X64] = {
+ [PARTITION_NONE] = { BS_64x64 },
+ [PARTITION_H] = { BS_64x32 },
+ [PARTITION_V] = { BS_32x64 },
+ [PARTITION_T_TOP_SPLIT] = { BS_32x32, BS_64x32 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_64x32, BS_32x32 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_32x32, BS_32x64 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_32x64, BS_32x32 },
+ [PARTITION_H4] = { BS_64x16 },
+ [PARTITION_V4] = { BS_16x64 },
+ }, [BL_32X32] = {
+ [PARTITION_NONE] = { BS_32x32 },
+ [PARTITION_H] = { BS_32x16 },
+ [PARTITION_V] = { BS_16x32 },
+ [PARTITION_T_TOP_SPLIT] = { BS_16x16, BS_32x16 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_32x16, BS_16x16 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_16x16, BS_16x32 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_16x32, BS_16x16 },
+ [PARTITION_H4] = { BS_32x8 },
+ [PARTITION_V4] = { BS_8x32 },
+ }, [BL_16X16] = {
+ [PARTITION_NONE] = { BS_16x16 },
+ [PARTITION_H] = { BS_16x8 },
+ [PARTITION_V] = { BS_8x16 },
+ [PARTITION_T_TOP_SPLIT] = { BS_8x8, BS_16x8 },
+ [PARTITION_T_BOTTOM_SPLIT] = { BS_16x8, BS_8x8 },
+ [PARTITION_T_LEFT_SPLIT] = { BS_8x8, BS_8x16 },
+ [PARTITION_T_RIGHT_SPLIT] = { BS_8x16, BS_8x8 },
+ [PARTITION_H4] = { BS_16x4 },
+ [PARTITION_V4] = { BS_4x16 },
+ }, [BL_8X8] = {
+ [PARTITION_NONE] = { BS_8x8 },
+ [PARTITION_H] = { BS_8x4 },
+ [PARTITION_V] = { BS_4x8 },
+ [PARTITION_SPLIT] = { BS_4x4 },
+ }
+};
+
+const uint8_t dav1d_block_dimensions[N_BS_SIZES][4] = {
+ [BS_128x128] = { 32, 32, 5, 5 },
+ [BS_128x64] = { 32, 16, 5, 4 },
+ [BS_64x128] = { 16, 32, 4, 5 },
+ [BS_64x64] = { 16, 16, 4, 4 },
+ [BS_64x32] = { 16, 8, 4, 3 },
+ [BS_64x16] = { 16, 4, 4, 2 },
+ [BS_32x64] = { 8, 16, 3, 4 },
+ [BS_32x32] = { 8, 8, 3, 3 },
+ [BS_32x16] = { 8, 4, 3, 2 },
+ [BS_32x8] = { 8, 2, 3, 1 },
+ [BS_16x64] = { 4, 16, 2, 4 },
+ [BS_16x32] = { 4, 8, 2, 3 },
+ [BS_16x16] = { 4, 4, 2, 2 },
+ [BS_16x8] = { 4, 2, 2, 1 },
+ [BS_16x4] = { 4, 1, 2, 0 },
+ [BS_8x32] = { 2, 8, 1, 3 },
+ [BS_8x16] = { 2, 4, 1, 2 },
+ [BS_8x8] = { 2, 2, 1, 1 },
+ [BS_8x4] = { 2, 1, 1, 0 },
+ [BS_4x16] = { 1, 4, 0, 2 },
+ [BS_4x8] = { 1, 2, 0, 1 },
+ [BS_4x4] = { 1, 1, 0, 0 },
+};
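Each row above is { width, height, log2(width), log2(height) } in 4-pixel units, e.g. BS_128x128 = { 32, 32, 5, 5 } since 128 / 4 == 32 and log2(32) == 5. A trivial sketch of recovering pixel dimensions from a row:

// Illustrative helper: pixel width/height of a block size, derived from the
// 4-pixel-unit entries in dav1d_block_dimensions above.
static void block_px_dimensions(const enum BlockSize bs, int *const w, int *const h) {
    const uint8_t *const dims = dav1d_block_dimensions[bs];
    *w = dims[0] * 4;   // dims[0] = width in 4px units
    *h = dims[1] * 4;   // dims[1] = height in 4px units
    // dims[2] == log2(dims[0]), dims[3] == log2(dims[1])
}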
+
+const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES] = {
+ [ TX_4X4] = { .w = 1, .h = 1, .lw = 0, .lh = 0,
+ .min = 0, .max = 0, .ctx = 0 },
+ [ TX_8X8] = { .w = 2, .h = 2, .lw = 1, .lh = 1,
+ .min = 1, .max = 1, .sub = TX_4X4, .ctx = 1 },
+ [ TX_16X16] = { .w = 4, .h = 4, .lw = 2, .lh = 2,
+ .min = 2, .max = 2, .sub = TX_8X8, .ctx = 2 },
+ [ TX_32X32] = { .w = 8, .h = 8, .lw = 3, .lh = 3,
+ .min = 3, .max = 3, .sub = TX_16X16, .ctx = 3 },
+ [ TX_64X64] = { .w = 16, .h = 16, .lw = 4, .lh = 4,
+ .min = 4, .max = 4, .sub = TX_32X32, .ctx = 4 },
+ [RTX_4X8] = { .w = 1, .h = 2, .lw = 0, .lh = 1,
+ .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 },
+ [RTX_8X4] = { .w = 2, .h = 1, .lw = 1, .lh = 0,
+ .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 },
+ [RTX_8X16] = { .w = 2, .h = 4, .lw = 1, .lh = 2,
+ .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 },
+ [RTX_16X8] = { .w = 4, .h = 2, .lw = 2, .lh = 1,
+ .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 },
+ [RTX_16X32] = { .w = 4, .h = 8, .lw = 2, .lh = 3,
+ .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 },
+ [RTX_32X16] = { .w = 8, .h = 4, .lw = 3, .lh = 2,
+ .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 },
+ [RTX_32X64] = { .w = 8, .h = 16, .lw = 3, .lh = 4,
+ .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 },
+ [RTX_64X32] = { .w = 16, .h = 8, .lw = 4, .lh = 3,
+ .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 },
+ [RTX_4X16] = { .w = 1, .h = 4, .lw = 0, .lh = 2,
+ .min = 0, .max = 2, .sub = RTX_4X8, .ctx = 1 },
+ [RTX_16X4] = { .w = 4, .h = 1, .lw = 2, .lh = 0,
+ .min = 0, .max = 2, .sub = RTX_8X4, .ctx = 1 },
+ [RTX_8X32] = { .w = 2, .h = 8, .lw = 1, .lh = 3,
+ .min = 1, .max = 3, .sub = RTX_8X16, .ctx = 2 },
+ [RTX_32X8] = { .w = 8, .h = 2, .lw = 3, .lh = 1,
+ .min = 1, .max = 3, .sub = RTX_16X8, .ctx = 2 },
+ [RTX_16X64] = { .w = 4, .h = 16, .lw = 2, .lh = 4,
+ .min = 2, .max = 4, .sub = RTX_16X32, .ctx = 3 },
+ [RTX_64X16] = { .w = 16, .h = 4, .lw = 4, .lh = 2,
+ .min = 2, .max = 4, .sub = RTX_32X16, .ctx = 3 },
+};
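Besides the 4-pixel-unit dimensions and their log2s, each entry names in .sub the size a transform splits into when recursive TX splitting is signalled, so e.g. RTX_16X64 leads to RTX_16X32, then TX_16X16, TX_8X8 and finally TX_4X4. A small sketch that just follows that chain:

// Illustrative sketch: count how many recursive splits separate a transform
// size from TX_4X4 by following the .sub links in dav1d_txfm_dimensions.
static int split_chain_depth(int /* enum (Rect)TxfmSize */ t) {
    int depth = 0;
    while (t != TX_4X4) {
        t = dav1d_txfm_dimensions[t].sub;  // e.g. RTX_16X64 -> RTX_16X32 -> TX_16X16 -> ...
        depth++;
    }
    return depth;
}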
+
+const uint8_t /* enum (Rect)TxfmSize */
+ dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */] =
+{
+ [BS_128x128] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 },
+ [BS_128x64] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 },
+ [BS_64x128] = { TX_64X64, TX_32X32, 0, TX_32X32 },
+ [BS_64x64] = { TX_64X64, TX_32X32, TX_32X32, TX_32X32 },
+ [BS_64x32] = { RTX_64X32, RTX_32X16, TX_32X32, TX_32X32 },
+ [BS_64x16] = { RTX_64X16, RTX_32X8, RTX_32X16, RTX_32X16 },
+ [BS_32x64] = { RTX_32X64, RTX_16X32, 0, TX_32X32 },
+ [BS_32x32] = { TX_32X32, TX_16X16, RTX_16X32, TX_32X32 },
+ [BS_32x16] = { RTX_32X16, RTX_16X8, TX_16X16, RTX_32X16 },
+ [BS_32x8] = { RTX_32X8, RTX_16X4, RTX_16X8, RTX_32X8 },
+ [BS_16x64] = { RTX_16X64, RTX_8X32, 0, RTX_16X32 },
+ [BS_16x32] = { RTX_16X32, RTX_8X16, 0, RTX_16X32 },
+ [BS_16x16] = { TX_16X16, TX_8X8, RTX_8X16, TX_16X16 },
+ [BS_16x8] = { RTX_16X8, RTX_8X4, TX_8X8, RTX_16X8 },
+ [BS_16x4] = { RTX_16X4, RTX_8X4, RTX_8X4, RTX_16X4 },
+ [BS_8x32] = { RTX_8X32, RTX_4X16, 0, RTX_8X32 },
+ [BS_8x16] = { RTX_8X16, RTX_4X8, 0, RTX_8X16 },
+ [BS_8x8] = { TX_8X8, TX_4X4, RTX_4X8, TX_8X8 },
+ [BS_8x4] = { RTX_8X4, TX_4X4, TX_4X4, RTX_8X4 },
+ [BS_4x16] = { RTX_4X16, RTX_4X8, 0, RTX_4X16 },
+ [BS_4x8] = { RTX_4X8, TX_4X4, 0, RTX_4X8 },
+ [BS_4x4] = { TX_4X4, TX_4X4, TX_4X4, TX_4X4 },
+};
+
+const uint8_t /* enum TxfmType */
+ dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES] =
+{
+ [DC_PRED] = DCT_DCT,
+ [VERT_PRED] = ADST_DCT,
+ [HOR_PRED] = DCT_ADST,
+ [DIAG_DOWN_LEFT_PRED] = DCT_DCT,
+ [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
+ [VERT_RIGHT_PRED] = ADST_DCT,
+ [HOR_DOWN_PRED] = DCT_ADST,
+ [HOR_UP_PRED] = DCT_ADST,
+ [VERT_LEFT_PRED] = ADST_DCT,
+ [SMOOTH_PRED] = ADST_ADST,
+ [SMOOTH_V_PRED] = ADST_DCT,
+ [SMOOTH_H_PRED] = DCT_ADST,
+ [PAETH_PRED] = ADST_ADST,
+};
+
+const uint8_t /* enum InterPredMode */
+ dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2] =
+{
+ [NEARESTMV_NEARESTMV] = { NEARESTMV, NEARESTMV },
+ [NEARMV_NEARMV] = { NEARMV, NEARMV },
+ [NEWMV_NEWMV] = { NEWMV, NEWMV },
+ [GLOBALMV_GLOBALMV] = { GLOBALMV, GLOBALMV },
+ [NEWMV_NEARESTMV] = { NEWMV, NEARESTMV },
+ [NEWMV_NEARMV] = { NEWMV, NEARMV },
+ [NEARESTMV_NEWMV] = { NEARESTMV, NEWMV },
+ [NEARMV_NEWMV] = { NEARMV, NEWMV },
+};
+
+const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = {
+ [BL_128X128] = N_PARTITIONS - 3,
+ [BL_64X64] = N_PARTITIONS - 1,
+ [BL_32X32] = N_PARTITIONS - 1,
+ [BL_16X16] = N_PARTITIONS - 1,
+ [BL_8X8] = N_SUB8X8_PARTITIONS - 1,
+};
+
+const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = {
+ /* Intra2 */
+ IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
+ /* Intra1 */
+ IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
+ /* Inter2 */
+ IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT,
+ DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
+ /* Inter1 */
+ IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST,
+ DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST,
+ ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
+};
+
+const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
+ [BS_128x128] = 3,
+ [BS_128x64] = 3,
+ [BS_64x128] = 3,
+ [BS_64x64] = 3,
+ [BS_64x32] = 3,
+ [BS_64x16] = 2,
+ [BS_32x64] = 3,
+ [BS_32x32] = 3,
+ [BS_32x16] = 2,
+ [BS_32x8 ] = 1,
+ [BS_16x64] = 2,
+ [BS_16x32] = 2,
+ [BS_16x16] = 2,
+ [BS_16x8 ] = 1,
+ [BS_16x4 ] = 0,
+ [BS_8x32 ] = 1,
+ [BS_8x16 ] = 1,
+ [BS_8x8 ] = 1,
+ [BS_8x4 ] = 0,
+ [BS_4x16 ] = 0,
+ [BS_4x8 ] = 0,
+ [BS_4x4 ] = 0,
+};
+
+const uint8_t dav1d_lo_ctx_offsets[3][5][5] = {
+ { /* w == h */
+ { 0, 1, 6, 6, 21 },
+ { 1, 6, 6, 21, 21 },
+ { 6, 6, 21, 21, 21 },
+ { 6, 21, 21, 21, 21 },
+ { 21, 21, 21, 21, 21 },
+ }, { /* w > h */
+ { 0, 16, 6, 6, 21 },
+ { 16, 16, 6, 21, 21 },
+ { 16, 16, 21, 21, 21 },
+ { 16, 16, 21, 21, 21 },
+ { 16, 16, 21, 21, 21 },
+ }, { /* w < h */
+ { 0, 11, 11, 11, 11 },
+ { 11, 11, 11, 11, 11 },
+ { 6, 6, 21, 21, 21 },
+ { 6, 21, 21, 21, 21 },
+ { 21, 21, 21, 21, 21 },
+ },
+};
+
+const uint8_t dav1d_skip_ctx[5][5] = {
+ { 1, 2, 2, 2, 3 },
+ { 2, 4, 4, 4, 5 },
+ { 2, 4, 4, 4, 5 },
+ { 2, 4, 4, 4, 5 },
+ { 3, 5, 5, 5, 6 },
+};
+
+const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
+ [DCT_DCT] = TX_CLASS_2D,
+ [ADST_DCT] = TX_CLASS_2D,
+ [DCT_ADST] = TX_CLASS_2D,
+ [ADST_ADST] = TX_CLASS_2D,
+ [FLIPADST_DCT] = TX_CLASS_2D,
+ [DCT_FLIPADST] = TX_CLASS_2D,
+ [FLIPADST_FLIPADST] = TX_CLASS_2D,
+ [ADST_FLIPADST] = TX_CLASS_2D,
+ [FLIPADST_ADST] = TX_CLASS_2D,
+ [IDTX] = TX_CLASS_2D,
+ [V_DCT] = TX_CLASS_V,
+ [H_DCT] = TX_CLASS_H,
+ [V_ADST] = TX_CLASS_V,
+ [H_ADST] = TX_CLASS_H,
+ [V_FLIPADST] = TX_CLASS_V,
+ [H_FLIPADST] = TX_CLASS_H,
+ [WHT_WHT] = TX_CLASS_2D,
+};
+
+const uint8_t /* enum Filter2d */ dav1d_filter_2d[DAV1D_N_FILTERS][DAV1D_N_FILTERS] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_REGULAR,
+ [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_REGULAR_SHARP,
+ [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_REGULAR_SMOOTH,
+ }, [DAV1D_FILTER_8TAP_SHARP] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SHARP_REGULAR,
+ [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SHARP,
+ [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SHARP_SMOOTH,
+ }, [DAV1D_FILTER_8TAP_SMOOTH] = {
+ [DAV1D_FILTER_8TAP_REGULAR] = FILTER_2D_8TAP_SMOOTH_REGULAR,
+ [DAV1D_FILTER_8TAP_SHARP] = FILTER_2D_8TAP_SMOOTH_SHARP,
+ [DAV1D_FILTER_8TAP_SMOOTH] = FILTER_2D_8TAP_SMOOTH,
+ }, [DAV1D_FILTER_BILINEAR] = {
+ [DAV1D_FILTER_BILINEAR] = FILTER_2D_BILINEAR,
+ }
+};
+
+const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2] = {
+ [FILTER_2D_8TAP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR },
+ [FILTER_2D_8TAP_REGULAR_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR },
+ [FILTER_2D_8TAP_REGULAR_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR },
+ [FILTER_2D_8TAP_SHARP_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP },
+ [FILTER_2D_8TAP_SHARP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP },
+ [FILTER_2D_8TAP_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP },
+ [FILTER_2D_8TAP_SMOOTH_REGULAR] = { DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH },
+ [FILTER_2D_8TAP_SMOOTH] = { DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH },
+ [FILTER_2D_8TAP_SMOOTH_SHARP] = { DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH },
+ [FILTER_2D_BILINEAR] = { DAV1D_FILTER_BILINEAR, DAV1D_FILTER_BILINEAR },
+};
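dav1d_filter_2d and dav1d_filter_dir above are inverses of each other: composing two 1D modes into a 2D filter and looking that filter back up returns the same pair with the indices swapped (compare FILTER_2D_8TAP_REGULAR_SMOOTH in both tables). A tiny check of that round trip, valid for any pair of 8-tap modes and for bilinear with itself:

// Round-trip sketch: if dav1d_filter_2d[a][b] == f2d,
// then dav1d_filter_dir[f2d] == { b, a }.
static int filter_tables_roundtrip(const enum Dav1dFilterMode a,
                                   const enum Dav1dFilterMode b)
{
    const enum Filter2d f2d = dav1d_filter_2d[a][b];
    return dav1d_filter_dir[f2d][0] == b && dav1d_filter_dir[f2d][1] == a;
}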
+
+const uint8_t dav1d_filter_mode_to_y_mode[5] = {
+ DC_PRED, VERT_PRED, HOR_PRED, HOR_DOWN_PRED, DC_PRED
+};
+
+const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES] = {
+ [DC_PRED] = 0,
+ [VERT_PRED] = 1,
+ [HOR_PRED] = 2,
+ [DIAG_DOWN_LEFT_PRED] = 3,
+ [DIAG_DOWN_RIGHT_PRED] = 4,
+ [VERT_RIGHT_PRED] = 4,
+ [HOR_DOWN_PRED] = 4,
+ [HOR_UP_PRED] = 4,
+ [VERT_LEFT_PRED] = 3,
+ [SMOOTH_PRED] = 0,
+ [SMOOTH_V_PRED] = 1,
+ [SMOOTH_H_PRED] = 2,
+ [PAETH_PRED] = 0,
+};
+
+const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES] = {
+ [BS_32x32] = 6,
+ [BS_32x16] = 5,
+ [BS_32x8] = 8,
+ [BS_16x32] = 4,
+ [BS_16x16] = 3,
+ [BS_16x8] = 2,
+ [BS_8x32] = 7,
+ [BS_8x16] = 1,
+ [BS_8x8] = 0,
+};
+
+const Dav1dWarpedMotionParams dav1d_default_wm_params = {
+ .type = DAV1D_WM_TYPE_IDENTITY,
+ .matrix = {
+ 0, 0, 1 << 16,
+ 0, 0, 1 << 16,
+ },
+ .u.p.alpha = 0,
+ .u.p.beta = 0,
+ .u.p.gamma = 0,
+ .u.p.delta = 0,
+};
+
+const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
+ { 1 * 12 + 0, 2 * 12 + 0 }, // 6
+ { 1 * 12 + 0, 2 * 12 - 1 }, // 7
+ { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+ { 0 * 12 + 1, -1 * 12 + 2 }, // 1
+ { 0 * 12 + 1, 0 * 12 + 2 }, // 2
+ { 0 * 12 + 1, 1 * 12 + 2 }, // 3
+ { 1 * 12 + 1, 2 * 12 + 2 }, // 4
+ { 1 * 12 + 0, 2 * 12 + 1 }, // 5
+ { 1 * 12 + 0, 2 * 12 + 0 }, // 6
+ { 1 * 12 + 0, 2 * 12 - 1 }, // 7
+ { -1 * 12 + 1, -2 * 12 + 2 }, // 0
+ { 0 * 12 + 1, -1 * 12 + 2 }, // 1
+};
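The entries above are flat y * 12 + x offsets into CDEF's padded temporary block (the stride of 12 is inferred from the "* 12" factors, not stated in this file), and the duplicated rows at either end let callers index dir + 0 through dir + 4 without a modulo. A hedged sketch of how such an offset is consumed as a mirrored tap pair:

// Sketch only: for direction `dir` and pass `p` (0 or 1), read the tap and its
// mirror from the padded tmp buffer. `tmp` points at the centre sample and the
// buffer stride is assumed to be 12, matching the table above.
static int cdef_tap_pair_sketch(const uint16_t *const tmp, const int dir, const int p) {
    const int off = dav1d_cdef_directions[dir + 2][p]; // +2 skips the duplicated lead-in rows
    return tmp[off] + tmp[-off];                       // tap plus its mirrored counterpart
}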
+
+const uint16_t ALIGN(dav1d_sgr_params[16][2], 4) = {
+ { 140, 3236 }, { 112, 2158 }, { 93, 1618 }, { 80, 1438 },
+ { 70, 1295 }, { 58, 1177 }, { 47, 1079 }, { 37, 996 },
+ { 30, 925 }, { 25, 863 }, { 0, 2589 }, { 0, 1618 },
+ { 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 },
+};
+
+const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 64) = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
+ 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0
+};
+
+const int8_t ALIGN(dav1d_mc_subpel_filters[6][15][8], 8) = {
+ [DAV1D_FILTER_8TAP_REGULAR] = {
+ { 0, 1, -3, 63, 4, -1, 0, 0 },
+ { 0, 1, -5, 61, 9, -2, 0, 0 },
+ { 0, 1, -6, 58, 14, -4, 1, 0 },
+ { 0, 1, -7, 55, 19, -5, 1, 0 },
+ { 0, 1, -7, 51, 24, -6, 1, 0 },
+ { 0, 1, -8, 47, 29, -6, 1, 0 },
+ { 0, 1, -7, 42, 33, -6, 1, 0 },
+ { 0, 1, -7, 38, 38, -7, 1, 0 },
+ { 0, 1, -6, 33, 42, -7, 1, 0 },
+ { 0, 1, -6, 29, 47, -8, 1, 0 },
+ { 0, 1, -6, 24, 51, -7, 1, 0 },
+ { 0, 1, -5, 19, 55, -7, 1, 0 },
+ { 0, 1, -4, 14, 58, -6, 1, 0 },
+ { 0, 0, -2, 9, 61, -5, 1, 0 },
+ { 0, 0, -1, 4, 63, -3, 1, 0 }
+ }, [DAV1D_FILTER_8TAP_SMOOTH] = {
+ { 0, 1, 14, 31, 17, 1, 0, 0 },
+ { 0, 0, 13, 31, 18, 2, 0, 0 },
+ { 0, 0, 11, 31, 20, 2, 0, 0 },
+ { 0, 0, 10, 30, 21, 3, 0, 0 },
+ { 0, 0, 9, 29, 22, 4, 0, 0 },
+ { 0, 0, 8, 28, 23, 5, 0, 0 },
+ { 0, -1, 8, 27, 24, 6, 0, 0 },
+ { 0, -1, 7, 26, 26, 7, -1, 0 },
+ { 0, 0, 6, 24, 27, 8, -1, 0 },
+ { 0, 0, 5, 23, 28, 8, 0, 0 },
+ { 0, 0, 4, 22, 29, 9, 0, 0 },
+ { 0, 0, 3, 21, 30, 10, 0, 0 },
+ { 0, 0, 2, 20, 31, 11, 0, 0 },
+ { 0, 0, 2, 18, 31, 13, 0, 0 },
+ { 0, 0, 1, 17, 31, 14, 1, 0 }
+ }, [DAV1D_FILTER_8TAP_SHARP] = {
+ { -1, 1, -3, 63, 4, -1, 1, 0 },
+ { -1, 3, -6, 62, 8, -3, 2, -1 },
+ { -1, 4, -9, 60, 13, -5, 3, -1 },
+ { -2, 5, -11, 58, 19, -7, 3, -1 },
+ { -2, 5, -11, 54, 24, -9, 4, -1 },
+ { -2, 5, -12, 50, 30, -10, 4, -1 },
+ { -2, 5, -12, 45, 35, -11, 5, -1 },
+ { -2, 6, -12, 40, 40, -12, 6, -2 },
+ { -1, 5, -11, 35, 45, -12, 5, -2 },
+ { -1, 4, -10, 30, 50, -12, 5, -2 },
+ { -1, 4, -9, 24, 54, -11, 5, -2 },
+ { -1, 3, -7, 19, 58, -11, 5, -2 },
+ { -1, 3, -5, 13, 60, -9, 4, -1 },
+ { -1, 2, -3, 8, 62, -6, 3, -1 },
+ { 0, 1, -1, 4, 63, -3, 1, -1 }
+ /* width <= 4 */
+ }, [3 + DAV1D_FILTER_8TAP_REGULAR] = {
+ { 0, 0, -2, 63, 4, -1, 0, 0 },
+ { 0, 0, -4, 61, 9, -2, 0, 0 },
+ { 0, 0, -5, 58, 14, -3, 0, 0 },
+ { 0, 0, -6, 55, 19, -4, 0, 0 },
+ { 0, 0, -6, 51, 24, -5, 0, 0 },
+ { 0, 0, -7, 47, 29, -5, 0, 0 },
+ { 0, 0, -6, 42, 33, -5, 0, 0 },
+ { 0, 0, -6, 38, 38, -6, 0, 0 },
+ { 0, 0, -5, 33, 42, -6, 0, 0 },
+ { 0, 0, -5, 29, 47, -7, 0, 0 },
+ { 0, 0, -5, 24, 51, -6, 0, 0 },
+ { 0, 0, -4, 19, 55, -6, 0, 0 },
+ { 0, 0, -3, 14, 58, -5, 0, 0 },
+ { 0, 0, -2, 9, 61, -4, 0, 0 },
+ { 0, 0, -1, 4, 63, -2, 0, 0 }
+ }, [3 + DAV1D_FILTER_8TAP_SMOOTH] = {
+ { 0, 0, 15, 31, 17, 1, 0, 0 },
+ { 0, 0, 13, 31, 18, 2, 0, 0 },
+ { 0, 0, 11, 31, 20, 2, 0, 0 },
+ { 0, 0, 10, 30, 21, 3, 0, 0 },
+ { 0, 0, 9, 29, 22, 4, 0, 0 },
+ { 0, 0, 8, 28, 23, 5, 0, 0 },
+ { 0, 0, 7, 27, 24, 6, 0, 0 },
+ { 0, 0, 6, 26, 26, 6, 0, 0 },
+ { 0, 0, 6, 24, 27, 7, 0, 0 },
+ { 0, 0, 5, 23, 28, 8, 0, 0 },
+ { 0, 0, 4, 22, 29, 9, 0, 0 },
+ { 0, 0, 3, 21, 30, 10, 0, 0 },
+ { 0, 0, 2, 20, 31, 11, 0, 0 },
+ { 0, 0, 2, 18, 31, 13, 0, 0 },
+ { 0, 0, 1, 17, 31, 15, 0, 0 }
+ /* Bilinear scaled filtering is used very rarely, so instead of dedicated
+ * code we add one more table entry and reuse the put/prep_8tap_scaled
+ * code, which then acts as a scaled bilinear filter. */
+ }, [5] = {
+ { 0, 0, 0, 60, 4, 0, 0, 0 },
+ { 0, 0, 0, 56, 8, 0, 0, 0 },
+ { 0, 0, 0, 52, 12, 0, 0, 0 },
+ { 0, 0, 0, 48, 16, 0, 0, 0 },
+ { 0, 0, 0, 44, 20, 0, 0, 0 },
+ { 0, 0, 0, 40, 24, 0, 0, 0 },
+ { 0, 0, 0, 36, 28, 0, 0, 0 },
+ { 0, 0, 0, 32, 32, 0, 0, 0 },
+ { 0, 0, 0, 28, 36, 0, 0, 0 },
+ { 0, 0, 0, 24, 40, 0, 0, 0 },
+ { 0, 0, 0, 20, 44, 0, 0, 0 },
+ { 0, 0, 0, 16, 48, 0, 0, 0 },
+ { 0, 0, 0, 12, 52, 0, 0, 0 },
+ { 0, 0, 0, 8, 56, 0, 0, 0 },
+ { 0, 0, 0, 4, 60, 0, 0, 0 }
+ }
+};
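Each set above is 15 rows of 8 taps, one row per non-zero subpel phase; as the inline comments note, the entries at 3 + filter are 4-tap variants for narrow blocks, and entry 5 is the scaled-bilinear stand-in. The following is only a hedged sketch of that indexing; dav1d's mc code differs in detail, and in particular how SHARP is handled for small blocks is an assumption here.

// Hedged sketch: pick a row of taps for horizontal filtering. `filter` is a
// DAV1D_FILTER_8TAP_* value, `mx` the 1-based subpel phase (1..15) and `w` the
// block width. Mapping SHARP onto the regular 4-tap set is an assumption.
static const int8_t *pick_subpel_taps(const enum Dav1dFilterMode filter,
                                      const int mx, const int w)
{
    if (w > 4)
        return dav1d_mc_subpel_filters[filter][mx - 1];
    const int small = filter == DAV1D_FILTER_8TAP_SMOOTH ? 1 : 0;
    return dav1d_mc_subpel_filters[3 + small][mx - 1];
}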
+
+const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = {
+ // [-1, 0)
+ { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, -1, 127, 2, 0, 0, 0, 0 },
+ { 1, -3, 127, 4, - 1, 0, 0, 0 }, { 1, -4, 126, 6, -2, 1, 0, 0 },
+ { 1, -5, 126, 8, - 3, 1, 0, 0 }, { 1, -6, 125, 11, -4, 1, 0, 0 },
+ { 1, -7, 124, 13, - 4, 1, 0, 0 }, { 2, -8, 123, 15, -5, 1, 0, 0 },
+ { 2, -9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, -6, 1, 0, 0 },
+ { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, -8, 2, 0, 0 },
+ { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, -9, 2, 0, 0 },
+ { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
+ { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
+ { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
+ { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
+ { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
+ { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
+ { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
+ { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
+ { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
+ { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
+ { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
+ { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
+ { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
+ { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
+ { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
+ { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
+ { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
+ { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
+ { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
+ { 2, -8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
+ { 2, -7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
+ { 1, -6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
+ { 1, -4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
+ { 1, -3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
+ { 0, -1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0 }, { 0, 0, -1, 127, 2, 0, 0, 0 },
+ { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -5, 127, 6, -2, 1, 0 },
+ { 0, 2, -6, 126, 8, -3, 1, 0 }, { -1, 2, -7, 126, 11, -4, 2, -1 },
+ { -1, 3, -8, 125, 13, -5, 2, -1 }, { -1, 3, -10, 124, 16, -6, 3, -1 },
+ { -1, 4, -11, 123, 18, -7, 3, -1 }, { -1, 4, -12, 122, 20, -7, 3, -1 },
+ { -1, 4, -13, 121, 23, -8, 3, -1 }, { -2, 5, -14, 120, 25, -9, 4, -1 },
+ { -1, 5, -15, 119, 27, -10, 4, -1 }, { -1, 5, -16, 118, 30, -11, 4, -1 },
+ { -2, 6, -17, 116, 33, -12, 5, -1 }, { -2, 6, -17, 114, 35, -12, 5, -1 },
+ { -2, 6, -18, 113, 38, -13, 5, -1 }, { -2, 7, -19, 111, 41, -14, 6, -2 },
+ { -2, 7, -19, 110, 43, -15, 6, -2 }, { -2, 7, -20, 108, 46, -15, 6, -2 },
+ { -2, 7, -20, 106, 49, -16, 6, -2 }, { -2, 7, -21, 104, 51, -16, 7, -2 },
+ { -2, 7, -21, 102, 54, -17, 7, -2 }, { -2, 8, -21, 100, 56, -18, 7, -2 },
+ { -2, 8, -22, 98, 59, -18, 7, -2 }, { -2, 8, -22, 96, 62, -19, 7, -2 },
+ { -2, 8, -22, 94, 64, -19, 7, -2 }, { -2, 8, -22, 91, 67, -20, 8, -2 },
+ { -2, 8, -22, 89, 69, -20, 8, -2 }, { -2, 8, -22, 87, 72, -21, 8, -2 },
+ { -2, 8, -21, 84, 74, -21, 8, -2 }, { -2, 8, -22, 82, 77, -21, 8, -2 },
+ { -2, 8, -21, 79, 79, -21, 8, -2 }, { -2, 8, -21, 77, 82, -22, 8, -2 },
+ { -2, 8, -21, 74, 84, -21, 8, -2 }, { -2, 8, -21, 72, 87, -22, 8, -2 },
+ { -2, 8, -20, 69, 89, -22, 8, -2 }, { -2, 8, -20, 67, 91, -22, 8, -2 },
+ { -2, 7, -19, 64, 94, -22, 8, -2 }, { -2, 7, -19, 62, 96, -22, 8, -2 },
+ { -2, 7, -18, 59, 98, -22, 8, -2 }, { -2, 7, -18, 56, 100, -21, 8, -2 },
+ { -2, 7, -17, 54, 102, -21, 7, -2 }, { -2, 7, -16, 51, 104, -21, 7, -2 },
+ { -2, 6, -16, 49, 106, -20, 7, -2 }, { -2, 6, -15, 46, 108, -20, 7, -2 },
+ { -2, 6, -15, 43, 110, -19, 7, -2 }, { -2, 6, -14, 41, 111, -19, 7, -2 },
+ { -1, 5, -13, 38, 113, -18, 6, -2 }, { -1, 5, -12, 35, 114, -17, 6, -2 },
+ { -1, 5, -12, 33, 116, -17, 6, -2 }, { -1, 4, -11, 30, 118, -16, 5, -1 },
+ { -1, 4, -10, 27, 119, -15, 5, -1 }, { -1, 4, -9, 25, 120, -14, 5, -2 },
+ { -1, 3, -8, 23, 121, -13, 4, -1 }, { -1, 3, -7, 20, 122, -12, 4, -1 },
+ { -1, 3, -7, 18, 123, -11, 4, -1 }, { -1, 3, -6, 16, 124, -10, 3, -1 },
+ { -1, 2, -5, 13, 125, -8, 3, -1 }, { -1, 2, -4, 11, 126, -7, 2, -1 },
+ { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -2, 6, 127, -5, 1, 0 },
+ { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, 0, 2, 127, -1, 0, 0 },
+ // [1, 2)
+ { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, -1, 127, 2, 0, 0 },
+ { 0, 0, 1, -3, 127, 4, -1, 0 }, { 0, 0, 1, -4, 126, 6, -2, 1 },
+ { 0, 0, 1, -5, 126, 8, -3, 1 }, { 0, 0, 1, -6, 125, 11, -4, 1 },
+ { 0, 0, 1, -7, 124, 13, -4, 1 }, { 0, 0, 2, -8, 123, 15, -5, 1 },
+ { 0, 0, 2, -9, 122, 18, -6, 1 }, { 0, 0, 2, -10, 121, 20, -6, 1 },
+ { 0, 0, 2, -11, 120, 22, -7, 2 }, { 0, 0, 2, -12, 119, 25, -8, 2 },
+ { 0, 0, 3, -13, 117, 27, -8, 2 }, { 0, 0, 3, -13, 116, 29, -9, 2 },
+ { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
+ { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
+ { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
+ { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
+ { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
+ { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
+ { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
+ { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
+ { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
+ { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
+ { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
+ { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
+ { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
+ { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
+ { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
+ { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
+ { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
+ { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
+ { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, -9, 29, 116, -13, 3 },
+ { 0, 0, 2, -8, 27, 117, -13, 3 }, { 0, 0, 2, -8, 25, 119, -12, 2 },
+ { 0, 0, 2, -7, 22, 120, -11, 2 }, { 0, 0, 1, -6, 20, 121, -10, 2 },
+ { 0, 0, 1, -6, 18, 122, -9, 2 }, { 0, 0, 1, -5, 15, 123, -8, 2 },
+ { 0, 0, 1, -4, 13, 124, -7, 1 }, { 0, 0, 1, -4, 11, 125, -6, 1 },
+ { 0, 0, 1, -3, 8, 126, -5, 1 }, { 0, 0, 1, -2, 6, 126, -4, 1 },
+ { 0, 0, 0, -1, 4, 127, -3, 1 }, { 0, 0, 0, 0, 2, 127, -1, 0 },
+ // dummy (replicate row index 191)
+ { 0, 0, 0, 0, 2, 127, -1, 0 },
+};
+
+const int8_t ALIGN(dav1d_resize_filter[64][8], 8) = {
+ { 0, 0, 0, -128, 0, 0, 0, 0 }, { 0, 0, 1, -128, -2, 1, 0, 0 },
+ { 0, -1, 3, -127, -4, 2, -1, 0 }, { 0, -1, 4, -127, -6, 3, -1, 0 },
+ { 0, -2, 6, -126, -8, 3, -1, 0 }, { 0, -2, 7, -125, -11, 4, -1, 0 },
+ { 1, -2, 8, -125, -13, 5, -2, 0 }, { 1, -3, 9, -124, -15, 6, -2, 0 },
+ { 1, -3, 10, -123, -18, 6, -2, 1 }, { 1, -3, 11, -122, -20, 7, -3, 1 },
+ { 1, -4, 12, -121, -22, 8, -3, 1 }, { 1, -4, 13, -120, -25, 9, -3, 1 },
+ { 1, -4, 14, -118, -28, 9, -3, 1 }, { 1, -4, 15, -117, -30, 10, -4, 1 },
+ { 1, -5, 16, -116, -32, 11, -4, 1 }, { 1, -5, 16, -114, -35, 12, -4, 1 },
+ { 1, -5, 17, -112, -38, 12, -4, 1 }, { 1, -5, 18, -111, -40, 13, -5, 1 },
+ { 1, -5, 18, -109, -43, 14, -5, 1 }, { 1, -6, 19, -107, -45, 14, -5, 1 },
+ { 1, -6, 19, -105, -48, 15, -5, 1 }, { 1, -6, 19, -103, -51, 16, -5, 1 },
+ { 1, -6, 20, -101, -53, 16, -6, 1 }, { 1, -6, 20, -99, -56, 17, -6, 1 },
+ { 1, -6, 20, -97, -58, 17, -6, 1 }, { 1, -6, 20, -95, -61, 18, -6, 1 },
+ { 2, -7, 20, -93, -64, 18, -6, 2 }, { 2, -7, 20, -91, -66, 19, -6, 1 },
+ { 2, -7, 20, -88, -69, 19, -6, 1 }, { 2, -7, 20, -86, -71, 19, -6, 1 },
+ { 2, -7, 20, -84, -74, 20, -7, 2 }, { 2, -7, 20, -81, -76, 20, -7, 1 },
+ { 2, -7, 20, -79, -79, 20, -7, 2 }, { 1, -7, 20, -76, -81, 20, -7, 2 },
+ { 2, -7, 20, -74, -84, 20, -7, 2 }, { 1, -6, 19, -71, -86, 20, -7, 2 },
+ { 1, -6, 19, -69, -88, 20, -7, 2 }, { 1, -6, 19, -66, -91, 20, -7, 2 },
+ { 2, -6, 18, -64, -93, 20, -7, 2 }, { 1, -6, 18, -61, -95, 20, -6, 1 },
+ { 1, -6, 17, -58, -97, 20, -6, 1 }, { 1, -6, 17, -56, -99, 20, -6, 1 },
+ { 1, -6, 16, -53, -101, 20, -6, 1 }, { 1, -5, 16, -51, -103, 19, -6, 1 },
+ { 1, -5, 15, -48, -105, 19, -6, 1 }, { 1, -5, 14, -45, -107, 19, -6, 1 },
+ { 1, -5, 14, -43, -109, 18, -5, 1 }, { 1, -5, 13, -40, -111, 18, -5, 1 },
+ { 1, -4, 12, -38, -112, 17, -5, 1 }, { 1, -4, 12, -35, -114, 16, -5, 1 },
+ { 1, -4, 11, -32, -116, 16, -5, 1 }, { 1, -4, 10, -30, -117, 15, -4, 1 },
+ { 1, -3, 9, -28, -118, 14, -4, 1 }, { 1, -3, 9, -25, -120, 13, -4, 1 },
+ { 1, -3, 8, -22, -121, 12, -4, 1 }, { 1, -3, 7, -20, -122, 11, -3, 1 },
+ { 1, -2, 6, -18, -123, 10, -3, 1 }, { 0, -2, 6, -15, -124, 9, -3, 1 },
+ { 0, -2, 5, -13, -125, 8, -2, 1 }, { 0, -1, 4, -11, -125, 7, -2, 0 },
+ { 0, -1, 3, -8, -126, 6, -2, 0 }, { 0, -1, 3, -6, -127, 4, -1, 0 },
+ { 0, -1, 2, -4, -127, 3, -1, 0 }, { 0, 0, 1, -2, -128, 1, 0, 0 },
+};
+
+const uint8_t ALIGN(dav1d_sm_weights[128], 16) = {
+ // Unused, because we always offset by bs, which is at least 2.
+ 0, 0,
+ // bs = 2
+ 255, 128,
+ // bs = 4
+ 255, 149, 85, 64,
+ // bs = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // bs = 16
+ 255, 225, 196, 170, 145, 123, 102, 84,
+ 68, 54, 43, 33, 26, 20, 17, 16,
+ // bs = 32
+ 255, 240, 225, 210, 196, 182, 169, 157,
+ 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25,
+ 21, 17, 14, 12, 10, 9, 8, 8,
+ // bs = 64
+ 255, 248, 240, 233, 225, 218, 210, 203,
+ 196, 189, 182, 176, 169, 163, 156, 150,
+ 144, 138, 133, 127, 121, 116, 111, 106,
+ 101, 96, 91, 86, 82, 77, 73, 69,
+ 65, 61, 57, 54, 50, 47, 44, 41,
+ 38, 35, 32, 29, 27, 25, 22, 20,
+ 18, 16, 15, 13, 12, 10, 9, 8,
+ 7, 6, 6, 5, 5, 4, 4, 4
+};
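+
+/* Blend weights (in 1/256 units) for the SMOOTH family of intra predictors,
+ * grouped by block size as annotated above. For a block dimension bs, entry i
+ * presumably gives the weight of the known edge sample at distance i from that
+ * edge, with 256 - w going to the opposite (extrapolated) edge, so the
+ * prediction fades from the known edge towards the far corner. */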
+
+const uint16_t dav1d_dr_intra_derivative[44] = {
+ // Values that are 0 will never be used
+ 0, // Angles:
+ 1023, 0, // 3, 93, 183
+ 547, // 6, 96, 186
+ 372, 0, 0, // 9, 99, 189
+ 273, // 14, 104, 194
+ 215, 0, // 17, 107, 197
+ 178, // 20, 110, 200
+ 151, 0, // 23, 113, 203 (113 & 203 are base angles)
+ 132, // 26, 116, 206
+ 116, 0, // 29, 119, 209
+ 102, 0, // 32, 122, 212
+ 90, // 36, 126, 216
+ 80, 0, // 39, 129, 219
+ 71, // 42, 132, 222
+ 64, 0, // 45, 135, 225 (45 & 135 are base angles)
+ 57, // 48, 138, 228
+ 51, 0, // 51, 141, 231
+ 45, 0, // 54, 144, 234
+ 40, // 58, 148, 238
+ 35, 0, // 61, 151, 241
+ 31, // 64, 154, 244
+ 27, 0, // 67, 157, 247 (67 & 157 are base angles)
+ 23, // 70, 160, 250
+ 19, 0, // 73, 163, 253
+ 15, 0, // 76, 166, 256
+ 11, 0, // 81, 171, 261
+ 7, // 84, 174, 264
+ 3 // 87, 177, 267
+};
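+
+/* The layout above implies indexing by (angle >> 1): e.g. angle 3 maps to entry
+ * 1 (1023) and angle 87 to entry 43 (3). Each value is presumably the
+ * fixed-point step the directional (Z1/Z2/Z3) predictors advance along the edge
+ * per row/column; slots that no legal angle can reach stay 0, as noted above. */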
+
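+/* F() scatters the seven filter-intra taps of each of the 8 predicted pixels
+ * into an arch-specific byte layout within a 64-byte row: on x86 consecutive
+ * taps are interleaved in pairs (presumably so SIMD code can consume them with
+ * paired multiply-adds), while elsewhere each tap gets its own 8-byte group for
+ * per-tap vector multiply-accumulate. */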
+#if ARCH_X86
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+ [2*idx+0] = f0, [2*idx+1] = f1, \
+ [2*idx+16] = f2, [2*idx+17] = f3, \
+ [2*idx+32] = f4, [2*idx+33] = f5, \
+ [2*idx+48] = f6
+#else
+#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
+ [1*idx+0] = f0, [1*idx+8] = f1, \
+ [1*idx+16] = f2, [1*idx+24] = f3, \
+ [1*idx+32] = f4, [1*idx+40] = f5, \
+ [1*idx+48] = f6
+#endif
+const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 64) = {
+ {
+ F( 0, -6, 10, 0, 0, 0, 12, 0 ),
+ F( 1, -5, 2, 10, 0, 0, 9, 0 ),
+ F( 2, -3, 1, 1, 10, 0, 7, 0 ),
+ F( 3, -3, 1, 1, 2, 10, 5, 0 ),
+ F( 4, -4, 6, 0, 0, 0, 2, 12 ),
+ F( 5, -3, 2, 6, 0, 0, 2, 9 ),
+ F( 6, -3, 2, 2, 6, 0, 2, 7 ),
+ F( 7, -3, 1, 2, 2, 6, 3, 5 ),
+ }, {
+ F( 0, -10, 16, 0, 0, 0, 10, 0 ),
+ F( 1, -6, 0, 16, 0, 0, 6, 0 ),
+ F( 2, -4, 0, 0, 16, 0, 4, 0 ),
+ F( 3, -2, 0, 0, 0, 16, 2, 0 ),
+ F( 4, -10, 16, 0, 0, 0, 0, 10 ),
+ F( 5, -6, 0, 16, 0, 0, 0, 6 ),
+ F( 6, -4, 0, 0, 16, 0, 0, 4 ),
+ F( 7, -2, 0, 0, 0, 16, 0, 2 ),
+ }, {
+ F( 0, -8, 8, 0, 0, 0, 16, 0 ),
+ F( 1, -8, 0, 8, 0, 0, 16, 0 ),
+ F( 2, -8, 0, 0, 8, 0, 16, 0 ),
+ F( 3, -8, 0, 0, 0, 8, 16, 0 ),
+ F( 4, -4, 4, 0, 0, 0, 0, 16 ),
+ F( 5, -4, 0, 4, 0, 0, 0, 16 ),
+ F( 6, -4, 0, 0, 4, 0, 0, 16 ),
+ F( 7, -4, 0, 0, 0, 4, 0, 16 ),
+ }, {
+ F( 0, -2, 8, 0, 0, 0, 10, 0 ),
+ F( 1, -1, 3, 8, 0, 0, 6, 0 ),
+ F( 2, -1, 2, 3, 8, 0, 4, 0 ),
+ F( 3, 0, 1, 2, 3, 8, 2, 0 ),
+ F( 4, -1, 4, 0, 0, 0, 3, 10 ),
+ F( 5, -1, 3, 4, 0, 0, 4, 6 ),
+ F( 6, -1, 2, 3, 4, 0, 4, 4 ),
+ F( 7, -1, 2, 2, 3, 4, 3, 3 ),
+ }, {
+ F( 0, -12, 14, 0, 0, 0, 14, 0 ),
+ F( 1, -10, 0, 14, 0, 0, 12, 0 ),
+ F( 2, -9, 0, 0, 14, 0, 11, 0 ),
+ F( 3, -8, 0, 0, 0, 14, 10, 0 ),
+ F( 4, -10, 12, 0, 0, 0, 0, 14 ),
+ F( 5, -9, 1, 12, 0, 0, 0, 12 ),
+ F( 6, -8, 0, 0, 12, 0, 1, 11 ),
+ F( 7, -7, 0, 0, 1, 12, 1, 9 ),
+ }
+};
+
+const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = {
+ /* Unused */
+ 0, 0,
+ /* 2 */
+ 19, 0,
+ /* 4 */
+ 25, 14, 5, 0,
+ /* 8 */
+ 28, 22, 16, 11, 7, 3, 0, 0,
+ /* 16 */
+ 30, 27, 24, 21, 18, 15, 12, 10, 8, 6, 4, 3, 0, 0, 0, 0,
+ /* 32 */
+ 31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11, 9,
+ 8, 7, 6, 5, 4, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0,
+};
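+
+/* Overlapped-block motion compensation blend weights (presumably out of 64),
+ * grouped by overlap length as annotated above. Entry i appears to be the
+ * weight given to the overlapping neighbour's prediction at distance i from the
+ * shared edge, with 64 - w left for the current block, the neighbour's
+ * influence decaying to zero towards the far side of the overlap. */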
+
+// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512
+const int16_t dav1d_gaussian_sequence[2048] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484
+};
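+
+/* With 2048 (= 2^11) entries, the film grain synthesis presumably indexes this
+ * table with 11-bit pseudo-random values derived from the grain seed, then
+ * scales the result down according to bit depth and grain_scale_shift, per the
+ * AV1 film grain process. */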
diff --git a/third_party/dav1d/src/tables.h b/third_party/dav1d/src/tables.h
new file mode 100644
index 0000000000..f3c00cfb00
--- /dev/null
+++ b/third_party/dav1d/src/tables.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_TABLES_H
+#define DAV1D_SRC_TABLES_H
+
+#include <stdint.h>
+
+#include "common/intops.h"
+
+#include "src/levels.h"
+
+EXTERN const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
+EXTERN const uint8_t /* enum BlockSize */
+ dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2];
+// width, height (in 4px blocks), log2 versions of these two
+EXTERN const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
+typedef struct TxfmInfo {
+    // width, height (in 4px blocks), log2 of them, min/max of log2, sub, ctx
+ uint8_t w, h, lw, lh, min, max, sub, ctx;
+} TxfmInfo;
+EXTERN const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
+EXTERN const uint8_t /* enum (Rect)TxfmSize */
+ dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */];
+EXTERN const uint8_t /* enum TxfmType */
+ dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES];
+
+EXTERN const uint8_t /* enum InterPredMode */
+ dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
+
+EXTERN const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
+EXTERN const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
+
+EXTERN const uint8_t dav1d_filter_mode_to_y_mode[5];
+EXTERN const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
+EXTERN const uint8_t dav1d_lo_ctx_offsets[3][5][5];
+EXTERN const uint8_t dav1d_skip_ctx[5][5];
+EXTERN const uint8_t /* enum TxClass */
+ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
+EXTERN const uint8_t /* enum Filter2d */
+ dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */];
+EXTERN const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
+EXTERN const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
+EXTERN const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
+
+static const unsigned cfl_allowed_mask =
+ (1 << BS_32x32) |
+ (1 << BS_32x16) |
+ (1 << BS_32x8) |
+ (1 << BS_16x32) |
+ (1 << BS_16x16) |
+ (1 << BS_16x8) |
+ (1 << BS_16x4) |
+ (1 << BS_8x32) |
+ (1 << BS_8x16) |
+ (1 << BS_8x8) |
+ (1 << BS_8x4) |
+ (1 << BS_4x16) |
+ (1 << BS_4x8) |
+ (1 << BS_4x4);
+
+static const unsigned wedge_allowed_mask =
+ (1 << BS_32x32) |
+ (1 << BS_32x16) |
+ (1 << BS_32x8) |
+ (1 << BS_16x32) |
+ (1 << BS_16x16) |
+ (1 << BS_16x8) |
+ (1 << BS_8x32) |
+ (1 << BS_8x16) |
+ (1 << BS_8x8);
+
+static const unsigned interintra_allowed_mask =
+ (1 << BS_32x32) |
+ (1 << BS_32x16) |
+ (1 << BS_16x32) |
+ (1 << BS_16x16) |
+ (1 << BS_16x8) |
+ (1 << BS_8x16) |
+ (1 << BS_8x8);
+
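+/* The three masks above are indexed by enum BlockSize; a tool is presumably
+ * allowed for block size bs when the corresponding bit is set, e.g.:
+ *     const int cfl_allowed = !!(cfl_allowed_mask & (1 << bs));
+ * Keeping them static const in the header means such tests can fold into an
+ * immediate. */
+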
+EXTERN const Dav1dWarpedMotionParams dav1d_default_wm_params;
+
+EXTERN const int8_t dav1d_cdef_directions[12][2];
+
+EXTERN const uint16_t dav1d_sgr_params[16][2];
+EXTERN const uint8_t dav1d_sgr_x_by_x[256];
+
+EXTERN const int8_t dav1d_mc_subpel_filters[6][15][8];
+EXTERN const int8_t dav1d_mc_warp_filter[193][8];
+EXTERN const int8_t dav1d_resize_filter[64][8];
+
+EXTERN const uint8_t dav1d_sm_weights[128];
+EXTERN const uint16_t dav1d_dr_intra_derivative[44];
+EXTERN const int8_t dav1d_filter_intra_taps[5][64];
+
+EXTERN const uint8_t dav1d_obmc_masks[64];
+
+EXTERN const int16_t dav1d_gaussian_sequence[2048]; // for fgs
+
+#endif /* DAV1D_SRC_TABLES_H */
diff --git a/third_party/dav1d/src/thread.h b/third_party/dav1d/src/thread.h
new file mode 100644
index 0000000000..b091e4f26d
--- /dev/null
+++ b/third_party/dav1d/src/thread.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_H
+#define DAV1D_SRC_THREAD_H
+
+#if defined(_WIN32)
+
+#include <limits.h>
+#include <windows.h>
+
+#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
+
+typedef struct {
+ HANDLE h;
+ void *(*func)(void*);
+ void *arg;
+} pthread_t;
+
+typedef struct {
+ unsigned stack_size;
+} pthread_attr_t;
+
+typedef SRWLOCK pthread_mutex_t;
+typedef CONDITION_VARIABLE pthread_cond_t;
+typedef INIT_ONCE pthread_once_t;
+
+void dav1d_init_thread(void);
+void dav1d_set_thread_name(const wchar_t *name);
+#define dav1d_set_thread_name(name) dav1d_set_thread_name(L##name)
+
+int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
+ void *(*func)(void*), void *arg);
+int dav1d_pthread_join(pthread_t *thread, void **res);
+int dav1d_pthread_once(pthread_once_t *once_control,
+ void (*init_routine)(void));
+
+#define pthread_create dav1d_pthread_create
+#define pthread_join(thread, res) dav1d_pthread_join(&(thread), res)
+#define pthread_once dav1d_pthread_once
+
+static inline int pthread_attr_init(pthread_attr_t *const attr) {
+ attr->stack_size = 0;
+ return 0;
+}
+
+static inline int pthread_attr_destroy(pthread_attr_t *const attr) {
+ return 0;
+}
+
+static inline int pthread_attr_setstacksize(pthread_attr_t *const attr,
+ const size_t stack_size)
+{
+ if (stack_size > UINT_MAX) return 1;
+ attr->stack_size = (unsigned) stack_size;
+ return 0;
+}
+
+static inline int pthread_mutex_init(pthread_mutex_t *const mutex,
+ const void *const attr)
+{
+ InitializeSRWLock(mutex);
+ return 0;
+}
+
+static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ return 0;
+}
+
+static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ AcquireSRWLockExclusive(mutex);
+ return 0;
+}
+
+static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ ReleaseSRWLockExclusive(mutex);
+ return 0;
+}
+
+static inline int pthread_cond_init(pthread_cond_t *const cond,
+ const void *const attr)
+{
+ InitializeConditionVariable(cond);
+ return 0;
+}
+
+static inline int pthread_cond_destroy(pthread_cond_t *const cond) {
+ return 0;
+}
+
+static inline int pthread_cond_wait(pthread_cond_t *const cond,
+ pthread_mutex_t *const mutex)
+{
+ return !SleepConditionVariableSRW(cond, mutex, INFINITE, 0);
+}
+
+static inline int pthread_cond_signal(pthread_cond_t *const cond) {
+ WakeConditionVariable(cond);
+ return 0;
+}
+
+static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
+ WakeAllConditionVariable(cond);
+ return 0;
+}
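+
+/* The wrappers above map the subset of the pthread API that dav1d uses onto
+ * Win32 SRW locks and condition variables; they are intended as drop-ins for
+ * code such as (illustrative only):
+ *
+ *     pthread_mutex_lock(&m);            // AcquireSRWLockExclusive()
+ *     while (!ready)
+ *         pthread_cond_wait(&c, &m);     // SleepConditionVariableSRW()
+ *     pthread_mutex_unlock(&m);          // ReleaseSRWLockExclusive()
+ *
+ * Note that SRW locks are non-recursive and must be released by the acquiring
+ * thread, which this shim assumes the calling code respects. */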
+
+#else
+
+#include <pthread.h>
+
+#define dav1d_init_thread() do {} while (0)
+
+/* Thread naming support */
+
+#ifdef __linux__
+
+#include <sys/prctl.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ prctl(PR_SET_NAME, name);
+}
+
+#elif defined(__APPLE__)
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ pthread_setname_np(name);
+}
+
+#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
+
+#if defined(__FreeBSD__)
+ /* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
+#define _SYS_PARAM_H_
+#include <sys/types.h>
+#endif
+#include <pthread_np.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ pthread_set_name_np(pthread_self(), name);
+}
+
+#elif defined(__NetBSD__)
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ pthread_setname_np(pthread_self(), "%s", (void*)name);
+}
+
+#elif defined(__HAIKU__)
+
+#include <os/kernel/OS.h>
+
+static inline void dav1d_set_thread_name(const char *const name) {
+ rename_thread(find_thread(NULL), name);
+}
+
+#else
+
+#define dav1d_set_thread_name(name) do {} while (0)
+
+#endif
+
+#endif
+
+#endif /* DAV1D_SRC_THREAD_H */
diff --git a/third_party/dav1d/src/thread_data.h b/third_party/dav1d/src/thread_data.h
new file mode 100644
index 0000000000..62814e6348
--- /dev/null
+++ b/third_party/dav1d/src/thread_data.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_DATA_H
+#define DAV1D_SRC_THREAD_DATA_H
+
+#include "src/thread.h"
+
+struct thread_data {
+ pthread_t thread;
+ pthread_cond_t cond;
+ pthread_mutex_t lock;
+ int inited;
+};
+
+#endif /* DAV1D_SRC_THREAD_DATA_H */
diff --git a/third_party/dav1d/src/thread_task.c b/third_party/dav1d/src/thread_task.c
new file mode 100644
index 0000000000..bfedf6e5bb
--- /dev/null
+++ b/third_party/dav1d/src/thread_task.c
@@ -0,0 +1,936 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "common/frame.h"
+
+#include "src/thread_task.h"
+#include "src/fg_apply.h"
+
+// This function resets the cur pointer to the first frame that is theoretically
+// executable after a task completes (i.e. each time we update some progress or
+// insert tasks into the queue).
+// When frame_idx is set, it can come either from a completed task or from tasks
+// inserted into the queue, in which case we have to make sure the cur pointer
+// isn't past this insert.
+// The special case where frame_idx is UINT_MAX handles the reset after
+// completing a task and locklessly signaling progress. In that case we don't
+// enter the critical section that this function requires, so we store the
+// request in an atomic for delayed handling, which happens here. This means the
+// function can be called without any actual update beyond what is in the
+// atomic, hence the special case.
+static inline int reset_task_cur(const Dav1dContext *const c,
+ struct TaskThreadData *const ttd,
+ unsigned frame_idx)
+{
+ const unsigned first = atomic_load(&ttd->first);
+ unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
+ if (reset_frame_idx < first) {
+ if (frame_idx == UINT_MAX) return 0;
+ reset_frame_idx = UINT_MAX;
+ }
+ if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
+ return 0;
+ if (reset_frame_idx != UINT_MAX) {
+ if (frame_idx == UINT_MAX) {
+ if (reset_frame_idx > first + ttd->cur)
+ return 0;
+ ttd->cur = reset_frame_idx - first;
+ goto cur_found;
+ }
+ } else if (frame_idx == UINT_MAX)
+ return 0;
+ if (frame_idx < first) frame_idx += c->n_fc;
+ const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx);
+ const unsigned cur_frame_idx = first + ttd->cur;
+ if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx)
+ return 0;
+ for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++)
+ if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head)
+ break;
+cur_found:
+ for (unsigned i = ttd->cur; i < c->n_fc; i++)
+ c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL;
+ return 1;
+}
+
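+// Lock-free counterpart used when ttd->lock cannot be taken: it records the
+// smallest frame index that needs a task_cur reset by performing an atomic
+// minimum on ttd->reset_task_cur through an exchange loop (if the exchange
+// displaced an even smaller value, that value is written back and the loop
+// retries). The request is consumed later by reset_task_cur() under the lock;
+// the trailing compare-exchange withdraws it if ttd->first advanced
+// concurrently, since the stored index was normalized against the old first
+// and may now be stale.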
+static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
+ unsigned frame_idx, unsigned n_frames)
+{
+ const unsigned first = atomic_load(&ttd->first);
+ if (frame_idx < first) frame_idx += n_frames;
+ unsigned last_idx = frame_idx;
+ do {
+ frame_idx = last_idx;
+ last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
+ } while (last_idx < frame_idx);
+ if (frame_idx == first && atomic_load(&ttd->first) != first) {
+ unsigned expected = frame_idx;
+ atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
+ }
+}
+
+static void insert_tasks_between(Dav1dFrameContext *const f,
+ Dav1dTask *const first, Dav1dTask *const last,
+ Dav1dTask *const a, Dav1dTask *const b,
+ const int cond_signal)
+{
+ struct TaskThreadData *const ttd = f->task_thread.ttd;
+ if (atomic_load(f->c->flush)) return;
+ assert(!a || a->next == b);
+ if (!a) f->task_thread.task_head = first;
+ else a->next = first;
+ if (!b) f->task_thread.task_tail = last;
+ last->next = b;
+ reset_task_cur(f->c, ttd, first->frame_idx);
+ if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+}
+
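+// Insert the chain [first, last] into the frame's task queue, which is kept
+// ordered so that workers always pick the most urgent runnable task: tile
+// entropy tasks sort ahead of everything else, then by sbrow, then by task
+// type, and finally by tile index.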
+static void insert_tasks(Dav1dFrameContext *const f,
+ Dav1dTask *const first, Dav1dTask *const last,
+ const int cond_signal)
+{
+ // insert task back into task queue
+ Dav1dTask *t_ptr, *prev_t = NULL;
+ for (t_ptr = f->task_thread.task_head;
+ t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next)
+ {
+ // entropy coding precedes other steps
+ if (t_ptr->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
+ if (first->type > DAV1D_TASK_TYPE_TILE_ENTROPY) continue;
+ // both are entropy
+ if (first->sby > t_ptr->sby) continue;
+ if (first->sby < t_ptr->sby) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same sby
+ } else {
+ if (first->type == DAV1D_TASK_TYPE_TILE_ENTROPY) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ if (first->sby > t_ptr->sby) continue;
+ if (first->sby < t_ptr->sby) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same sby
+ if (first->type > t_ptr->type) continue;
+ if (first->type < t_ptr->type) {
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // same task type
+ }
+
+ // sort by tile-id
+ assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION ||
+ first->type == DAV1D_TASK_TYPE_TILE_ENTROPY);
+ assert(first->type == t_ptr->type);
+ assert(t_ptr->sby == first->sby);
+ const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[p]);
+ const int p_tile_idx = (int) (t_ptr - f->task_thread.tile_tasks[p]);
+ assert(t_tile_idx != p_tile_idx);
+ if (t_tile_idx > p_tile_idx) continue;
+ insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
+ return;
+ }
+ // append at the end
+ insert_tasks_between(f, first, last, prev_t, NULL, cond_signal);
+}
+
+static inline void insert_task(Dav1dFrameContext *const f,
+ Dav1dTask *const t, const int cond_signal)
+{
+ insert_tasks(f, t, t, cond_signal);
+}
+
+static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) {
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ t->next = NULL;
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = t;
+ else
+ f->task_thread.pending_tasks.tail->next = t;
+ f->task_thread.pending_tasks.tail = t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+}
+
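+// Drain the frame's pending list (filled by add_pending() under its own small
+// lock, with .merge as a cheap "anything queued?" flag) into the ordered
+// per-frame task queue; returns whether anything was merged. Callers of
+// add_pending() thus never need ttd->lock, while the merge itself runs on a
+// worker that already holds it.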
+static inline int merge_pending_frame(Dav1dFrameContext *const f) {
+ int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
+ if (merge) {
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ Dav1dTask *t = f->task_thread.pending_tasks.head;
+ f->task_thread.pending_tasks.head = NULL;
+ f->task_thread.pending_tasks.tail = NULL;
+ atomic_store(&f->task_thread.pending_tasks.merge, 0);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+ while (t) {
+ Dav1dTask *const tmp = t->next;
+ insert_task(f, t, 0);
+ t = tmp;
+ }
+ }
+ return merge;
+}
+
+static inline int merge_pending(const Dav1dContext *const c) {
+ int res = 0;
+ for (unsigned i = 0; i < c->n_fc; i++)
+ res |= merge_pending_frame(&c->fc[i]);
+ return res;
+}
+
+static int create_filter_sbrow(Dav1dFrameContext *const f,
+ const int pass, Dav1dTask **res_t)
+{
+ const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1];
+ const int has_cdef = f->seq_hdr->cdef;
+ const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+ const int has_lr = f->lf.restore_planes;
+
+ Dav1dTask *tasks = f->task_thread.tasks;
+ const int uses_2pass = f->c->n_fc > 1;
+ int num_tasks = f->sbh * (1 + uses_2pass);
+ if (num_tasks > f->task_thread.num_tasks) {
+ const size_t size = sizeof(Dav1dTask) * num_tasks;
+ tasks = realloc(f->task_thread.tasks, size);
+ if (!tasks) return -1;
+ memset(tasks, 0, size);
+ f->task_thread.tasks = tasks;
+ f->task_thread.num_tasks = num_tasks;
+ }
+ tasks += f->sbh * (pass & 1);
+
+ if (pass & 1) {
+ f->frame_thread.entropy_progress = 0;
+ } else {
+ const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
+ if (prog_sz > f->frame_thread.prog_sz) {
+ atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
+ 2 * prog_sz * sizeof(*prog));
+ if (!prog) return -1;
+ f->frame_thread.frame_progress = prog;
+ f->frame_thread.copy_lpf_progress = prog + prog_sz;
+ }
+ f->frame_thread.prog_sz = prog_sz;
+ memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
+ memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint));
+ atomic_store(&f->frame_thread.deblock_progress, 0);
+ }
+ f->frame_thread.next_tile_row[pass & 1] = 0;
+
+ Dav1dTask *t = &tasks[0];
+ t->sby = 0;
+ t->recon_progress = 1;
+ t->deblock_progress = 0;
+ t->type = pass == 1 ? DAV1D_TASK_TYPE_ENTROPY_PROGRESS :
+ has_deblock ? DAV1D_TASK_TYPE_DEBLOCK_COLS :
+ has_cdef || has_lr /* i.e. LR backup */ ? DAV1D_TASK_TYPE_DEBLOCK_ROWS :
+ has_resize ? DAV1D_TASK_TYPE_SUPER_RESOLUTION :
+ DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS;
+ t->frame_idx = (int)(f - f->c->fc);
+
+ *res_t = t;
+ return 0;
+}
+
+int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
+ const int cond_signal)
+{
+ Dav1dTask *tasks = f->task_thread.tile_tasks[0];
+ const int uses_2pass = f->c->n_fc > 1;
+ const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+ if (pass < 2) {
+ int alloc_num_tasks = num_tasks * (1 + uses_2pass);
+ if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
+ const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
+ tasks = realloc(f->task_thread.tile_tasks[0], size);
+ if (!tasks) return -1;
+ memset(tasks, 0, size);
+ f->task_thread.tile_tasks[0] = tasks;
+ f->task_thread.num_tile_tasks = alloc_num_tasks;
+ }
+ f->task_thread.tile_tasks[1] = tasks + num_tasks;
+ }
+ tasks += num_tasks * (pass & 1);
+
+ Dav1dTask *pf_t;
+ if (create_filter_sbrow(f, pass, &pf_t))
+ return -1;
+
+ Dav1dTask *prev_t = NULL;
+ for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) {
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+ Dav1dTask *t = &tasks[tile_idx];
+ t->sby = ts->tiling.row_start >> f->sb_shift;
+ if (pf_t && t->sby) {
+ prev_t->next = pf_t;
+ prev_t = pf_t;
+ pf_t = NULL;
+ }
+ t->recon_progress = 0;
+ t->deblock_progress = 0;
+ t->deps_skip = 0;
+ t->type = pass != 1 ? DAV1D_TASK_TYPE_TILE_RECONSTRUCTION :
+ DAV1D_TASK_TYPE_TILE_ENTROPY;
+ t->frame_idx = (int)(f - f->c->fc);
+ if (prev_t) prev_t->next = t;
+ prev_t = t;
+ }
+ if (pf_t) {
+ prev_t->next = pf_t;
+ prev_t = pf_t;
+ }
+ prev_t->next = NULL;
+
+ atomic_store(&f->task_thread.done[pass & 1], 0);
+
+    // XXX in theory this could be done locklessly, since at this point there
+    // are no tasks in the frame queue, so no other runner should be using this
+    // lock; but we must add both passes at once
+ pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
+ assert(f->task_thread.pending_tasks.head == NULL || pass == 2);
+ if (!f->task_thread.pending_tasks.head)
+ f->task_thread.pending_tasks.head = &tasks[0];
+ else
+ f->task_thread.pending_tasks.tail->next = &tasks[0];
+ f->task_thread.pending_tasks.tail = prev_t;
+ atomic_store(&f->task_thread.pending_tasks.merge, 1);
+ pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
+
+ return 0;
+}
+
+void dav1d_task_frame_init(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+
+ atomic_store(&f->task_thread.init_done, 0);
+ // schedule init task, which will schedule the remaining tasks
+ Dav1dTask *const t = &f->task_thread.init_task;
+ t->type = DAV1D_TASK_TYPE_INIT;
+ t->frame_idx = (int)(f - c->fc);
+ t->sby = 0;
+ t->recon_progress = t->deblock_progress = 0;
+ insert_task(f, t, 1);
+}
+
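+// Film grain application for an output picture is run cooperatively by the
+// worker pool: this call publishes the in/out pictures, wakes a worker for the
+// FG_PREP stage, and then blocks on delayed_fg.cond until the workers have
+// applied grain to all 32-row stripes (see delayed_fg_task() below).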
+void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out,
+ const Dav1dPicture *const in)
+{
+ struct TaskThreadData *const ttd = &c->task_thread;
+ ttd->delayed_fg.in = in;
+ ttd->delayed_fg.out = out;
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP;
+ atomic_init(&ttd->delayed_fg.progress[0], 0);
+ atomic_init(&ttd->delayed_fg.progress[1], 0);
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ pthread_cond_signal(&ttd->cond);
+ pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
+ pthread_mutex_unlock(&ttd->lock);
+}
+
+static inline int ensure_progress(struct TaskThreadData *const ttd,
+ Dav1dFrameContext *const f,
+ Dav1dTask *const t, const enum TaskType type,
+ atomic_int *const state, int *const target)
+{
+    // deblock_rows (non-LR portion) depends on the deblock of the previous
+    // sbrow, so ensure that has completed. If not, re-add to the task queue;
+    // else, fall through.
+ int p1 = atomic_load(state);
+ if (p1 < t->sby) {
+ t->type = type;
+ t->recon_progress = t->deblock_progress = 0;
+ *target = t->sby;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ return 1;
+ }
+ return 0;
+}
+
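+// Returns 1 if tile task t has unmet dependencies and must wait: the tile's
+// progress for this pass (and, with frame threading, the entropy pass) has to
+// reach t->sby, and for inter frames each reference picture must have decoded
+// at least the rows this sbrow may reference (per ts->lowest_pixel, plus 8 rows
+// of postfilter delay). Errors in dependencies are latched into
+// f->task_thread.error.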
+static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
+ const int frame_mt)
+{
+ const int tp = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int tile_idx = (int)(t - f->task_thread.tile_tasks[tp]);
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+ const int p1 = atomic_load(&ts->progress[tp]);
+ if (p1 < t->sby) return 1;
+ int error = p1 == TILE_ERROR;
+ error |= atomic_fetch_or(&f->task_thread.error, error);
+ if (!error && frame_mt && !tp) {
+ const int p2 = atomic_load(&ts->progress[1]);
+ if (p2 <= t->sby) return 1;
+ error = p2 == TILE_ERROR;
+ error |= atomic_fetch_or(&f->task_thread.error, error);
+ }
+ if (!error && frame_mt && !IS_KEY_OR_INTRA(f->frame_hdr)) {
+ // check reference state
+ const Dav1dThreadPicture *p = &f->sr_cur;
+ const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2);
+ const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift);
+ const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby];
+ for (int n = t->deps_skip; n < 7; n++, t->deps_skip++) {
+ unsigned lowest;
+ if (tp) {
+ // if temporal mv refs are disabled, we only need this
+ // for the primary ref; if segmentation is disabled, we
+ // don't even need that
+ lowest = p_b;
+ } else {
+ // +8 is postfilter-induced delay
+ const int y = lowest_px[n][0] == INT_MIN ? INT_MIN :
+ lowest_px[n][0] + 8;
+ const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN :
+ lowest_px[n][1] * (1 << ss_ver) + 8;
+ const int max = imax(y, uv);
+ if (max == INT_MIN) continue;
+ lowest = iclip(max, 1, f->refp[n].p.p.h);
+ }
+ const unsigned p3 = atomic_load(&f->refp[n].progress[!tp]);
+ if (p3 < lowest) return 1;
+ atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR);
+ }
+ }
+ return 0;
+}
+
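+// frame_progress is a bitmap with one bit per completed sbrow (32 rows per
+// word); starting from the word holding the already-published progress, scan
+// for the first gap and return the index of the last contiguously completed
+// sbrow (or -1 if none).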
+static inline int get_frame_progress(const Dav1dContext *const c,
+ const Dav1dFrameContext *const f)
+{
+ unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
+ if (frame_prog >= FRAME_ERROR)
+ return f->sbh - 1;
+ int idx = frame_prog >> (f->sb_shift + 7);
+ int prog;
+ do {
+ atomic_uint *state = &f->frame_thread.frame_progress[idx];
+ const unsigned val = ~atomic_load(state);
+ prog = val ? ctz(val) : 32;
+ if (prog != 32) break;
+ prog = 0;
+ } while (++idx < f->frame_thread.prog_sz);
+ return ((idx << 5) | prog) - 1;
+}
+
+static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
+ atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
+ atomic_store(&f->task_thread.task_counter, 0);
+ atomic_store(&f->task_thread.done[0], 1);
+ atomic_store(&f->task_thread.done[1], 1);
+ atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
+ atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
+ dav1d_decode_frame_exit(f, error);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+}
+
+static inline void delayed_fg_task(const Dav1dContext *const c,
+ struct TaskThreadData *const ttd)
+{
+ const Dav1dPicture *const in = ttd->delayed_fg.in;
+ Dav1dPicture *const out = ttd->delayed_fg.out;
+#if CONFIG_16BPC
+ int off;
+ if (out->p.bpc != 8)
+ off = (out->p.bpc >> 1) - 4;
+#endif
+ switch (ttd->delayed_fg.type) {
+ case DAV1D_TASK_TYPE_FG_PREP:
+ ttd->delayed_fg.exec = 0;
+ if (atomic_load(&ttd->cond_signaled))
+ pthread_cond_signal(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc);
+ break;
+#endif
+ default: abort();
+ }
+ ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY;
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 1;
+ // fall-through
+ case DAV1D_TASK_TYPE_FG_APPLY:;
+ int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ pthread_mutex_unlock(&ttd->lock);
+ int progmax = (out->p.h + 31) >> 5;
+ fg_apply_loop:
+ if (row + 1 < progmax)
+ pthread_cond_signal(&ttd->cond);
+ else if (row + 1 >= progmax) {
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ if (row >= progmax) goto end_add;
+ pthread_mutex_unlock(&ttd->lock);
+ }
+ switch (out->p.bpc) {
+#if CONFIG_8BPC
+ case 8:
+ dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
+ ttd->delayed_fg.scaling_8bpc,
+ ttd->delayed_fg.grain_lut_8bpc, row);
+ break;
+#endif
+#if CONFIG_16BPC
+ case 10:
+ case 12:
+ dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
+ ttd->delayed_fg.scaling_16bpc,
+ ttd->delayed_fg.grain_lut_16bpc, row);
+ break;
+#endif
+ default: abort();
+ }
+ row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
+ int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+ if (row < progmax) goto fg_apply_loop;
+ pthread_mutex_lock(&ttd->lock);
+ ttd->delayed_fg.exec = 0;
+ end_add:
+ done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
+ progmax = atomic_load(&ttd->delayed_fg.progress[0]);
+        // only signal completion once the last runner has reached this point
+ if (done < progmax)
+ break;
+ pthread_cond_signal(&ttd->delayed_fg.cond);
+ break;
+ default: abort();
+ }
+}
+
+void *dav1d_worker_task(void *data) {
+ Dav1dTaskContext *const tc = data;
+ const Dav1dContext *const c = tc->c;
+ struct TaskThreadData *const ttd = tc->task_thread.ttd;
+
+ dav1d_set_thread_name("dav1d-worker");
+
+ pthread_mutex_lock(&ttd->lock);
+ for (;;) {
+ if (tc->task_thread.die) break;
+ if (atomic_load(c->flush)) goto park;
+
+ merge_pending(c);
+ if (ttd->delayed_fg.exec) { // run delayed film grain first
+ delayed_fg_task(c, ttd);
+ continue;
+ }
+ Dav1dFrameContext *f;
+ Dav1dTask *t, *prev_t = NULL;
+ if (c->n_fc > 1) { // run init tasks second
+ for (unsigned i = 0; i < c->n_fc; i++) {
+ const unsigned first = atomic_load(&ttd->first);
+ f = &c->fc[(first + i) % c->n_fc];
+ if (atomic_load(&f->task_thread.init_done)) continue;
+ t = f->task_thread.task_head;
+ if (!t) continue;
+ if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
+ if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
+ // XXX This can be a simple else, if adding tasks of both
+ // passes at once (in dav1d_task_create_tile_sbrow).
+ // Adding the tasks to the pending Q can result in a
+ // thread merging them before setting init_done.
+ // We will need to set init_done before adding to the
+ // pending Q, so maybe return the tasks, set init_done,
+ // and add to pending Q only then.
+ const int p1 = f->in_cdf.progress ?
+ atomic_load(f->in_cdf.progress) : 1;
+ if (p1) {
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ goto found;
+ }
+ }
+ }
+ }
+ while (ttd->cur < c->n_fc) { // run decoding tasks last
+ const unsigned first = atomic_load(&ttd->first);
+ f = &c->fc[(first + ttd->cur) % c->n_fc];
+ merge_pending_frame(f);
+ prev_t = f->task_thread.task_cur_prev;
+ t = prev_t ? prev_t->next : f->task_thread.task_head;
+ while (t) {
+ if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next;
+ else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY ||
+ t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION)
+ {
+ // if not bottom sbrow of tile, this task will be re-added
+ // after it's finished
+ if (!check_tile(t, f, c->n_fc > 1))
+ goto found;
+ } else if (t->recon_progress) {
+ const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
+ int error = atomic_load(&f->task_thread.error);
+ assert(!atomic_load(&f->task_thread.done[p]) || error);
+ const int tile_row_base = f->frame_hdr->tiling.cols *
+ f->frame_thread.next_tile_row[p];
+ if (p) {
+ atomic_int *const prog = &f->frame_thread.entropy_progress;
+ const int p1 = atomic_load(prog);
+ if (p1 < t->sby) goto next;
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ }
+ for (int tc = 0; tc < f->frame_hdr->tiling.cols; tc++) {
+ Dav1dTileState *const ts = &f->ts[tile_row_base + tc];
+ const int p2 = atomic_load(&ts->progress[p]);
+ if (p2 < t->recon_progress) goto next;
+ atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR);
+ }
+ if (t->sby + 1 < f->sbh) {
+ // add sby+1 to list to replace this one
+ Dav1dTask *next_t = &t[1];
+ *next_t = *t;
+ next_t->sby++;
+ const int ntr = f->frame_thread.next_tile_row[p] + 1;
+ const int start = f->frame_hdr->tiling.row_start_sb[ntr];
+ if (next_t->sby == start)
+ f->frame_thread.next_tile_row[p] = ntr;
+ next_t->recon_progress = next_t->sby + 1;
+ insert_task(f, next_t, 0);
+ }
+ goto found;
+ } else if (t->type == DAV1D_TASK_TYPE_CDEF) {
+ atomic_uint *prog = f->frame_thread.copy_lpf_progress;
+ const int p1 = atomic_load(&prog[(t->sby - 1) >> 5]);
+ if (p1 & (1U << ((t->sby - 1) & 31)))
+ goto found;
+ } else {
+ assert(t->deblock_progress);
+ const int p1 = atomic_load(&f->frame_thread.deblock_progress);
+ if (p1 >= t->deblock_progress) {
+ atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
+ goto found;
+ }
+ }
+ next:
+ prev_t = t;
+ t = t->next;
+ f->task_thread.task_cur_prev = prev_t;
+ }
+ ttd->cur++;
+ }
+ if (reset_task_cur(c, ttd, UINT_MAX)) continue;
+ if (merge_pending(c)) continue;
+ park:
+ tc->task_thread.flushed = 1;
+ pthread_cond_signal(&tc->task_thread.td.cond);
+ // we want to be woken up next time progress is signaled
+ atomic_store(&ttd->cond_signaled, 0);
+ pthread_cond_wait(&ttd->cond, &ttd->lock);
+ tc->task_thread.flushed = 0;
+ reset_task_cur(c, ttd, UINT_MAX);
+ continue;
+
+ found:
+ // remove t from list
+ if (prev_t) prev_t->next = t->next;
+ else f->task_thread.task_head = t->next;
+ if (!t->next) f->task_thread.task_tail = prev_t;
+ if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
+ ttd->cur++;
+ t->next = NULL;
+ // we don't need to check cond_signaled here, since we found a task
+ // after the last signal so we want to re-signal the next waiting thread
+ // and again won't need to signal after that
+ atomic_store(&ttd->cond_signaled, 1);
+ pthread_cond_signal(&ttd->cond);
+ pthread_mutex_unlock(&ttd->lock);
+ found_unlocked:;
+ const int flush = atomic_load(c->flush);
+ int error = atomic_fetch_or(&f->task_thread.error, flush) | flush;
+
+ // run it
+ tc->f = f;
+ int sby = t->sby;
+ switch (t->type) {
+ case DAV1D_TASK_TYPE_INIT: {
+ assert(c->n_fc > 1);
+ int res = dav1d_decode_frame_init(f);
+ int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1;
+ if (res || p1 == TILE_ERROR) {
+ pthread_mutex_lock(&ttd->lock);
+ abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
+ reset_task_cur(c, ttd, t->frame_idx);
+ } else {
+ t->type = DAV1D_TASK_TYPE_INIT_CDF;
+ if (p1) goto found_unlocked;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_INIT_CDF: {
+ assert(c->n_fc > 1);
+ int res = DAV1D_ERR(EINVAL);
+ if (!atomic_load(&f->task_thread.error))
+ res = dav1d_decode_frame_init_cdf(f);
+ if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
+ atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
+ }
+ if (!res) {
+ assert(c->n_fc > 1);
+ for (int p = 1; p <= 2; p++) {
+ const int res = dav1d_task_create_tile_sbrow(f, p, 0);
+ if (res) {
+ pthread_mutex_lock(&ttd->lock);
+ // memory allocation failed
+ atomic_store(&f->task_thread.done[2 - p], 1);
+ atomic_store(&f->task_thread.error, -1);
+ atomic_fetch_sub(&f->task_thread.task_counter,
+ f->frame_hdr->tiling.cols *
+ f->frame_hdr->tiling.rows + f->sbh);
+ atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
+ if (p == 2 && atomic_load(&f->task_thread.done[1])) {
+ assert(!atomic_load(&f->task_thread.task_counter));
+ dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ atomic_store(&f->task_thread.init_done, 1);
+ continue;
+ } else {
+ pthread_mutex_unlock(&ttd->lock);
+ }
+ }
+ }
+ atomic_store(&f->task_thread.init_done, 1);
+ pthread_mutex_lock(&ttd->lock);
+ } else {
+ pthread_mutex_lock(&ttd->lock);
+ abort_frame(f, res);
+ reset_task_cur(c, ttd, t->frame_idx);
+ atomic_store(&f->task_thread.init_done, 1);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_TILE_ENTROPY:
+ case DAV1D_TASK_TYPE_TILE_RECONSTRUCTION: {
+ const int p = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY;
+ const int tile_idx = (int)(t - f->task_thread.tile_tasks[p]);
+ Dav1dTileState *const ts = &f->ts[tile_idx];
+
+ tc->ts = ts;
+ tc->by = sby << f->sb_shift;
+ const int uses_2pass = c->n_fc > 1;
+ tc->frame_thread.pass = !uses_2pass ? 0 :
+ 1 + (t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION);
+ if (!error) error = dav1d_decode_tile_sbrow(tc);
+ const int progress = error ? TILE_ERROR : 1 + sby;
+
+ // signal progress
+ atomic_fetch_or(&f->task_thread.error, error);
+ if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) {
+ t->sby++;
+ t->deps_skip = 0;
+ if (!check_tile(t, f, uses_2pass)) {
+ atomic_store(&ts->progress[p], progress);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ goto found_unlocked;
+ }
+ atomic_store(&ts->progress[p], progress);
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ } else {
+ pthread_mutex_lock(&ttd->lock);
+ atomic_store(&ts->progress[p], progress);
+ reset_task_cur(c, ttd, t->frame_idx);
+ error = atomic_load(&f->task_thread.error);
+ if (f->frame_hdr->refresh_context &&
+ tc->frame_thread.pass <= 1 && f->task_thread.update_set &&
+ f->frame_hdr->tiling.update == tile_idx)
+ {
+ if (!error)
+ dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
+ &f->ts[f->frame_hdr->tiling.update].cdf);
+ if (c->n_fc > 1)
+ atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
+ }
+ if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 &&
+ atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ assert(atomic_load(&f->task_thread.task_counter) >= 0);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ }
+ continue;
+ }
+ case DAV1D_TASK_TYPE_DEBLOCK_COLS:
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_deblock_cols(f, sby);
+ if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_DEBLOCK_ROWS,
+ &f->frame_thread.deblock_progress,
+ &t->deblock_progress)) continue;
+ // fall-through
+ case DAV1D_TASK_TYPE_DEBLOCK_ROWS:
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_deblock_rows(f, sby);
+ // signal deblock progress
+ if (f->frame_hdr->loopfilter.level_y[0] ||
+ f->frame_hdr->loopfilter.level_y[1])
+ {
+ error = atomic_load(&f->task_thread.error);
+ atomic_store(&f->frame_thread.deblock_progress,
+ error ? TILE_ERROR : sby + 1);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ } else if (f->seq_hdr->cdef || f->lf.restore_planes) {
+ atomic_fetch_or(&f->frame_thread.copy_lpf_progress[sby >> 5],
+ 1U << (sby & 31));
+ // CDEF needs the top buffer to be saved by lr_copy_lpf of the
+ // previous sbrow
+ if (sby) {
+ int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
+ if (~prog & (1U << ((sby - 1) & 31))) {
+ t->type = DAV1D_TASK_TYPE_CDEF;
+ t->recon_progress = t->deblock_progress = 0;
+ add_pending(f, t);
+ pthread_mutex_lock(&ttd->lock);
+ continue;
+ }
+ }
+ }
+ // fall-through
+ case DAV1D_TASK_TYPE_CDEF:
+ if (f->seq_hdr->cdef) {
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_cdef(tc, sby);
+ reset_task_cur_async(ttd, t->frame_idx, c->n_fc);
+ if (!atomic_fetch_or(&ttd->cond_signaled, 1))
+ pthread_cond_signal(&ttd->cond);
+ }
+ // fall-through
+ case DAV1D_TASK_TYPE_SUPER_RESOLUTION:
+ if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
+ if (!atomic_load(&f->task_thread.error))
+ f->bd_fn.filter_sbrow_resize(f, sby);
+ // fall-through
+ case DAV1D_TASK_TYPE_LOOP_RESTORATION:
+ if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes)
+ f->bd_fn.filter_sbrow_lr(f, sby);
+ // fall-through
+ case DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS:
+            // dummy task covering the case where no post-filters are enabled
+ case DAV1D_TASK_TYPE_ENTROPY_PROGRESS:
+            // dummy task that converts tile progress into frame progress
+ break;
+ default: abort();
+ }
+        // if the task completed [typically LR], signal picture progress as below
+ const int uses_2pass = c->n_fc > 1;
+ const int sbh = f->sbh;
+ const int sbsz = f->sb_step * 4;
+ if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
+ error = atomic_load(&f->task_thread.error);
+ const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
+ assert(c->n_fc > 1);
+            if (f->sr_cur.p.data[0] /* upon flush, this can already be freed */)
+ atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y);
+ atomic_store(&f->frame_thread.entropy_progress,
+ error ? TILE_ERROR : sby + 1);
+ if (sby + 1 == sbh)
+ atomic_store(&f->task_thread.done[1], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ atomic_load(&f->task_thread.done[1]))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ // t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS
+ atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
+ 1U << (sby & 31));
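+        // One bit per superblock row, packed 32 rows per word (word
+        // sby >> 5, bit sby & 31). Rows can finish out of order, so the
+        // frame progress published below comes from the leading run of set
+        // bits; get_frame_progress() (defined earlier in this file) derives
+        // it roughly along the lines of this illustrative sketch, where
+        // 'words' stands for f->frame_thread.frame_progress and ctz() is
+        // count-trailing-zeros:
+        //
+        //     int row = 0;
+        //     while (row < f->sbh) {
+        //         const unsigned undone = ~atomic_load(&words[row >> 5]);
+        //         if (undone) { row += ctz(undone); break; }
+        //         row += 32;
+        //     }
+        //     // rows [0, row) have all completed their post-filters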
+ pthread_mutex_lock(&f->task_thread.lock);
+ sby = get_frame_progress(c, f);
+ error = atomic_load(&f->task_thread.error);
+ const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
+        if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can already be freed */)
+ atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y);
+ pthread_mutex_unlock(&f->task_thread.lock);
+ if (sby + 1 == sbh)
+ atomic_store(&f->task_thread.done[0], 1);
+ pthread_mutex_lock(&ttd->lock);
+ const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
+ if (sby + 1 < sbh && num_tasks) {
+ reset_task_cur(c, ttd, t->frame_idx);
+ continue;
+ }
+ if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
+ (!uses_2pass || atomic_load(&f->task_thread.done[1])))
+ {
+ error = atomic_load(&f->task_thread.error);
+ dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
+ error ? DAV1D_ERR(ENOMEM) : 0);
+ f->n_tile_data = 0;
+ pthread_cond_signal(&f->task_thread.cond);
+ }
+ reset_task_cur(c, ttd, t->frame_idx);
+ }
+ pthread_mutex_unlock(&ttd->lock);
+
+ return NULL;
+}
diff --git a/third_party/dav1d/src/thread_task.h b/third_party/dav1d/src/thread_task.h
new file mode 100644
index 0000000000..257da1a470
--- /dev/null
+++ b/third_party/dav1d/src/thread_task.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_THREAD_TASK_H
+#define DAV1D_SRC_THREAD_TASK_H
+
+#include <limits.h>
+
+#include "src/internal.h"
+
+#define FRAME_ERROR (UINT_MAX - 1)
+#define TILE_ERROR (INT_MAX - 1)
+
+// these functions assume the task scheduling lock is already taken
+int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal);
+void dav1d_task_frame_init(Dav1dFrameContext *f);
+
+void dav1d_task_delayed_fg(Dav1dContext *c, Dav1dPicture *out, const Dav1dPicture *in);
+
+void *dav1d_worker_task(void *data);
+
+int dav1d_decode_frame_init(Dav1dFrameContext *f);
+int dav1d_decode_frame_init_cdf(Dav1dFrameContext *f);
+int dav1d_decode_frame_main(Dav1dFrameContext *f);
+void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval);
+int dav1d_decode_frame(Dav1dFrameContext *f);
+int dav1d_decode_tile_sbrow(Dav1dTaskContext *t);
+
+#endif /* DAV1D_SRC_THREAD_TASK_H */
diff --git a/third_party/dav1d/src/warpmv.c b/third_party/dav1d/src/warpmv.c
new file mode 100644
index 0000000000..439c4304c7
--- /dev/null
+++ b/third_party/dav1d/src/warpmv.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/warpmv.h"
+
+static const uint16_t div_lut[257] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192,
+};
+
+static inline int iclip_wmp(const int v) {
+ const int cv = iclip(v, INT16_MIN, INT16_MAX);
+
+ return apply_sign((abs(cv) + 32) >> 6, cv) * (1 << 6);
+}
+
+static inline int resolve_divisor_32(const unsigned d, int *const shift) {
+ *shift = ulog2(d);
+ const int e = d - (1 << *shift);
+ const int f = *shift > 8 ? (e + (1 << (*shift - 9))) >> (*shift - 8) :
+ e << (8 - *shift);
+ assert(f <= 256);
+ *shift += 14;
+    // Use f as an index into the precomputed table of multipliers
+ return div_lut[f];
+}
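+
+/* Both resolve_divisor helpers turn a division into a multiply-and-shift:
+ * with s = floor(log2(d)) and f the fractional part of d / 2^s scaled to
+ * 0..256, div_lut[f] ~= 2^22 / (256 + f), so
+ *
+ *     x / d ~= (x * div_lut[f]) >> (s + 14)
+ *
+ * which is why 14 is added to *shift before returning. As a worked example
+ * (plain arithmetic, not taken from the code): d = 10 gives s = 3, e = 2,
+ * f = 2 << 5 = 64, div_lut[64] = 13107 and *shift = 17, and
+ * (x * 13107) >> 17 approximates x / 10 (13107 / 2^17 = 0.099998...).
+ * The callers below apply the sign and the rounding term separately. */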
+
+int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
+ const int32_t *const mat = wm->matrix;
+
+ if (mat[2] <= 0) return 1;
+
+ wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
+ wm->u.p.beta = iclip_wmp(mat[3]);
+
+ int shift;
+ const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
+ const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
+ const int rnd = (1 << shift) >> 1;
+ wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
+ const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
+ wm->u.p.delta = iclip_wmp(mat[5] -
+ apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
+ 0x10000);
+
+ return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
+ (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
+}
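+
+/* In plain form (everything in Q16, so 0x10000 == 1.0), the function above
+ * computes
+ *
+ *     alpha = mat[2] - 0x10000
+ *     beta  = mat[3]
+ *     gamma = 0x10000 * mat[4] / mat[2]
+ *     delta = mat[5] - mat[3] * mat[4] / mat[2] - 0x10000
+ *
+ * with each value clamped to the int16 range and rounded to a multiple of 64
+ * (1/1024 precision) by iclip_wmp(). These are the shear parameters consumed
+ * by the warp filter, which applies the warp as separate horizontal and
+ * vertical passes; a nonzero return value flags a warp whose shear is too
+ * strong to be handled that way (or a non-positive mat[2]). */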
+
+static int resolve_divisor_64(const uint64_t d, int *const shift) {
+ *shift = u64log2(d);
+ const int64_t e = d - (1LL << *shift);
+ const int64_t f = *shift > 8 ? (e + (1LL << (*shift - 9))) >> (*shift - 8) :
+ e << (8 - *shift);
+ assert(f <= 256);
+ *shift += 14;
+    // Use f as an index into the precomputed table of multipliers
+ return div_lut[f];
+}
+
+static int get_mult_shift_ndiag(const int64_t px,
+ const int idet, const int shift)
+{
+ const int64_t v1 = px * idet;
+ const int v2 = apply_sign64((int) ((llabs(v1) +
+ ((1LL << shift) >> 1)) >> shift),
+ v1);
+ return iclip(v2, -0x1fff, 0x1fff);
+}
+
+static int get_mult_shift_diag(const int64_t px,
+ const int idet, const int shift)
+{
+ const int64_t v1 = px * idet;
+ const int v2 = apply_sign64((int) ((llabs(v1) +
+ ((1LL << shift) >> 1)) >> shift),
+ v1);
+ return iclip(v2, 0xe001, 0x11fff);
+}
+
+void dav1d_set_affine_mv2d(const int bw4, const int bh4,
+ const mv mv, Dav1dWarpedMotionParams *const wm,
+ const int bx4, const int by4)
+{
+ int32_t *const mat = wm->matrix;
+ const int rsuy = 2 * bh4 - 1;
+ const int rsux = 2 * bw4 - 1;
+ const int isuy = by4 * 4 + rsuy;
+ const int isux = bx4 * 4 + rsux;
+
+ mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]),
+ -0x800000, 0x7fffff);
+ mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)),
+ -0x800000, 0x7fffff);
+}
+
+int dav1d_find_affine_int(const int (*pts)[2][2], const int np,
+ const int bw4, const int bh4,
+ const mv mv, Dav1dWarpedMotionParams *const wm,
+ const int bx4, const int by4)
+{
+ int32_t *const mat = wm->matrix;
+ int a[2][2] = { { 0, 0 }, { 0, 0 } };
+ int bx[2] = { 0, 0 };
+ int by[2] = { 0, 0 };
+ const int rsuy = 2 * bh4 - 1;
+ const int rsux = 2 * bw4 - 1;
+ const int suy = rsuy * 8;
+ const int sux = rsux * 8;
+ const int duy = suy + mv.y;
+ const int dux = sux + mv.x;
+ const int isuy = by4 * 4 + rsuy;
+ const int isux = bx4 * 4 + rsux;
+
+ for (int i = 0; i < np; i++) {
+ const int dx = pts[i][1][0] - dux;
+ const int dy = pts[i][1][1] - duy;
+ const int sx = pts[i][0][0] - sux;
+ const int sy = pts[i][0][1] - suy;
+ if (abs(sx - dx) < 256 && abs(sy - dy) < 256) {
+ a[0][0] += ((sx * sx) >> 2) + sx * 2 + 8;
+ a[0][1] += ((sx * sy) >> 2) + sx + sy + 4;
+ a[1][1] += ((sy * sy) >> 2) + sy * 2 + 8;
+ bx[0] += ((sx * dx) >> 2) + sx + dx + 8;
+ bx[1] += ((sy * dx) >> 2) + sy + dx + 4;
+ by[0] += ((sx * dy) >> 2) + sx + dy + 4;
+ by[1] += ((sy * dy) >> 2) + sy + dy + 8;
+ }
+ }
+
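+    /* In conventional notation: each accepted candidate contributes its
+     * source offset (sx, sy) and destination offset (dx, dy) to the normal
+     * equations of a 2x2 least-squares fit, roughly
+     *
+     *     A  = sum [ sx*sx  sx*sy ]     bx = sum [ sx*dx ]   by = sum [ sx*dy ]
+     *              [ sx*sy  sy*sy ]               [ sy*dx ]            [ sy*dy ]
+     *
+     * (the >> 2 and the small additive constants above are fixed-point
+     * scaling and rounding offsets). The code below solves A*m = bx for
+     * (mat[2], mat[3]) and A*m = by for (mat[4], mat[5]) by Cramer's rule,
+     * replacing the division by det with the multiply-and-shift reciprocal
+     * from resolve_divisor_64(); the shift -= 16 supplies the Q16 scaling of
+     * the result, and get_mult_shift_{diag,ndiag}() clamp the diagonal terms
+     * to 0x10000 +/- 0x1fff and the off-diagonal terms to +/- 0x1fff. */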
+ // compute determinant of a
+ const int64_t det = (int64_t) a[0][0] * a[1][1] - (int64_t) a[0][1] * a[0][1];
+ if (det == 0) return 1;
+ int shift, idet = apply_sign64(resolve_divisor_64(llabs(det), &shift), det);
+ shift -= 16;
+ if (shift < 0) {
+ idet <<= -shift;
+ shift = 0;
+ }
+
+ // solve the least-squares
+ mat[2] = get_mult_shift_diag((int64_t) a[1][1] * bx[0] -
+ (int64_t) a[0][1] * bx[1], idet, shift);
+ mat[3] = get_mult_shift_ndiag((int64_t) a[0][0] * bx[1] -
+ (int64_t) a[0][1] * bx[0], idet, shift);
+ mat[4] = get_mult_shift_ndiag((int64_t) a[1][1] * by[0] -
+ (int64_t) a[0][1] * by[1], idet, shift);
+ mat[5] = get_mult_shift_diag((int64_t) a[0][0] * by[1] -
+ (int64_t) a[0][1] * by[0], idet, shift);
+
+ mat[0] = iclip(mv.x * 0x2000 - (isux * (mat[2] - 0x10000) + isuy * mat[3]),
+ -0x800000, 0x7fffff);
+ mat[1] = iclip(mv.y * 0x2000 - (isux * mat[4] + isuy * (mat[5] - 0x10000)),
+ -0x800000, 0x7fffff);
+
+ return 0;
+}
diff --git a/third_party/dav1d/src/warpmv.h b/third_party/dav1d/src/warpmv.h
new file mode 100644
index 0000000000..08e841d1ca
--- /dev/null
+++ b/third_party/dav1d/src/warpmv.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_WARPMV_H
+#define DAV1D_SRC_WARPMV_H
+
+#include "src/levels.h"
+
+int dav1d_get_shear_params(Dav1dWarpedMotionParams *wm);
+int dav1d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4,
+ mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
+void dav1d_set_affine_mv2d(int bw4, int bh4,
+ mv mv, Dav1dWarpedMotionParams *wm, int bx, int by);
+
+#endif /* DAV1D_SRC_WARPMV_H */
diff --git a/third_party/dav1d/src/wedge.c b/third_party/dav1d/src/wedge.c
new file mode 100644
index 0000000000..6466068f32
--- /dev/null
+++ b/third_party/dav1d/src/wedge.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/wedge.h"
+
+enum WedgeDirectionType {
+ WEDGE_HORIZONTAL = 0,
+ WEDGE_VERTICAL = 1,
+ WEDGE_OBLIQUE27 = 2,
+ WEDGE_OBLIQUE63 = 3,
+ WEDGE_OBLIQUE117 = 4,
+ WEDGE_OBLIQUE153 = 5,
+ N_WEDGE_DIRECTIONS
+};
+
+typedef struct {
+ uint8_t /* enum WedgeDirectionType */ direction;
+ uint8_t x_offset;
+ uint8_t y_offset;
+} wedge_code_type;
+
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 64);
+static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 64);
+static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 64);
+
+static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 64);
+static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 64);
+static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_4x8 [2 * 16 * 4 * 8], 32);
+
+static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 64);
+static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 64);
+static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 64);
+static uint8_t ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 64);
+static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_4x8 [2 * 16 * 4 * 8], 32);
+static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 16);
+
+const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16];
+
+static void insert_border(uint8_t *const dst, const uint8_t *const src,
+ const int ctr)
+{
+ if (ctr > 4) memset(dst, 0, ctr - 4);
+ memcpy(dst + imax(ctr, 4) - 4, src + imax(4 - ctr, 0), imin(64 - ctr, 8));
+ if (ctr < 64 - 4)
+ memset(dst + ctr + 4, 64, 64 - 4 - ctr);
+}
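+
+/* A quick worked example of the row built above (plain arithmetic, not part
+ * of the code): with ctr == 32 the 64-entry row is 28 zeros, the 8-entry
+ * transition ramp copied from src, then 28 entries of full weight 64, i.e.
+ * the 0 -> 64 edge is centred on column 32. Other ctr values slide that
+ * edge left or right, which is how the sloped master templates are laid out
+ * row by row in dav1d_init_wedge_masks() below. */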
+
+static void transpose(uint8_t *const dst, const uint8_t *const src) {
+ for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
+ for (int x = 0, x_off = 0; x < 64; x++, x_off += 64)
+ dst[x_off + y] = src[y_off + x];
+}
+
+static void hflip(uint8_t *const dst, const uint8_t *const src) {
+ for (int y = 0, y_off = 0; y < 64; y++, y_off += 64)
+ for (int x = 0; x < 64; x++)
+ dst[y_off + 64 - 1 - x] = src[y_off + x];
+}
+
+static void invert(uint8_t *const dst, const uint8_t *const src,
+ const int w, const int h)
+{
+ for (int y = 0, y_off = 0; y < h; y++, y_off += w)
+ for (int x = 0; x < w; x++)
+ dst[y_off + x] = 64 - src[y_off + x];
+}
+
+static void copy2d(uint8_t *dst, const uint8_t *src,
+ const int w, const int h, const int x_off, const int y_off)
+{
+ src += y_off * 64 + x_off;
+ for (int y = 0; y < h; y++) {
+ memcpy(dst, src, w);
+ src += 64;
+ dst += w;
+ }
+}
+
+static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
+ const int sign, const int w, const int h,
+ const int ss_ver)
+{
+ for (int y = 0; y < h; y += 1 + ss_ver) {
+ for (int x = 0; x < w; x += 2) {
+ int sum = luma[x] + luma[x + 1] + 1;
+ if (ss_ver) sum += luma[w + x] + luma[w + x + 1] + 1;
+ chroma[x >> 1] = (sum - sign) >> (1 + ss_ver);
+ }
+ luma += w << ss_ver;
+ chroma += w >> 1;
+ }
+}
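+
+/* Written out, the loop above computes per chroma position (weights on the
+ * same 0..64 scale as the luma masks)
+ *
+ *     4:2:2:  c = (l0 + l1 + 1 - sign) >> 1
+ *     4:2:0:  c = (l00 + l01 + l10 + l11 + 2 - sign) >> 2
+ *
+ * i.e. the average of the covered luma weights, with the rounding direction
+ * selected by 'sign'. Rounding the two variants in opposite directions makes
+ * c(mask, sign == 0) + c(64 - mask, sign == 1) equal exactly 64, so
+ * subsampling stays consistent with taking the complement of a wedge. */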
+
+static COLD void fill2d_16x2(uint8_t *dst, const int w, const int h,
+ const enum BlockSize bs,
+ const uint8_t (*const master)[64 * 64],
+ const wedge_code_type *const cb,
+ uint8_t *masks_444, uint8_t *masks_422,
+ uint8_t *masks_420, const unsigned signs)
+{
+ uint8_t *ptr = dst;
+ for (int n = 0; n < 16; n++) {
+ copy2d(ptr, master[cb[n].direction], w, h,
+ 32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
+ ptr += w * h;
+ }
+ for (int n = 0, off = 0; n < 16; n++, off += w * h)
+ invert(ptr + off, dst + off, w, h);
+
+ const int n_stride_444 = (w * h);
+ const int n_stride_422 = n_stride_444 >> 1;
+ const int n_stride_420 = n_stride_444 >> 2;
+ const int sign_stride_444 = 16 * n_stride_444;
+ const int sign_stride_422 = 16 * n_stride_422;
+ const int sign_stride_420 = 16 * n_stride_420;
+    // assign pointers into the externally visible array
+ for (int n = 0; n < 16; n++) {
+ const int sign = (signs >> n) & 1;
+ dav1d_wedge_masks[bs][0][0][n] = &masks_444[ sign * sign_stride_444];
+        // Not using !sign here is intentional: 4:4:4 applies no chroma
+        // subsampling, so there is no rounding and both sign variants can
+        // share the same mask.
+ dav1d_wedge_masks[bs][0][1][n] = &masks_444[ sign * sign_stride_444];
+ dav1d_wedge_masks[bs][1][0][n] = &masks_422[ sign * sign_stride_422];
+ dav1d_wedge_masks[bs][1][1][n] = &masks_422[!sign * sign_stride_422];
+ dav1d_wedge_masks[bs][2][0][n] = &masks_420[ sign * sign_stride_420];
+ dav1d_wedge_masks[bs][2][1][n] = &masks_420[!sign * sign_stride_420];
+ masks_444 += n_stride_444;
+ masks_422 += n_stride_422;
+ masks_420 += n_stride_420;
+
+        // Since the pointers refer to our own storage above, casting away
+        // the const is safe here. The alternatives would be to duplicate the
+        // sign-correction logic in two places, which isn't very nice, or to
+        // make the externally-facing table non-const, which is also ugly.
+ init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][0][n],
+ dav1d_wedge_masks[bs][0][0][n], 0, w, h, 0);
+ init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][1][n],
+ dav1d_wedge_masks[bs][0][0][n], 1, w, h, 0);
+ init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][0][n],
+ dav1d_wedge_masks[bs][0][0][n], 0, w, h, 1);
+ init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][1][n],
+ dav1d_wedge_masks[bs][0][0][n], 1, w, h, 1);
+ }
+}
+
+COLD void dav1d_init_wedge_masks(void) {
+ // This function is guaranteed to be called only once
+
+ enum WedgeMasterLineType {
+ WEDGE_MASTER_LINE_ODD,
+ WEDGE_MASTER_LINE_EVEN,
+ WEDGE_MASTER_LINE_VERT,
+ N_WEDGE_MASTER_LINES,
+ };
+ static const uint8_t wedge_master_border[N_WEDGE_MASTER_LINES][8] = {
+ [WEDGE_MASTER_LINE_ODD] = { 1, 2, 6, 18, 37, 53, 60, 63 },
+ [WEDGE_MASTER_LINE_EVEN] = { 1, 4, 11, 27, 46, 58, 62, 63 },
+ [WEDGE_MASTER_LINE_VERT] = { 0, 2, 7, 21, 43, 57, 62, 64 },
+ };
+ uint8_t master[6][64 * 64];
+
+ // create master templates
+ for (int y = 0, off = 0; y < 64; y++, off += 64)
+ insert_border(&master[WEDGE_VERTICAL][off],
+ wedge_master_border[WEDGE_MASTER_LINE_VERT], 32);
+ for (int y = 0, off = 0, ctr = 48; y < 64; y += 2, off += 128, ctr--)
+ {
+ insert_border(&master[WEDGE_OBLIQUE63][off],
+ wedge_master_border[WEDGE_MASTER_LINE_EVEN], ctr);
+ insert_border(&master[WEDGE_OBLIQUE63][off + 64],
+ wedge_master_border[WEDGE_MASTER_LINE_ODD], ctr - 1);
+ }
+
+ transpose(master[WEDGE_OBLIQUE27], master[WEDGE_OBLIQUE63]);
+ transpose(master[WEDGE_HORIZONTAL], master[WEDGE_VERTICAL]);
+ hflip(master[WEDGE_OBLIQUE117], master[WEDGE_OBLIQUE63]);
+ hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]);
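+
+    /* Geometry of the masters built above: each insert_border() row ramps
+     * from 0 to 64 with the transition centred on 'ctr', and for
+     * WEDGE_OBLIQUE63 the centre moves left by one column every two rows,
+     * giving an edge of slope 2 (atan(2) ~= 63 degrees). The remaining
+     * directions are derived from it: OBLIQUE27 and HORIZONTAL are
+     * transposes of OBLIQUE63 and VERTICAL, and OBLIQUE117/OBLIQUE153 are
+     * horizontal flips of OBLIQUE63/OBLIQUE27. */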
+
+#define fill(w, h, sz_422, sz_420, hvsw, signs) \
+ fill2d_16x2((uint8_t *) wedge_masks_444_##w##x##h, w, h, BS_##w##x##h, \
+ master, wedge_codebook_16_##hvsw, wedge_masks_444_##w##x##h, \
+ wedge_masks_422_##sz_422, wedge_masks_420_##sz_420, signs)
+
+ fill(32, 32, 16x32, 16x16, heqw, 0x7bfb);
+ fill(32, 16, 16x16, 16x8, hltw, 0x7beb);
+ fill(32, 8, 16x8, 16x4, hltw, 0x6beb);
+ fill(16, 32, 8x32, 8x16, hgtw, 0x7beb);
+ fill(16, 16, 8x16, 8x8, heqw, 0x7bfb);
+ fill(16, 8, 8x8, 8x4, hltw, 0x7beb);
+ fill( 8, 32, 4x32, 4x16, hgtw, 0x7aeb);
+ fill( 8, 16, 4x16, 4x8, hgtw, 0x7beb);
+ fill( 8, 8, 4x8, 4x4, heqw, 0x7bfb);
+#undef fill
+}
+
+#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1)
+static uint8_t ALIGN(ii_dc_mask[32 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 64);
+static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_4x8 [N_II_PRED_MODES][ 4 * 8], 32);
+static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 16);
+#undef N_II_PRED_MODES
+
+#define set1(sz) \
+ [II_DC_PRED] = ii_dc_mask, \
+ [II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \
+ [II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \
+ [II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1]
+#define set(sz_444, sz_422, sz_420) \
+ { { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } }
+const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = {
+ [BS_8x8] = set( 8x8, 4x8, 4x4),
+ [BS_8x16] = set( 8x16, 4x16, 4x8),
+ [BS_16x8] = set(16x16, 8x8, 8x8),
+ [BS_16x16] = set(16x16, 8x16, 8x8),
+ [BS_16x32] = set(16x32, 8x32, 8x16),
+ [BS_32x16] = set(32x32, 16x16, 16x16),
+ [BS_32x32] = set(32x32, 16x32, 16x16),
+};
+#undef set
+#undef set1
+
+static COLD void build_nondc_ii_masks(uint8_t *const mask_v,
+ uint8_t *const mask_h,
+ uint8_t *const mask_sm,
+ const int w, const int h, const int step)
+{
+ static const uint8_t ii_weights_1d[] = {
+ 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
+ 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+ };
+
+ for (int y = 0, off = 0; y < h; y++, off += w) {
+ memset(&mask_v[off], ii_weights_1d[y * step], w);
+ for (int x = 0; x < w; x++) {
+ mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
+ mask_h[off + x] = ii_weights_1d[x * step];
+ }
+ }
+}
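+
+/* The resulting blend weights, on the same 0..64 scale as the wedge masks:
+ * the vertical mask is constant per row (ii_weights_1d[y * step]), the
+ * horizontal mask is constant per column (ii_weights_1d[x * step]), and the
+ * smooth mask uses min(x, y), i.e. the weight stays high wherever either the
+ * top or the left edge is near. 'step' subsamples the 32-entry falloff curve
+ * so that smaller blocks still cover the full decay; the DC mask, set up in
+ * dav1d_init_interintra_masks() below, is simply a flat 32 (a 50/50 blend). */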
+
+COLD void dav1d_init_interintra_masks(void) {
+ // This function is guaranteed to be called only once
+
+ memset(ii_dc_mask, 32, 32 * 32);
+#define set(a) a[II_VERT_PRED - 1], a[II_HOR_PRED - 1], a[II_SMOOTH_PRED - 1]
+ build_nondc_ii_masks(set(ii_nondc_mask_32x32), 32, 32, 1);
+ build_nondc_ii_masks(set(ii_nondc_mask_16x32), 16, 32, 1);
+ build_nondc_ii_masks(set(ii_nondc_mask_16x16), 16, 16, 2);
+ build_nondc_ii_masks(set(ii_nondc_mask_8x32), 8, 32, 1);
+ build_nondc_ii_masks(set(ii_nondc_mask_8x16), 8, 16, 2);
+ build_nondc_ii_masks(set(ii_nondc_mask_8x8), 8, 8, 4);
+ build_nondc_ii_masks(set(ii_nondc_mask_4x16), 4, 16, 2);
+ build_nondc_ii_masks(set(ii_nondc_mask_4x8), 4, 8, 4);
+ build_nondc_ii_masks(set(ii_nondc_mask_4x4), 4, 4, 8);
+#undef set
+}
diff --git a/third_party/dav1d/src/wedge.h b/third_party/dav1d/src/wedge.h
new file mode 100644
index 0000000000..586be98c42
--- /dev/null
+++ b/third_party/dav1d/src/wedge.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_WEDGE_H
+#define DAV1D_SRC_WEDGE_H
+
+#include "src/levels.h"
+
+void dav1d_init_wedge_masks(void);
+EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
+ [2 /* sign */][16 /* wedge_idx */];
+
+void dav1d_init_interintra_masks(void);
+EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
+ [N_INTER_INTRA_PRED_MODES];
+
+#endif /* DAV1D_SRC_WEDGE_H */
diff --git a/third_party/dav1d/src/win32/thread.c b/third_party/dav1d/src/win32/thread.c
new file mode 100644
index 0000000000..b89bd6b165
--- /dev/null
+++ b/third_party/dav1d/src/win32/thread.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#if defined(_WIN32)
+
+#include <process.h>
+#include <stdlib.h>
+#include <windows.h>
+
+#include "common/attributes.h"
+
+#include "src/thread.h"
+
+static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR);
+
+COLD void dav1d_init_thread(void) {
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+ HANDLE kernel32 = GetModuleHandleW(L"kernel32.dll");
+ if (kernel32)
+ set_thread_description =
+ (void*)GetProcAddress(kernel32, "SetThreadDescription");
+#endif
+}
+
+#undef dav1d_set_thread_name
+COLD void dav1d_set_thread_name(const wchar_t *const name) {
+ if (set_thread_description) /* Only available since Windows 10 1607 */
+ set_thread_description(GetCurrentThread(), name);
+}
+
+static COLD unsigned __stdcall thread_entrypoint(void *const data) {
+ pthread_t *const t = data;
+ t->arg = t->func(t->arg);
+ return 0;
+}
+
+COLD int dav1d_pthread_create(pthread_t *const thread,
+ const pthread_attr_t *const attr,
+ void *(*const func)(void*), void *const arg)
+{
+ const unsigned stack_size = attr ? attr->stack_size : 0;
+ thread->func = func;
+ thread->arg = arg;
+ thread->h = (HANDLE)_beginthreadex(NULL, stack_size, thread_entrypoint, thread,
+ STACK_SIZE_PARAM_IS_A_RESERVATION, NULL);
+ return !thread->h;
+}
+
+COLD int dav1d_pthread_join(pthread_t *const thread, void **const res) {
+ if (WaitForSingleObject(thread->h, INFINITE))
+ return 1;
+
+ if (res)
+ *res = thread->arg;
+
+ return !CloseHandle(thread->h);
+}
+
+COLD int dav1d_pthread_once(pthread_once_t *const once_control,
+ void (*const init_routine)(void))
+{
+ BOOL pending = FALSE;
+
+ if (InitOnceBeginInitialize(once_control, 0, &pending, NULL) != TRUE)
+ return 1;
+
+ if (pending == TRUE)
+ init_routine();
+
+ return !InitOnceComplete(once_control, 0, NULL);
+}
+
+#endif
diff --git a/third_party/dav1d/src/x86/cdef.h b/third_party/dav1d/src/x86/cdef.h
new file mode 100644
index 0000000000..553d650741
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#define decl_cdef_fns(ext) \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
+
+decl_cdef_fns(avx512icl);
+decl_cdef_fns(avx2);
+decl_cdef_fns(sse4);
+decl_cdef_fns(ssse3);
+decl_cdef_fns(sse2);
+
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
+
+static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+#if BITDEPTH == 8
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->dir = BF(dav1d_cdef_dir, ssse3);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+ c->dir = BF(dav1d_cdef_dir, sse4);
+#if BITDEPTH == 8
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->dir = BF(dav1d_cdef_dir, avx2);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
+#endif
+}
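+
+/* Note that each tier above overwrites the previous assignments and the
+ * function returns at the first CPU flag that is missing, so the best
+ * implementation reported by dav1d_get_cpu_flags() wins. Worked example for
+ * an AVX2-capable (non-AVX-512) x86-64 build at BITDEPTH == 16: the SSE2
+ * block is compiled out (8 bpc only), SSSE3 sets c->dir and all three
+ * c->fb[] pointers, SSE4.1 upgrades only c->dir, and the AVX2 block then
+ * replaces all four pointers, which is what the decoder ends up using. */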
diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
new file mode 100644
index 0000000000..4c8d3bca43
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -0,0 +1,877 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+%macro DIR_TABLE 1 ; stride
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+ db 0 * %1 + 2, 0 * %1 + 4
+ db 0 * %1 + 2, 1 * %1 + 4
+ db 1 * %1 + 2, 2 * %1 + 4
+ db 1 * %1 + 0, 2 * %1 + 2
+ db 1 * %1 + 0, 2 * %1 + 0
+ db 1 * %1 + 0, 2 * %1 - 2
+ db -1 * %1 + 2, -2 * %1 + 4
+ db 0 * %1 + 2, -1 * %1 + 4
+%endmacro
+
+dir_table4: DIR_TABLE 16
+dir_table8: DIR_TABLE 32
+pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3
+
+dir_shift: times 2 dw 0x4000
+ times 2 dw 0x1000
+
+pw_2048: times 2 dw 2048
+pw_m16384: times 2 dw -16384
+
+cextern cdef_dir_8bpc_avx2.main
+
+SECTION .text
+
+%macro CDEF_FILTER 2 ; w, h
+ DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ mov dird, r7m
+ vpbroadcastd m8, [base+pw_2048]
+ lea dirq, [base+dir_table%1+dirq*2]
+ test prid, prid
+ jz .sec_only
+%if WIN64
+ vpbroadcastw m6, prim
+ movaps [rsp+16*0], xmm9
+ movaps [rsp+16*1], xmm10
+%else
+ movd xm6, prid
+ vpbroadcastw m6, xm6
+%endif
+ lzcnt pridmpd, prid
+ rorx tmpd, prid, 2
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, tmpd ; pri >>= 2
+ mov tmpd, r8m ; damping
+ and prid, 4
+ sub tmpd, 31
+ vpbroadcastd m9, [base+pri_taps+priq+8*0]
+ vpbroadcastd m10, [base+pri_taps+priq+8*1]
+ test secd, secd
+ jz .pri_only
+%if WIN64
+ movaps r8m, xmm13
+ vpbroadcastw m13, secm
+ movaps r4m, xmm11
+ movaps r6m, xmm12
+%else
+ movd xm0, secd
+ vpbroadcastw m13, xm0
+%endif
+ lzcnt secd, secd
+ xor prid, prid
+ add pridmpd, tmpd
+ cmovs pridmpd, prid
+ add secd, tmpd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+ mov [sec_shift], secq
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps xmm11, r4m
+ movaps xmm12, r6m
+ movaps xmm13, r8m
+%endif
+ jmp .pri_end
+.pri_only:
+ add pridmpd, tmpd
+ cmovs pridmpd, secd
+ lea tmpq, [px]
+ mov [pri_shift], pridmpq
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.pri_end:
+%if WIN64
+ movaps xmm9, [rsp+16*0]
+ movaps xmm10, [rsp+16*1]
+%endif
+.end:
+ RET
+.sec_only:
+ mov tmpd, r8m ; damping
+%if WIN64
+ vpbroadcastw m6, secm
+%else
+ movd xm6, secd
+ vpbroadcastw m6, xm6
+%endif
+ tzcnt secd, secd
+ sub tmpd, secd
+ mov [sec_shift], tmpq
+ lea tmpq, [px]
+%rep %1*%2/16
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2
+ALIGN function_align
+.pri:
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, m9 ; pri_tap_k0
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.sec:
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [tmpq+32*0]
+ vinserti128 m1, [tmpq+32*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ paddw m0, m1
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+ALIGN function_align
+.pri_sec:
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ mova m1, [tmpq+32*0]
+ punpcklqdq m1, [tmpq+32*1]
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0s1
+%else
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pmaxsw m11, m2, m3
+ pminuw m12, m2, m3
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m13, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k0s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k0s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k1s0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k1s1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m13, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m13, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1s2
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1s3
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m13, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m13, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movu m2, [tmpq+offq+32*0]
+ punpcklqdq m2, [tmpq+offq+32*1] ; k0p0
+ neg offq
+ movu m3, [tmpq+offq+32*0]
+ punpcklqdq m3, [tmpq+offq+32*1] ; k0p1
+%else
+ movu xm2, [tmpq+offq+32*0]
+ vinserti128 m2, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm3, [tmpq+offq+32*0]
+ vinserti128 m3, [tmpq+offq+32*1], 1
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pmaxsw m11, m2
+ pminuw m12, m2
+ pmaxsw m11, m3
+ pminuw m12, m3
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movu m4, [tmpq+offq+32*0]
+ punpcklqdq m4, [tmpq+offq+32*1] ; k1p0
+ neg offq
+ movu m5, [tmpq+offq+32*0]
+ punpcklqdq m5, [tmpq+offq+32*1] ; k1p1
+%else
+ movu xm4, [tmpq+offq+32*0]
+ vinserti128 m4, [tmpq+offq+32*1], 1
+ neg offq
+ movu xm5, [tmpq+offq+32*0]
+ vinserti128 m5, [tmpq+offq+32*1], 1
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ paddw m7, m2 ; constrain(diff_k0)
+ pmaxsw m11, m4
+ pminuw m12, m4
+ pmaxsw m11, m5
+ pminuw m12, m5
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, m9 ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, m10 ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ add tmpq, 32*2
+ pmaxsw m11, m1
+ pminuw m12, m1
+ paddw m0, m1
+ pminsw m0, m11
+ pmaxsw m0, m12
+%if %1 == 4
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_YMM avx2
+cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+16*6
+ %define offq r8
+ %define pri_shift rsp+16*2
+ %define sec_shift rsp+16*3
+%else
+ %define px rsp+16*4
+ %define offq r4
+ %define pri_shift rsp+16*0
+ %define sec_shift rsp+16*1
+%endif
+ %define base r8-dir_table4
+ mov edged, r9m
+ lea r8, [dir_table4]
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ vpbroadcastd m7, [base+pw_m16384]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*4+0], xm0
+ mova [px+16*5+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*4+0], m7
+.bottom_no_left:
+ movd [px+16*4-4], xm7
+ movd [px+16*5-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+ mov edged, r9m
+ movu xm0, [dstq+strideq*0]
+ movu xm1, [dstq+strideq*1]
+ lea r9, [strideq*3]
+ movu xm2, [dstq+strideq*2]
+ movu xm3, [dstq+r9 ]
+ lea r6, [dstq+strideq*4]
+ movu xm4, [r6 +strideq*0]
+ movu xm5, [r6 +strideq*1]
+ movu xm6, [r6 +strideq*2]
+ movu xm7, [r6 +r9 ]
+ lea r8, [dir_table4]
+ mova [px+16*0+0], xm0
+ mova [px+16*1+0], xm1
+ mova [px+16*2+0], xm2
+ mova [px+16*3+0], xm3
+ mova [px+16*4+0], xm4
+ mova [px+16*5+0], xm5
+ mova [px+16*6+0], xm6
+ mova [px+16*7+0], xm7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu xm0, [topq+strideq*0]
+ movu xm1, [topq+strideq*1]
+ mova [px-16*2+0], xm0
+ mova [px-16*1+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-16*2-4], xm0
+ movd [px-16*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-16*2+0], m7
+.top_no_left:
+ movd [px-16*2-4], xm7
+ movd [px-16*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu xm0, [botq+strideq*0]
+ movu xm1, [botq+strideq*1]
+ mova [px+16*8+0], xm0
+ mova [px+16*9+0], xm1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+16*8-4], xm0
+ movd [px+16*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+16*8+0], m7
+.bottom_no_left:
+ movd [px+16*8-4], xm7
+ movd [px+16*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+16*0-4], xm0
+ movd [px+16*1-4], xm1
+ movd [px+16*2-4], xm2
+ movd [px+16*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+16*4-4], xm0
+ movd [px+16*5-4], xm1
+ movd [px+16*6-4], xm2
+ movd [px+16*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \
+ pri, sec, edge
+%if WIN64
+ %define px rsp+32*4
+%else
+ %define px rsp+32*3
+%endif
+ %define base r8-dir_table8
+ mov edged, r9m
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movu m2, [r6 +strideq*0]
+ movu m3, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m4, [r6 +strideq*0]
+ movu m5, [r6 +strideq*1]
+ lea r6, [r6 +strideq*2]
+ movu m6, [r6 +strideq*0]
+ movu m7, [r6 +strideq*1]
+ lea r8, [dir_table8]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ vpbroadcastd m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd xm0, [topq+strideq*0-4]
+ movd xm1, [topq+strideq*1-4]
+ movd [px-32*2-4], xm0
+ movd [px-32*1-4], xm1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], xm7
+ movd [px-32*1-4], xm7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd xm0, [botq+strideq*0-4]
+ movd xm1, [botq+strideq*1-4]
+ movd [px+32*8-4], xm0
+ movd [px+32*9-4], xm1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], xm7
+ movd [px+32*9-4], xm7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movd xm0, [leftq+4*0]
+ movd xm1, [leftq+4*1]
+ movd xm2, [leftq+4*2]
+ movd xm3, [leftq+4*3]
+ movd [px+32*0-4], xm0
+ movd [px+32*1-4], xm1
+ movd [px+32*2-4], xm2
+ movd [px+32*3-4], xm3
+ movd xm0, [leftq+4*4]
+ movd xm1, [leftq+4*5]
+ movd xm2, [leftq+4*6]
+ movd xm3, [leftq+4*7]
+ movd [px+32*4-4], xm0
+ movd [px+32*5-4], xm1
+ movd [px+32*6-4], xm2
+ movd [px+32*7-4], xm3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 8, 8
+
+cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax
+ lea r6, [dir_shift]
+ shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
+ vpbroadcastd m4, [r6+bdmaxq*4]
+ lea r6, [strideq*3]
+ mova xm0, [srcq+strideq*0]
+ mova xm1, [srcq+strideq*1]
+ mova xm2, [srcq+strideq*2]
+ mova xm3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+r6 ], 1
+ vinserti128 m1, [srcq+strideq*2], 1
+ vinserti128 m2, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*0], 1
+ REPX {pmulhuw x, m4}, m0, m1, m2, m3
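+ ; the multiply-high by dir_shift scales the 10/12-bit samples down into the
+ ; 8-bit range, so the 8bpc direction search can be shared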
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef16_avx512.asm b/third_party/dav1d/src/x86/cdef16_avx512.asm
new file mode 100644
index 0000000000..6d625a02a0
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx512.asm
@@ -0,0 +1,622 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21
+ db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29
+ db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37
+ db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
+end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
+ dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
+ dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
+pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4
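+ ; (i.e. the {4, 2} and {3, 3} primary tap pairs)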
+cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6
+ dw 1, 2, 1, 10, 9, 18, 8, 17
+ dw 8, 16, 8, 15, -7,-14, 1, -6
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28
+ db 2, 4, 2, 36, 34, 68, 32, 66
+ db 32, 64, 32, 62,-30,-60, 2,-28
+pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3
+sec_taps4: dw 32, 16
+pw_m16384: times 2 dw -16384
+pw_2048: times 2 dw 2048
+pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4)
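+ ; initial dword accumulator value; the taps are stored << 4, so this packs
+ ; the rounding bias (see the .end_no_clip comments) while keeping the
+ ; biased sum non-negative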
+edge_mask8: dw 0x2121, 0x2020, 0x0101
+
+SECTION .text
+
+%macro CONSTRAIN 7 ; dst, p, px, zero, thresh, shift, tmp
+ psubw %1, %2, %3
+ pabsw %1, %1
+ vpcmpgtw k1, %3, %2
+ vpsrlvw %7, %1, %6
+ psubusw %7, %5, %7
+ pminsw %1, %7
+ vpsubw %1{k1}, %4, %1
+%endmacro
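+; CONSTRAIN is roughly the following scalar operation (thresh = strength,
+; shift = damping-derived shift):
+;   adiff = abs(p - px)
+;   v     = min(adiff, max(0, thresh - (adiff >> shift)))
+;   dst   = (px > p) ? -v : v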
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs4
+ lea r6, [cdef_dirs4]
+ movu xm3, [dstq+strideq*0]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ mova xm2, [leftq]
+ lea r2, [dstq+strideq*2]
+ vinserti32x4 m3, [r2+strideq*0], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m3, [r2+strideq*1], 3
+ vpermt2d m2, m5, m3
+ vinserti32x4 m1, m2, [topq+strideq*0-4], 0
+ vinserti32x4 m1, [topq+strideq*1-4], 1
+ mov r3d, edgem
+ movifnidn prid, prim
+ punpcklwd m3, m3 ; px
+ psrlw m5, 8
+ vpbroadcastd m0, [base+pd_268435568]
+ pxor m12, m12
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m6, m3, m8
+ pmaxsw m7, m3, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ call .constrain_sec
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ psrldq m8, m6, 2
+ vpshldd m3, m0, 8
+ psrldq m9, m7, 2
+ paddd m0, m3
+ pminuw m6, m8
+ psrldq m0, 1
+ pmaxsw m7, m9
+ pmaxsw m0, m6
+ pminsw m0, m7
+ vpmovdw ym0, m0
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ mov r4d, dirm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym1, [base+end_perm4]
+ vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ vpermb m0, m1, m0
+.end:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm0, ym0, 1
+ movq [r2+strideq*0], xm0
+ movhps [r2+strideq*1], xm0
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ or r3d, 0x04
+ vmovdqa32 m1{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
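+ ; .constrain_sec switches to the secondary taps/shift/direction and falls
+ ; through to .constrain, which gathers the +off/-off pixel pair for the
+ ; current direction via vpermi2w, constrains both against px (m3) and
+ ; accumulates tap * constrain() into the dword sum (m0) with vpdpwssd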
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m8, m5, m9
+ vpermi2w m8, m1, m2 ; k0p0 k1p0
+ psubw m9, m5, m9
+ vpermi2w m9, m1, m2 ; k0p1 k1p1
+ CONSTRAIN m10, m8, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ CONSTRAIN m10, m9, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ ret
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+ lea r6, [cdef_dirs4]
+ movu xm18, [dstq+strideq*0]
+ vinserti128 ym18, [dstq+strideq*1], 1
+ mova xm1, [leftq+16*0]
+ mova xm2, [leftq+16*1]
+ lea r2, [strideq*3]
+ vinserti32x4 m18, [dstq+strideq*2], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m18, [dstq+r2 ], 3
+ vpermt2d m1, m5, m18
+ vinserti32x4 m0, m1, [topq+strideq*0-4], 0
+ vinserti32x4 m0, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu xm19, [r3+strideq*0]
+ vinserti128 ym19, [r3+strideq*1], 1
+ vinserti32x4 m19, [r3+strideq*2], 2
+ vinserti32x4 m19, [r3+r2 ], 3
+ mov r3d, edgem
+ movifnidn prid, prim
+ vpermt2d m2, m5, m19
+ vpbroadcastd m16, [base+pd_268435568]
+ pxor m12, m12
+ punpcklwd m18, m18 ; px (top)
+ psrlw m5, 8
+ punpcklwd m19, m19 ; px (bottom)
+ mova m17, m16
+ vshufi32x4 m1, m2, q3210
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m3, m18, m6
+ pmaxsw m4, m18, m6
+ pminuw m20, m19, m7
+ pmaxsw m21, m19, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ call .constrain_sec
+ pminuw m3, m6
+ pmaxsw m4, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m3, m6
+ pmaxsw m4, m6
+ mov r3, 0xcccccccccccccccc
+ pminuw m20, m7
+ pmaxsw m21, m7
+ kmovq k1, r3
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vbroadcasti32x4 m0, [base+deint_shuf]
+ vpshldd m6, m20, m3, 16
+ vmovdqu8 m3{k1}, m20
+ vpshldd m18, m16, 8
+ vpshldd m7, m21, m4, 16
+ vmovdqu8 m4{k1}, m21
+ vpshldd m19, m17, 8
+ pminuw m3, m6
+ paddd m16, m18
+ pmaxsw m4, m7
+ paddd m17, m19
+ psrldq m16, 1
+ palignr m16{k1}, m17, m17, 15
+ lea r6, [dstq+strideq*4]
+ pmaxsw m16, m3
+ pminsw m16, m4
+ pshufb m16, m0
+ movq [dstq+strideq*0], xm16
+ movhps [r6 +strideq*0], xm16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*1], xm17
+ movhps [r6 +strideq*1], xm17
+ vextracti32x4 xm17, m16, 2
+ movq [dstq+strideq*2], xm17
+ movhps [r6 +strideq*2], xm17
+ vextracti32x4 xm16, m16, 3
+ movq [dstq+r2 ], xm16
+ movhps [r6 +r2 ], xm16
+ RET
+.sec_only:
+ mov r4d, dirm
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym20, [base+end_perm4]
+ vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m19, m17, 8
+ paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddd m17, m19
+ vpermb m16, m20, m16
+ vpermb m17, m20, m17
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ vextracti128 xm16, ym16, 1
+ movq [dstq+strideq*2], xm16
+ movhps [dstq+r2 ], xm16
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm17
+ movhps [dstq+strideq*1], xm17
+ vextracti128 xm17, ym17, 1
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ mov r4d, r3d
+ or r3d, 0x0c
+ vmovdqa32 m0{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ or r4d, 0x04
+ vmovdqa32 m1{k1}, m6
+ kmovw k1, [base+edge_mask4-8+r4*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m7, m5, m9
+ mova m6, m0
+ vpermt2w m6, m7, m1 ; k0p0 k1p0 (top)
+ psubw m9, m5, m9
+ mova m8, m0
+ vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom)
+ CONSTRAIN m10, m6, m18, m12, m13, m14, m11
+ vpermt2w m8, m9, m1 ; k0p1 k1p1 (top)
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m7, m19, m12, m13, m14, m11
+ vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom)
+ vpdpwssd m17, m10, m15
+ CONSTRAIN m10, m8, m18, m12, m13, m14, m11
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m9, m19, m12, m13, m14, m11
+ vpdpwssd m17, m10, m15
+ ret
+
+cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs8
+ lea r6, [cdef_dirs8]
+ movu ym17, [dstq+strideq*0]
+ vinserti32x8 m17, [dstq+strideq*1], 1
+ movq xm4, [leftq+8*0]
+ movq xm5, [leftq+8*1]
+ psrld m2, [base+cdef_perm], 16
+ movq xm6, [leftq+8*2]
+ movq xm7, [leftq+8*3]
+ lea r2, [strideq*3]
+ movu ym16, [topq+strideq*0-4]
+ vinserti32x8 m16, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu ym18, [dstq+strideq*2]
+ vinserti32x8 m18, [dstq+r2 ], 1
+ movu ym19, [r3+strideq*0]
+ vinserti32x8 m19, [r3+strideq*1], 1
+ movu ym20, [r3+strideq*2]
+ vinserti32x8 m20, [r3+r2 ], 1
+ vshufi32x4 m0, m17, m18, q2020 ; px (top)
+ mov r3d, edgem
+ vshufi32x4 m1, m19, m20, q2020 ; px (bottom)
+ movifnidn prid, prim
+ vpermt2d m17, m2, m4
+ vpermt2d m18, m2, m5
+ pxor m12, m12
+ vpermt2d m19, m2, m6
+ vpermt2d m20, m2, m7
+ cmp r3d, 0x0f
+ jne .mask_edges
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+.main:
+ mova [rsp+64*0], m16 ; top
+ mova [rsp+64*1], m17 ; 0 1
+ mova [rsp+64*2], m18 ; 2 3
+ mova [rsp+64*3], m19 ; 4 5
+ mova [rsp+64*4], m20 ; 6 7
+ mova [rsp+64*5], m21 ; bottom
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ add r4d, r3d ; pri_shift
+ vpbroadcastw m14, r4d
+ mov r4d, dirm
+ vpbroadcastd m2, [base+pri_taps8+priq*2+0]
+ vpbroadcastd m3, [base+pri_taps8+priq*2+4]
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
+ pmaxsw m14, m12
+ call .constrain
+ mov r5d, secm
+ pmullw m16, m8, m2
+ pmullw m17, m9, m2
+ test r5d, r5d
+ jnz .pri_sec
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ call .constrain
+ pmullw m8, m3
+ pmullw m9, m3
+ jmp .end_no_clip
+.pri_sec:
+ lzcnt r5d, r5d
+ add r3d, r5d ; sec_shift
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ pminuw m18, m0, m4
+ pmaxsw m19, m0, m4
+ pminuw m20, m1, m5
+ pmaxsw m21, m1, m5
+ call .min_max_constrain2
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
+ pmullw m8, m3
+ pmullw m9, m3
+ vpbroadcastw m13, secm
+ vpbroadcastw m14, r3d
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
+ mova m2, m8
+ mova m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
+ paddw m2, m8
+ paddw m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
+ paddw m2, m2
+ paddw m3, m3
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m2
+ paddw m17, m3
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ paddw m16, m0
+ paddw m17, m1
+ pmaxsw m16, m18
+ pmaxsw m17, m20
+ pminsw m16, m19
+ pminsw m17, m21
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r4d, dirm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0]
+ vpbroadcastw m14, r3d
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0]
+ mova m16, m8
+ mova m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1]
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1]
+ paddw m16, m16
+ paddw m17, m17
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+.end_no_clip:
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ paddw m16, m0
+ paddw m17, m1
+.end:
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm17
+ vextracti128 [dstq+strideq*1], ym17, 1
+ vextracti32x4 [dstq+strideq*2], m17, 2
+ vextracti32x4 [dstq+r2 ], m17, 3
+ RET
+.mask_edges:
+ vpbroadcastd m2, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+ jmp .mask_edges_top
+.mask_edges_no_bottom:
+ mova m21, m2
+.mask_edges_top:
+ test r3b, 0x04
+ jnz .mask_edges_main
+ mova m16, m2
+.mask_edges_main:
+ and r3d, 0x03
+ cmp r3d, 0x03
+ je .main
+ kmovw k1, [base+edge_mask8+r3*2]
+ vmovdqa32 m16{k1}, m2 ; edge pixels = -16384
+ vmovdqa32 m17{k1}, m2
+ vmovdqa32 m18{k1}, m2
+ vmovdqa32 m19{k1}, m2
+ vmovdqa32 m20{k1}, m2
+ vmovdqa32 m21{k1}, m2
+ jmp .main
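+ ; .constrain loads the +off/-off (r5) pixel pairs for the current tap from
+ ; the rows stashed on the stack, constrains them against px (m0 = top half,
+ ; m1 = bottom half) and returns the constrained sums in m8/m9; the
+ ; .min_max_constrain entry points additionally fold the pixels loaded by
+ ; the previous call (m4-m7) into the clipping min/max (m18-m21)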
+ALIGN function_align
+.min_max_constrain:
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+.min_max_constrain2:
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+.constrain:
+ %define tmp rsp+gprsize+68
+ movu m4, [tmp+r5+64*0]
+ vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
+ movu m5, [tmp+r5+64*2]
+ vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
+ neg r5
+ movu m6, [tmp+r5+64*0]
+ vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
+ movu m7, [tmp+r5+64*2]
+ vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
+ CONSTRAIN m8, m4, m0, m12, m13, m14, m15
+ CONSTRAIN m9, m5, m1, m12, m13, m14, m15
+ CONSTRAIN m10, m6, m0, m12, m13, m14, m15
+ CONSTRAIN m11, m7, m1, m12, m13, m14, m15
+ paddw m8, m10
+ paddw m9, m11
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm
new file mode 100644
index 0000000000..1bd67ace64
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_sse.asm
@@ -0,0 +1,1033 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright (c) 2017-2021, The rav1e contributors
+; Copyright (c) 2021, Nathan Egge
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+%macro DUP8 1-*
+ %rep %0
+ times 8 dw %1
+ %rotate 1
+ %endrep
+%endmacro
+
+pri_taps: DUP8 4, 2, 3, 3
+dir_table: db 1 * 32 + 0, 2 * 32 + 0
+ db 1 * 32 + 0, 2 * 32 - 2
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+ db 0 * 32 + 2, 0 * 32 + 4
+ db 0 * 32 + 2, 1 * 32 + 4
+ db 1 * 32 + 2, 2 * 32 + 4
+ db 1 * 32 + 0, 2 * 32 + 2
+ db 1 * 32 + 0, 2 * 32 + 0
+ db 1 * 32 + 0, 2 * 32 - 2
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+
+dir_shift: times 4 dw 0x4000
+ times 4 dw 0x1000
+
+pw_128: times 4 dw 128
+pw_2048: times 8 dw 2048
+pw_m16384: times 8 dw -16384
+
+cextern cdef_dir_8bpc_ssse3.main
+cextern cdef_dir_8bpc_sse4.main
+cextern shufw_6543210x
+
+SECTION .text
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 3
+%elif WIN64
+DECLARE_REG_TMP 8, 4
+%else
+DECLARE_REG_TMP 8, 6
+%endif
+
+%macro CDEF_FILTER 2 ; w, h
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
+ mova m8, [base+pw_2048]
+%else
+ DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
+ %define m8 [base+pw_2048]
+ %define m9 [rsp+16*1+gprsize]
+ %define m10 [rsp+16*2+gprsize]
+%endif
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ test prid, prid
+ jz .sec_only
+ movd m6, r5m
+%if ARCH_X86_32
+ mov [rsp+24], pridmpd
+%endif
+ bsr pridmpd, prid
+ lea tmpd, [priq*4]
+ cmp dword r10m, 0x3ff ; if (bpc == 10)
+ cmove prid, tmpd ; pri <<= 2
+ mov tmpd, r8m ; damping
+ mov dird, r7m
+ and prid, 16
+ pshufb m6, m7 ; splat
+ lea dirq, [base+dir_table+dirq*2]
+ lea priq, [base+pri_taps+priq*2]
+ test secd, secd
+ jz .pri_only
+ mova [rsp], m6
+ movd m6, secd
+ tzcnt secd, secd
+ sub pridmpd, tmpd
+ sub tmpd, secd
+ pshufb m6, m7
+ xor secd, secd
+ neg pridmpd
+ cmovs pridmpd, secd
+%if ARCH_X86_32
+ mov [pri_shift+4], secd
+ mov [sec_shift+4], secd
+%endif
+ mov [pri_shift+0], pridmpq
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px]
+%if WIN64
+ movaps r4m, m9
+ movaps r6m, m10
+%elif ARCH_X86_32
+ mov pridmpd, [rsp+24]
+%endif
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps m9, r4m
+ movaps m10, r6m
+%endif
+ jmp .end
+.pri_only:
+ sub tmpd, pridmpd
+ cmovs tmpd, secd
+%if ARCH_X86_32
+ mov pridmpd, [rsp+24]
+ mov [pri_shift+4], secd
+%endif
+ mov [pri_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.end:
+ RET
+.sec_only:
+ mov tmpd, r8m ; damping
+ movd m6, r6m
+ tzcnt secd, secd
+ mov dird, r7m
+ pshufb m6, m7
+ sub tmpd, secd
+ lea dirq, [base+dir_table+dirq*2]
+%if ARCH_X86_32
+ mov [sec_shift+4], prid
+%endif
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
+ %else
+ DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
+ %endif
+ALIGN function_align
+.pri:
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, [priq+16*0] ; pri_tap_k0
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
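+ ; rounding: psraw/paddw gives sum - (sum < 0), and pmulhrsw with pw_2048
+ ; computes (x*2048 + 16384) >> 15 = (x + 8) >> 4, so the final value below
+ ; is px + ((sum - (sum < 0) + 8) >> 4)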
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.sec:
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.pri_sec:
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pabsw m4, m2
+%if ARCH_X86_64
+ pabsw m10, m3
+ pmaxsw m9, m2, m3
+ pminsw m10, m4
+%else
+ pabsw m7, m3
+ pmaxsw m5, m2, m3
+ pminsw m4, m7
+ mova m9, m5
+ mova m10, m4
+%endif
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ pabsw m7, m4
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ pabsw m7, m2
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pabsw m5, m3
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ pabsw m7, m4
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ pabsw m7, m2
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pabsw m5, m3
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, [rsp+gprsize], m5
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, [rsp+gprsize], m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4
+ paddw m7, m2 ; constrain(diff_k0)
+ pabsw m2, m5
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m3
+ pmaxsw m9, m5
+ pminsw m10, m2
+%else
+ pminsw m3, m10
+ pminsw m3, m2
+ pmaxsw m2, m9, m4
+ pmaxsw m2, m5
+ mova m10, m3
+ mova m9, m2
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, [priq+16*0] ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, [rsp+16*0+gprsize], m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, [rsp+16*0+gprsize], m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15
+ paddw m0, m2
+ pmulhrsw m0, m8
+ paddw m0, m1
+%if ARCH_X86_64
+ pmaxsw m9, m1
+ pminsw m0, m9
+%else
+ pmaxsw m2, m9, m1
+ pminsw m0, m2
+%endif
+ pminsw m1, m10
+ pmaxsw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
+ pri, sec, edge
+ %define px rsp+32*4
+%else
+cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
+ %define botq topq
+ %define px rsp+32*5
+%endif
+ %define base t0-dir_table
+ %define pri_shift px-16*6
+ %define sec_shift px-16*5
+ mov edged, r9m
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ movddup m7, [base+pw_m16384]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*4+0], m0
+ mova [px+32*5+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*4+0], m7
+ mova [px+32*5+0], m7
+.bottom_no_left:
+ movd [px+32*4-4], m7
+ movd [px+32*5-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+%if ARCH_X86_64
+cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+%else
+cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r9m
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0]
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m4, [t1 +strideq*0]
+ movu m5, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m6, [t1 +strideq*0]
+ movu m7, [t1 +strideq*1]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ movddup m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*8-4], m0
+ movd [px+32*9-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], m7
+ movd [px+32*9-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ movd [px+32*6-4], m2
+ movd [px+32*7-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+%if ARCH_X86_64
+cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
+ pri, sec, edge
+%else
+cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r9m
+ LEA t0, dir_table
+ mova m0, [dstq+strideq*0+ 0]
+ movd m1, [dstq+strideq*0+16]
+ mova m2, [dstq+strideq*1+ 0]
+ movd m3, [dstq+strideq*1+16]
+ lea t1, [dstq+strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova [px+32*0+ 0], m0
+ movd [px+32*0+16], m1
+ mova [px+32*1+ 0], m2
+ movd [px+32*1+16], m3
+ mova [px+32*2+ 0], m4
+ movd [px+32*2+16], m5
+ mova [px+32*3+ 0], m6
+ movd [px+32*3+16], m7
+ mova m0, [t1 +strideq*0+ 0]
+ movd m1, [t1 +strideq*0+16]
+ mova m2, [t1 +strideq*1+ 0]
+ movd m3, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ mova [px+32*4+ 0], m0
+ movd [px+32*4+16], m1
+ mova [px+32*5+ 0], m2
+ movd [px+32*5+16], m3
+ mova [px+32*6+ 0], m4
+ movd [px+32*6+16], m5
+ mova [px+32*7+ 0], m6
+ movd [px+32*7+16], m7
+ movddup m7, [base+pw_m16384]
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ mova m0, [topq+strideq*0+ 0]
+ mova m1, [topq+strideq*0+16]
+ mova m2, [topq+strideq*1+ 0]
+ mova m3, [topq+strideq*1+16]
+ mova [px-32*2+ 0], m0
+ movd [px-32*2+16], m1
+ mova [px-32*1+ 0], m2
+ movd [px-32*1+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+ 0], m7
+ movd [px-32*2+16], m7
+ mova [px-32*1+ 0], m7
+ movd [px-32*1+16], m7
+.top_no_left:
+ movd [px-32*2- 4], m7
+ movd [px-32*1- 4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ movifnidn botq, r4mp
+ mova m0, [botq+strideq*0+ 0]
+ movd m1, [botq+strideq*0+16]
+ mova m2, [botq+strideq*1+ 0]
+ movd m3, [botq+strideq*1+16]
+ mova [px+32*8+ 0], m0
+ movd [px+32*8+16], m1
+ mova [px+32*9+ 0], m2
+ movd [px+32*9+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [botq+strideq*0-4]
+ movd m1, [botq+strideq*1-4]
+ movd [px+32*8- 4], m0
+ movd [px+32*9- 4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+ 0], m7
+ movd [px+32*8+16], m7
+ mova [px+32*9+ 0], m7
+ movd [px+32*9+16], m7
+.bottom_no_left:
+ movd [px+32*8- 4], m7
+ movd [px+32*9- 4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0]
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0- 4], m0
+ movd [px+32*1- 4], m1
+ movd [px+32*2- 4], m2
+ movd [px+32*3- 4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4- 4], m0
+ movd [px+32*5- 4], m1
+ movd [px+32*6- 4], m2
+ movd [px+32*7- 4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+ CDEF_FILTER 8, 8
+
+%macro CDEF_DIR 0
+%if ARCH_X86_64
+cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
+ lea r6, [dir_shift]
+ shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
+ movddup m7, [r6+bdmaxq*8]
+ lea r6, [strideq*3]
+ mova m0, [srcq+strideq*0]
+ mova m1, [srcq+strideq*1]
+ mova m2, [srcq+strideq*2]
+ mova m3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ mova m4, [srcq+strideq*0]
+ mova m5, [srcq+strideq*1]
+ mova m6, [srcq+strideq*2]
+ REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhuw m7, [srcq+r6 ]
+ pxor m8, m8
+ packuswb m9, m0, m1
+ packuswb m10, m2, m3
+ packuswb m11, m4, m5
+ packuswb m12, m6, m7
+ REPX {psadbw x, m8}, m9, m10, m11, m12
+ packssdw m9, m10
+ packssdw m11, m12
+ packssdw m9, m11
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+%else
+cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
+ mov bdmaxd, bdmaxm
+ LEA r2, dir_shift
+ shr bdmaxd, 11
+ movddup m7, [r2+bdmaxq*8]
+ lea r3, [strideq*3]
+ pmulhuw m3, m7, [srcq+strideq*0]
+ pmulhuw m4, m7, [srcq+strideq*1]
+ pmulhuw m5, m7, [srcq+strideq*2]
+ pmulhuw m6, m7, [srcq+r3 ]
+ movddup m1, [r2-dir_shift+pw_128]
+ lea srcq, [srcq+strideq*4]
+ pxor m0, m0
+ packuswb m2, m3, m4
+ psubw m3, m1
+ psubw m4, m1
+ mova [esp+0x00], m3
+ mova [esp+0x10], m4
+ packuswb m3, m5, m6
+ psadbw m2, m0
+ psadbw m3, m0
+ psubw m5, m1
+ psubw m6, m1
+ packssdw m2, m3
+ mova [esp+0x20], m5
+ mova [esp+0x50], m6
+ pmulhuw m4, m7, [srcq+strideq*0]
+ pmulhuw m5, m7, [srcq+strideq*1]
+ pmulhuw m6, m7, [srcq+strideq*2]
+ pmulhuw m7, [srcq+r3 ]
+ packuswb m3, m4, m5
+ packuswb m1, m6, m7
+ psadbw m3, m0
+ psadbw m1, m0
+ packssdw m3, m1
+ movddup m1, [r2-dir_shift+pw_128]
+ LEA r2, shufw_6543210x
+ jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
+%endif
+%endmacro
+
+INIT_XMM ssse3
+CDEF_DIR
+
+INIT_XMM sse4
+CDEF_DIR
diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm
new file mode 100644
index 0000000000..1f30f8a3b7
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_avx2.asm
@@ -0,0 +1,1772 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_jmptable %%table
+ %xdefine %%base mangle(private_prefix %+ _%1_avx2)
+ %%table:
+ %rep %0 - 1
+ dd %%base %+ .%2 - %%table
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro CDEF_FILTER_JMP_TABLE 1
+JMP_TABLE cdef_filter_%1_8bpc, \
+ d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
+ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1
+%endmacro
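+; the table is padded with two extra directions at both ends (d6/d7 before,
+; d0/d1 after) so that the dir-2 and dir+2 lookups in ACCUMULATE_TAP_BYTE
+; never need a modulo-8 wrap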
+
+SECTION_RODATA 32
+
+pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
+blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
+ dd 0x80, 0x00, 0x00
+blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ dd 0x00, 0x00
+blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000
+blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000, 0x0000
+blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
+blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
+div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
+shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+tap_table: ; masks for 8 bit shifts
+ db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
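+ ; (bytes are shifted as words since there is no psrlb instruction; these
+ ; masks clear the bits that leak in from the neighbouring byte)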
+ ; weights
+ db 4, 2, 3, 3, 2, 1
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+
+CDEF_FILTER_JMP_TABLE 4x4
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
+SECTION .text
+
+%macro PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
+ lea dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, stride3, k
+ %else
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
+ table, dir, dirjmp, dst4, stride3, k
+ lea dst4q, [dstq+strideq*4]
+ %endif
+%else
+ DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
+ table, dir, dirjmp, top2, stride3, k
+ mov hq, -8
+ lea top1q, [top1q+strideq*0]
+ lea top2q, [top1q+strideq*1]
+%endif
+%if %1 == 4
+ lea stride3q, [strideq*3]
+%endif
+%endmacro
+
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+ pxor m15, m15 ; sum
+%if %2 == 8
+ pxor m12, m12
+ %if %1 == 4
+ movd xm4, [dstq +strideq*0]
+ movd xm6, [dstq +strideq*1]
+ movd xm5, [dstq +strideq*2]
+ movd xm7, [dstq +stride3q ]
+ vinserti128 m4, [dst4q+strideq*0], 1
+ vinserti128 m6, [dst4q+strideq*1], 1
+ vinserti128 m5, [dst4q+strideq*2], 1
+ vinserti128 m7, [dst4q+stride3q ], 1
+ punpckldq m4, m6
+ punpckldq m5, m7
+ %else
+ movq xm4, [dstq+strideq*0]
+ movq xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ %endif
+ punpcklqdq m4, m5
+%else
+ movd xm4, [dstq+strideq*0]
+ movd xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ punpckldq m4, m5
+%endif
+%if %3 == 1
+ mova m7, m4 ; min
+ mova m8, m4 ; max
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, h, clip
+ ; load p0/p1
+ movsxd dirjmpq, [dirq+kq*4+%1*2*4]
+ add dirjmpq, tableq
+ call dirjmpq
+
+%if %8 == 1
+ pmaxub m7, m5
+ pminub m8, m5
+ pmaxub m7, m6
+ pminub m8, m6
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+ punpcklbw m5, m6
+ punpcklbw m6, m4, m4
+ psubusb m9, m5, m6
+ psubusb m5, m6, m5
+ por m9, m5 ; abs_diff_p01(p01 - px)
+ pcmpeqb m5, m9
+ por m5, %5
+ psignb m6, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m6
+ paddw m15, m5
+%else
+ psubusb m9, m5, m4
+ psubusb m5, m4, m5
+ psubusb m11, m6, m4
+ psubusb m6, m4, m6
+ por m9, m5 ; abs_diff_p0(p0 - px)
+ por m11, m6 ; abs_diff_p1(p1 - px)
+ pcmpeqb m5, m9
+ pcmpeqb m6, m11
+ punpckhbw m10, m9, m11
+ punpcklbw m9, m11
+ por m5, %5
+ por m11, m6, %5
+ punpckhbw m6, m5, m11
+ punpcklbw m5, m11
+ psignb m11, %5, m6
+ psrlw m6, m10, %2 ; emulate 8-bit shift
+ pand m6, %3
+ psubusb m6, %4, m6
+ pminub m6, m10
+ pmaddubsw m6, m11
+ paddw m12, m6
+ psignb m11, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m11
+ paddw m15, m5
+%endif
+%endmacro
+
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+ %if %5 == 1
+ punpcklbw m4, %3
+ %endif
+ pcmpgtw %3, m15
+ paddw m15, %3
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m15
+ paddb m4, m15
+ %else
+ paddw m4, m15
+ packuswb m4, m4 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ pcmpgtw m6, %3, m12
+ pcmpgtw m5, %3, m15
+ paddw m12, m6
+ paddw m15, m5
+ %if %5 == 1
+ punpckhbw m5, m4, %3
+ punpcklbw m4, %3
+ %endif
+ pmulhrsw m12, %4
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m12
+ paddb m4, m15
+ %else
+ paddw m5, m12
+ paddw m4, m15
+ packuswb m4, m5 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ %if %1 == 4
+ movd [dstq +strideq*0], xm4
+ movd [dst4q+strideq*0], xm5
+ pextrd [dstq +strideq*1], xm4, 1
+ pextrd [dst4q+strideq*1], xm5, 1
+ pextrd [dstq +strideq*2], xm4, 2
+ pextrd [dst4q+strideq*2], xm5, 2
+ pextrd [dstq +stride3q ], xm4, 3
+ pextrd [dst4q+stride3q ], xm5, 3
+ %else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
+
+%macro BORDER_PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r7m
+ lea dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
+ %else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
+ %endif
+ mov hd, %1*%2*2/mmsize
+%else
+ DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
+%endif
+ lea stkq, [px]
+ pxor m11, m11
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+%if %1 == 4
+ movq xm4, [stkq+32*0]
+ movhps xm4, [stkq+32*1]
+ movq xm5, [stkq+32*2]
+ movhps xm5, [stkq+32*3]
+ vinserti128 m4, xm5, 1
+%else
+ mova xm4, [stkq+32*0] ; px
+ vinserti128 m4, [stkq+32*1], 1
+%endif
+ pxor m15, m15 ; sum
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, clip
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1] ; off1
+%if %6 == 4
+ movq xm5, [stkq+offq*2+32*0] ; p0
+ movq xm6, [stkq+offq*2+32*2]
+ movhps xm5, [stkq+offq*2+32*1]
+ movhps xm6, [stkq+offq*2+32*3]
+ vinserti128 m5, xm6, 1
+%else
+ movu xm5, [stkq+offq*2+32*0] ; p0
+ vinserti128 m5, [stkq+offq*2+32*1], 1
+%endif
+ neg offq ; -off1
+%if %6 == 4
+ movq xm6, [stkq+offq*2+32*0] ; p1
+ movq xm9, [stkq+offq*2+32*2]
+ movhps xm6, [stkq+offq*2+32*1]
+ movhps xm9, [stkq+offq*2+32*3]
+ vinserti128 m6, xm9, 1
+%else
+ movu xm6, [stkq+offq*2+32*0] ; p1
+ vinserti128 m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+ ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
+ pmaxsw m7, m5 ; max after p0
+ pminuw m8, m5 ; min after p0
+ pmaxsw m7, m6 ; max after p1
+ pminuw m8, m6 ; min after p1
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+ ; calculate difference before converting
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+
+ ; convert to 8-bits with signed saturation
+ ; saturating to large diffs has no impact on the results
+ packsswb m5, m6
+
+ ; group into pairs so we can accumulate using maddubsw
+ pshufb m5, m12
+ pabsb m9, m5
+ psignb m10, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+
+ ; use unsigned min since abs diff can equal 0x80
+ pminub m5, m9
+ pmaddubsw m5, m10
+ paddw m15, m5
+%endmacro
+
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+ pcmpgtw m9, m11, m15
+ paddw m15, m9
+ pmulhrsw m15, %2
+ paddw m4, m15
+%if %3 == 1
+ pminsw m4, m7
+ pmaxsw m4, m8
+%endif
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+%if %1 == 4
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+ mov edged, edgem
+ cmp edged, 0xf
+ jne .border_block
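+ ; edge == 0xf means all four neighbouring edges are available; otherwise take
+ ; the slower border path, which expands the block into a padded 16-bit buffer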
+
+ PUSH r11
+ PUSH r12
+%if %2 == 4
+%assign regs_used 13
+ ALLOC_STACK 0x60, 16
+ pmovzxbw xm0, [leftq+1]
+ vpermq m0, m0, q0110
+ psrldq m1, m0, 4
+ vpalignr m2, m0, m0, 12
+ movu [rsp+0x10], m0
+ movu [rsp+0x28], m1
+ movu [rsp+0x40], m2
+%elif %1 == 4
+%assign regs_used 14
+ PUSH r13
+ ALLOC_STACK 8*2+%1*%2*1, 16
+ pmovzxwd m0, [leftq]
+ mova [rsp+0x10], m0
+%else
+%assign regs_used 15
+ PUSH r13
+ PUSH r14
+ ALLOC_STACK 8*4+%1*%2*2+32, 16
+ lea r11, [strideq*3]
+ movu xm4, [dstq+strideq*2]
+ pmovzxwq m0, [leftq+0]
+ pmovzxwq m1, [leftq+8]
+ vinserti128 m4, [dstq+r11], 1
+ pmovzxbd m2, [leftq+1]
+ pmovzxbd m3, [leftq+9]
+ mov [rsp+16], botq
+ mova [rsp+0x20], m0
+ mova [rsp+0x40], m1
+ mova [rsp+0x60], m2
+ mova [rsp+0x80], m3
+ mova [rsp+0xa0], m4
+ lea botq, [dstq+strideq*4]
+%endif
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
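+ ; pri taps are {4, 2} or {3, 3} depending on pri_strength & 1, sec taps are {2, 1}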
+
+ PREP_REGS %1, %2
+%if %1*%2 > mmsize
+.v_loop:
+%endif
+ LOAD_BLOCK %1, %2, 1
+.k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
+ dec kq
+ jge .k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ pxor m9, m9
+ ADJUST_PIXEL %1, %2, m9, m10, 1
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .v_loop
+%endif
+ RET
+
+.pri_only:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m3, [pw_2048]
+ pxor m1, m1
+%if %1*%2 > mmsize
+.pri_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+ dec kq
+ jge .pri_k_loop
+ ADJUST_PIXEL %1, %2, m1, m3
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .pri_v_loop
+%endif
+ RET
+
+.sec_only:
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m2, [pw_2048]
+ pxor m0, m0
+%if %1*%2 > mmsize
+.sec_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+ dec kq
+ jge .sec_k_loop
+ ADJUST_PIXEL %1, %2, m0, m2
+%if %1*%2 > mmsize
+ lea dstq, [dstq+strideq*4]
+ lea top1q, [rsp+0xa0]
+ lea top2q, [rsp+0xb0]
+ mov botq, [rsp+16]
+ add hq, 4
+ jl .sec_v_loop
+%endif
+ RET
+
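+ ; the .d<dir>k<k> helpers below gather the two taps for a given direction and
+ ; tap index into m5/m6, blending in left/top/bottom pixels where the taps
+ ; cross the block border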
+.d0k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-1]
+ vpbroadcastq m10, [dstq+strideq*2-1]
+ movd xm5, [topq+strideq*1+1]
+ movd xm9, [dstq+strideq*0+1]
+ psrldq m11, m6, 2
+ psrldq m12, m10, 2
+ vinserti128 m6, [dstq+stride3q -1], 1
+ vinserti128 m10, [botq -1], 1
+ vpblendd m5, m11, 0x10
+ vpblendd m9, m12, 0x10
+ movu m11, [blend_4x4+16]
+ punpckldq m6, m10
+ punpckldq m5, m9
+ vpblendvb m6, [rsp+gprsize+0x28], m11
+ %else
+ movd xm5, [topq +strideq*1+1]
+ movq xm6, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm5, [dstq +strideq*0+1], 1
+ movhps xm6, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm9, xm6, 2
+ shufps xm5, xm9, q2010 ; -1 +0 +1 +2
+ shufps xm6, xm10, q2020 ; +1 +2 +3 +4
+ psrldq xm9, xm11, 2
+ psrldq xm10, 2
+ shufps xm10, xm9, q2020 ; +3 +4 +5 +6
+ movd xm9, [dst4q+stride3q -1]
+ pinsrd xm9, [botq -1], 1
+ shufps xm11, xm9, q1020 ; +5 +6 +7 +8
+ pmovzxbw m9, [leftq+3]
+ vinserti128 m6, xm11, 1
+ movu m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [top2q +1]
+ vbroadcasti128 m10, [dstq+strideq*1-1]
+ vbroadcasti128 m11, [dstq+strideq*2-1]
+ movhps xm5, [dstq+strideq*0+1]
+ vinserti128 m6, m10, [dstq+stride3q-1], 1
+ vinserti128 m9, m11, [botq -1], 1
+ psrldq m10, 2
+ psrldq m11, 2
+ punpcklqdq m6, m9
+ movu m9, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m11
+ vpblendd m5, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
+%endif
+ ret
+.d1k0:
+.d2k0:
+.d3k0:
+%if %1 == 4
+ %if %2 == 4
+ movq xm6, [dstq+strideq*0-1]
+ movq xm9, [dstq+strideq*1-1]
+ vinserti128 m6, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m11, [rsp+gprsize+0x10]
+ pcmpeqd m12, m12
+ psrldq m5, m6, 2
+ psrldq m10, m9, 2
+ psrld m12, 24
+ punpckldq m6, m9
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movq xm6, [dstq +strideq*0-1]
+ movq xm9, [dstq +strideq*2-1]
+ movhps xm6, [dstq +strideq*1-1]
+ movhps xm9, [dstq +stride3q -1]
+ movq xm10, [dst4q+strideq*0-1]
+ movhps xm10, [dst4q+strideq*1-1]
+ psrldq xm5, xm6, 2
+ psrldq xm11, xm9, 2
+ shufps xm5, xm11, q2020
+ movq xm11, [dst4q+strideq*2-1]
+ movhps xm11, [dst4q+stride3q -1]
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, xm11, q2020
+ vinserti128 m6, xm9, 1
+ pmovzxbw m9, [leftq+1]
+ psrldq xm10, 2
+ psrldq xm11, 2
+ shufps xm10, xm11, q2020
+ vpbroadcastd m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ movu xm5, [dstq+strideq*0-1]
+ movu xm9, [dstq+strideq*1-1]
+ vinserti128 m5, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m10, [blend_8x8_0+16]
+ punpcklqdq m6, m5, m9
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d4k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m10, [dstq+strideq*1-1]
+ vpbroadcastq m11, [dstq+strideq*2-1]
+ movd xm6, [topq+strideq*1-1]
+ movd xm9, [dstq+strideq*0-1]
+ psrldq m5, m10, 2
+ psrldq m12, m11, 2
+ vpblendd m6, m10, 0x10
+ vpblendd m9, m11, 0x10
+ movu m10, [blend_4x4]
+ vinserti128 m5, [dstq+stride3q +1], 1
+ vinserti128 m12, [botq +1], 1
+ punpckldq m6, m9
+ punpckldq m5, m12
+ vpblendvb m6, [rsp+gprsize+0x40], m10
+ %else
+ movd xm6, [topq +strideq*1-1]
+ movq xm9, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm6, [dstq +strideq*0-1], 1
+ movhps xm9, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm5, xm9, 2
+ shufps xm6, xm9, q2010
+ psrldq xm9, xm10, 2
+ shufps xm5, xm9, q2020
+ shufps xm10, xm11, q2020
+ movd xm9, [dst4q+stride3q +1]
+ vinserti128 m6, xm10, 1
+ pinsrd xm9, [botq +1], 1
+ psrldq xm11, 2
+ pmovzxbw m10, [leftq-1]
+ shufps xm11, xm9, q1020
+ movu m9, [blend_4x8_0]
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, m10, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0+8]
+ movq xm6, [top2q -1]
+ vbroadcasti128 m5, [dstq+strideq*1-1]
+ vbroadcasti128 m9, [dstq+strideq*2-1]
+ movhps xm6, [dstq+strideq*0-1]
+ movu m11, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m5, m9
+ vinserti128 m5, [dstq+stride3q -1], 1
+ vinserti128 m9, [botq -1], 1
+ vpblendd m6, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d5k0:
+.d6k0:
+.d7k0:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*1 ]
+ vpbroadcastd m5, [dstq+strideq*1 ]
+ vpbroadcastd m9, [dstq+strideq*2 ]
+ vpblendd xm6, [dstq+strideq*0-4], 0x2
+ vpblendd m5, m9, 0x22
+ vpblendd m6, m5, 0x30
+ vinserti128 m5, [dstq+stride3q ], 1
+ vpblendd m5, [botq -20], 0x20
+ %else
+ movd xm6, [topq +strideq*1]
+ movd xm5, [dstq +strideq*1]
+ movd xm9, [dstq +stride3q ]
+ movd xm10, [dst4q+strideq*1]
+ movd xm11, [dst4q+stride3q ]
+ pinsrd xm6, [dstq +strideq*0], 1
+ pinsrd xm5, [dstq +strideq*2], 1
+ pinsrd xm9, [dst4q+strideq*0], 1
+ pinsrd xm10, [dst4q+strideq*2], 1
+ pinsrd xm11, [botq ], 1
+ punpcklqdq xm6, xm5
+ punpcklqdq xm5, xm9
+ punpcklqdq xm9, xm10
+ punpcklqdq xm10, xm11
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ %endif
+%else
+ movq xm6, [top2q ]
+ movq xm5, [dstq+strideq*1]
+ movq xm9, [dstq+stride3q ]
+ movhps xm6, [dstq+strideq*0]
+ movhps xm5, [dstq+strideq*2]
+ movhps xm9, [botq ]
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+%endif
+ ret
+.d0k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [dstq+strideq*2-2]
+ movd xm9, [dstq+stride3q -2]
+ movd xm5, [topq+strideq*0+2]
+ movd xm10, [topq+strideq*1+2]
+ pinsrw xm6, [leftq+4], 0
+ pinsrw xm9, [leftq+6], 0
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movq xm6, [dstq +strideq*2-2]
+ movd xm10, [dst4q+strideq*2-2]
+ movd xm5, [topq +strideq*0+2]
+ movq xm9, [dst4q+strideq*0-2]
+ movhps xm6, [dstq +stride3q -2]
+ pinsrw xm10, [dst4q+stride3q ], 3
+ pinsrd xm5, [topq +strideq*1+2], 1
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [botq +strideq*0-2], 2
+ pinsrd xm5, [dstq +strideq*0+2], 2
+ pinsrd xm10, [botq +strideq*1-2], 3
+ pinsrd xm5, [dstq +strideq*1+2], 3
+ shufps xm11, xm6, xm9, q3131
+ shufps xm6, xm9, q2020
+ movu m9, [blend_4x8_3+8]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, [rsp+gprsize+0x10+8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm6, [dstq+strideq*2-2]
+ movq xm9, [dstq+stride3q -2]
+ movq xm5, [top1q +2]
+ movq xm10, [top2q +2]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m6, [botq+strideq*0-2], 1
+ vinserti128 m9, [botq+strideq*1-2], 1
+ vinserti128 m5, [dstq+strideq*0+2], 1
+ vinserti128 m10, [dstq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
+%endif
+ ret
+.d1k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-2]
+ vpbroadcastq m9, [dstq+strideq*2-2]
+ movd xm5, [topq+strideq*1+2]
+ movd xm10, [dstq+strideq*0+2]
+ psrldq m11, m6, 4
+ psrldq m12, m9, 4
+ vpblendd m5, m11, 0x10
+ movq xm11, [leftq+2]
+ vinserti128 m6, [dstq+stride3q-2], 1
+ punpckldq xm11, xm11
+ vpblendd m10, m12, 0x10
+ pcmpeqd m12, m12
+ pmovzxwd m11, xm11
+ psrld m12, 16
+ punpckldq m6, m9
+ vpbroadcastd m9, [botq-2]
+ vpblendvb m6, m11, m12
+ punpckldq m5, m10
+ vpblendd m6, m9, 0x20
+ %else
+ movd xm5, [topq +strideq*1+2]
+ movq xm6, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q -2]
+ pinsrd xm5, [dstq +strideq*0+2], 1
+ movhps xm6, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq -2], 1
+ shufps xm5, xm6, q3110
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, q3131
+ shufps xm10, xm11, q1020
+ movu m11, [blend_4x8_2+4]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm9, 1
+ vpblendvb m6, [rsp+gprsize+0x10+4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm5, [top2q +2]
+ vbroadcasti128 m6, [dstq+strideq*1-2]
+ vbroadcasti128 m9, [dstq+strideq*2-2]
+ movhps xm5, [dstq+strideq*0+2]
+ shufps m10, m6, m9, q2121
+ vinserti128 m6, [dstq+stride3q -2], 1
+ vinserti128 m9, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m5, m10, 0xF0
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
+%endif
+ ret
+.d2k1:
+%if %1 == 4
+ %if %2 == 4
+ movq xm11, [leftq]
+ movq xm6, [dstq+strideq*0-2]
+ movq xm9, [dstq+strideq*1-2]
+ vinserti128 m6, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ punpckldq xm11, xm11
+ psrldq m5, m6, 4
+ psrldq m10, m9, 4
+ pmovzxwd m11, xm11
+ punpckldq m6, m9
+ punpckldq m5, m10
+ pblendw m6, m11, 0x05
+ %else
+ movq xm5, [dstq +strideq*0-2]
+ movq xm9, [dstq +strideq*2-2]
+ movq xm10, [dst4q+strideq*0-2]
+ movq xm11, [dst4q+strideq*2-2]
+ movhps xm5, [dstq +strideq*1-2]
+ movhps xm9, [dstq +stride3q -2]
+ movhps xm10, [dst4q+strideq*1-2]
+ movhps xm11, [dst4q+stride3q -2]
+ shufps xm6, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, xm11, q2020
+ shufps xm10, xm11, q3131
+ pmovzxwd m11, [leftq]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ pblendw m6, m11, 0x55
+ %endif
+%else
+ mova m11, [rsp+gprsize+0x20+hq*8+64]
+ movu xm5, [dstq+strideq*0-2]
+ movu xm9, [dstq+strideq*1-2]
+ vinserti128 m5, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ shufps m6, m5, m9, q1010
+ shufps m5, m9, q2121
+ pblendw m6, m11, 0x11
+%endif
+ ret
+.d3k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m11, [dstq+strideq*1-2]
+ vpbroadcastq m12, [dstq+strideq*2-2]
+ movd xm6, [topq+strideq*1-2]
+ movd xm9, [dstq+strideq*0-2]
+ pblendw m11, [leftq-16+2], 0x01
+ pblendw m12, [leftq-16+4], 0x01
+ pinsrw xm9, [leftq- 0+0], 0
+ psrldq m5, m11, 4
+ psrldq m10, m12, 4
+ vinserti128 m5, [dstq+stride3q +2], 1
+ vinserti128 m10, [botq +2], 1
+ vpblendd m6, m11, 0x10
+ vpblendd m9, m12, 0x10
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm6, [topq +strideq*1-2]
+ movq xm5, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q +2]
+ pinsrw xm6, [dstq +strideq*0 ], 3
+ movhps xm5, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [botq +2], 1
+ shufps xm6, xm5, q2010
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, q2020
+ shufps xm10, xm11, q1031
+ movu m11, [blend_4x8_2]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+8]
+ movq xm6, [top2q -2]
+ vbroadcasti128 m5, [dstq+strideq*1-2]
+ vbroadcasti128 m10, [dstq+strideq*2-2]
+ movhps xm6, [dstq+strideq*0-2]
+ punpcklqdq m9, m5, m10
+ vinserti128 m5, [dstq+stride3q -2], 1
+ vinserti128 m10, [botq -2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m6, m9, 0xF0
+ shufps m5, m10, q2121
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
+%endif
+ ret
+.d4k1:
+%if %1 == 4
+ %if %2 == 4
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ movd xm5, [dstq+strideq*2+2]
+ movd xm10, [dstq+stride3q +2]
+ pblendw m6, [leftq-16+0], 0x01
+ pblendw m9, [leftq-16+2], 0x01
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ vpblendd m6, [topq+strideq*0-2], 0x01
+ vpblendd m9, [topq+strideq*1-2], 0x01
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movd xm6, [topq +strideq*0-2]
+ movq xm5, [dstq +strideq*2-2]
+ movq xm9, [dst4q+strideq*0-2]
+ movd xm10, [dst4q+strideq*2+2]
+ pinsrd xm6, [topq +strideq*1-2], 1
+ movhps xm5, [dstq +stride3q -2]
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [dst4q+stride3q +2], 1
+ pinsrd xm6, [dstq +strideq*0-2], 2
+ pinsrd xm10, [botq +strideq*0+2], 2
+ pinsrd xm6, [dstq +strideq*1-2], 3
+ pinsrd xm10, [botq +strideq*1+2], 3
+ shufps xm11, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ movu m9, [blend_4x8_3]
+ vinserti128 m6, xm11, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+0x10-8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -2]
+ movq xm9, [top2q -2]
+ movq xm5, [dstq+strideq*2+2]
+ movq xm10, [dstq+stride3q +2]
+ vinserti128 m6, [dstq+strideq*0-2], 1
+ vinserti128 m9, [dstq+strideq*1-2], 1
+ vinserti128 m5, [botq+strideq*0+2], 1
+ vinserti128 m10, [botq+strideq*1+2], 1
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
+ punpcklqdq m5, m10
+%endif
+ ret
+.d5k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0-1]
+ movd xm9, [topq+strideq*1-1]
+ movd xm5, [dstq+strideq*2+1]
+ movd xm10, [dstq+stride3q +1]
+ pcmpeqd m12, m12
+ pmovzxbw m11, [leftq-8+1]
+ psrld m12, 24
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpckldq m6, m9
+ pxor m9, m9
+ vpblendd m12, m9, 0x0F
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movd xm6, [topq +strideq*0-1]
+ movq xm5, [dstq +strideq*2-1]
+ movq xm9, [dst4q+strideq*0-1]
+ movd xm10, [dst4q+strideq*2+1]
+ pinsrd xm6, [topq +strideq*1-1], 1
+ movhps xm5, [dstq +stride3q -1]
+ movhps xm9, [dst4q+strideq*1-1]
+ pinsrd xm10, [dst4q+stride3q +1], 1
+ pinsrd xm6, [dstq +strideq*0-1], 2
+ pinsrd xm10, [botq +strideq*0+1], 2
+ pinsrd xm6, [dstq +strideq*1-1], 3
+ pinsrd xm10, [botq +strideq*1+1], 3
+ shufps xm11, xm5, xm9, q2020
+ vinserti128 m6, xm11, 1
+ pmovzxbw m11, [leftq-3]
+ psrldq xm5, 2
+ psrldq xm9, 2
+ shufps xm5, xm9, q2020
+ movu m9, [blend_4x8_1]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m11, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -1]
+ movq xm9, [top2q -1]
+ movq xm5, [dstq+strideq*2+1]
+ movq xm10, [dstq+stride3q +1]
+ vinserti128 m6, [dstq+strideq*0-1], 1
+ vinserti128 m9, [dstq+strideq*1-1], 1
+ vinserti128 m5, [botq+strideq*0+1], 1
+ vinserti128 m10, [botq+strideq*1+1], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
+%endif
+ ret
+.d6k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*0]
+ movd xm9, [topq+strideq*1]
+ movd xm5, [dstq+strideq*2]
+ movd xm10, [dstq+stride3q ]
+ vinserti128 m6, [dstq+strideq*0], 1
+ vinserti128 m9, [dstq+strideq*1], 1
+ vinserti128 m5, [botq+strideq*0], 1
+ vinserti128 m10, [botq+strideq*1], 1
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm5, [dstq +strideq*2]
+ movd xm6, [topq +strideq*0]
+ movd xm9, [dst4q+strideq*2]
+ pinsrd xm5, [dstq +stride3q ], 1
+ pinsrd xm6, [topq +strideq*1], 1
+ pinsrd xm9, [dst4q+stride3q ], 1
+ pinsrd xm5, [dst4q+strideq*0], 2
+ pinsrd xm6, [dstq +strideq*0], 2
+ pinsrd xm9, [botq +strideq*0], 2
+ pinsrd xm5, [dst4q+strideq*1], 3
+ pinsrd xm6, [dstq +strideq*1], 3
+ pinsrd xm9, [botq +strideq*1], 3
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+ %endif
+%else
+ movq xm5, [dstq+strideq*2]
+ movq xm9, [botq+strideq*0]
+ movq xm6, [top1q ]
+ movq xm10, [dstq+strideq*0]
+ movhps xm5, [dstq+stride3q ]
+ movhps xm9, [botq+strideq*1]
+ movhps xm6, [top2q ]
+ movhps xm10, [dstq+strideq*1]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+%endif
+ ret
+.d7k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm5, [dstq+strideq*2-1]
+ movd xm9, [dstq+stride3q -1]
+ movd xm6, [topq+strideq*0+1]
+ movd xm10, [topq+strideq*1+1]
+ pinsrb xm5, [leftq+ 5], 0
+ pinsrb xm9, [leftq+ 7], 0
+ vinserti128 m6, [dstq+strideq*0+1], 1
+ vinserti128 m10, [dstq+strideq*1+1], 1
+ vinserti128 m5, [botq+strideq*0-1], 1
+ vinserti128 m9, [botq+strideq*1-1], 1
+ punpckldq m6, m10
+ punpckldq m5, m9
+ %else
+ movd xm6, [topq +strideq*0+1]
+ movq xm9, [dstq +strideq*2-1]
+ movq xm10, [dst4q+strideq*0-1]
+ movd xm11, [dst4q+strideq*2-1]
+ pinsrd xm6, [topq +strideq*1+1], 1
+ movhps xm9, [dstq +stride3q -1]
+ movhps xm10, [dst4q+strideq*1-1]
+ pinsrd xm11, [dst4q+stride3q -1], 1
+ pinsrd xm6, [dstq +strideq*0+1], 2
+ pinsrd xm11, [botq +strideq*0-1], 2
+ pinsrd xm6, [dstq +strideq*1+1], 3
+ pinsrd xm11, [botq +strideq*1-1], 3
+ shufps xm5, xm9, xm10, q2020
+ vinserti128 m5, xm11, 1
+ pmovzxbw m11, [leftq+5]
+ psrldq xm9, 2
+ psrldq xm10, 2
+ shufps xm9, xm10, q2020
+ movu m10, [blend_4x8_1+8]
+ vinserti128 m6, xm9, 1
+ vpblendvb m5, m11, m10
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [dstq+strideq*2-1]
+ movq xm9, [botq+strideq*0-1]
+ movq xm6, [top1q +1]
+ movq xm10, [dstq+strideq*0+1]
+ movhps xm5, [dstq+stride3q -1]
+ movhps xm9, [botq+strideq*1-1]
+ movhps xm6, [top2q +1]
+ movhps xm10, [dstq+strideq*1+1]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+ vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
+%endif
+ ret
+
+.border_block:
+ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
+%define rstk rsp
+%assign stack_offset stack_offset_entry
+%assign regs_used 11
+ ALLOC_STACK 2*16+(%2+4)*32, 16
+%define px rsp+2*16+2*32
+
+ pcmpeqw m14, m14
+ psllw m14, 15 ; 0x8000
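+ ; 0x8000 marks unavailable pixels in the padded px buffer: it is both a large
+ ; unsigned and a negative signed value, so the signed-max/unsigned-min
+ ; clipping in ACCUMULATE_TAP_WORD discards it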
+
+ ; prepare pixel buffers - body/right
+%if %1 == 4
+ INIT_XMM avx2
+%endif
+%if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+%endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ pmovzxbw m1, [dstq+strideq*0]
+ pmovzxbw m2, [dstq+strideq*1]
+ pmovzxbw m3, [dstq+strideq*2]
+ pmovzxbw m4, [dstq+stride3q]
+ mova [px+0*32], m1
+ mova [px+1*32], m2
+ mova [px+2*32], m3
+ mova [px+3*32], m4
+%if %2 == 8
+ pmovzxbw m1, [dst4q+strideq*0]
+ pmovzxbw m2, [dst4q+strideq*1]
+ pmovzxbw m3, [dst4q+strideq*2]
+ pmovzxbw m4, [dst4q+stride3q]
+ mova [px+4*32], m1
+ mova [px+5*32], m2
+ mova [px+6*32], m3
+ mova [px+7*32], m4
+%endif
+ jmp .body_done
+.no_right:
+%if %1 == 4
+ movd xm1, [dstq+strideq*0]
+ movd xm2, [dstq+strideq*1]
+ movd xm3, [dstq+strideq*2]
+ movd xm4, [dstq+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+0*32], xm1
+ movq [px+1*32], xm2
+ movq [px+2*32], xm3
+ movq [px+3*32], xm4
+%else
+ pmovzxbw xm1, [dstq+strideq*0]
+ pmovzxbw xm2, [dstq+strideq*1]
+ pmovzxbw xm3, [dstq+strideq*2]
+ pmovzxbw xm4, [dstq+stride3q]
+ mova [px+0*32], xm1
+ mova [px+1*32], xm2
+ mova [px+2*32], xm3
+ mova [px+3*32], xm4
+%endif
+ movd [px+0*32+%1*2], xm14
+ movd [px+1*32+%1*2], xm14
+ movd [px+2*32+%1*2], xm14
+ movd [px+3*32+%1*2], xm14
+%if %2 == 8
+ %if %1 == 4
+ movd xm1, [dst4q+strideq*0]
+ movd xm2, [dst4q+strideq*1]
+ movd xm3, [dst4q+strideq*2]
+ movd xm4, [dst4q+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+4*32], xm1
+ movq [px+5*32], xm2
+ movq [px+6*32], xm3
+ movq [px+7*32], xm4
+ %else
+ pmovzxbw xm1, [dst4q+strideq*0]
+ pmovzxbw xm2, [dst4q+strideq*1]
+ pmovzxbw xm3, [dst4q+strideq*2]
+ pmovzxbw xm4, [dst4q+stride3q]
+ mova [px+4*32], xm1
+ mova [px+5*32], xm2
+ mova [px+6*32], xm3
+ mova [px+7*32], xm4
+ %endif
+ movd [px+4*32+%1*2], xm14
+ movd [px+5*32+%1*2], xm14
+ movd [px+6*32+%1*2], xm14
+ movd [px+7*32+%1*2], xm14
+%endif
+.body_done:
+
+ ; top
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ pmovzxbw m1, [topq+strideq*0-(%1/2)]
+ pmovzxbw m2, [topq+strideq*1-(%1/2)]
+ movu [px-2*32-%1], m1
+ movu [px-1*32-%1], m2
+ jmp .top_done
+.top_no_right:
+ pmovzxbw m1, [topq+strideq*0-%1]
+ pmovzxbw m2, [topq+strideq*1-%1]
+ movu [px-2*32-%1*2], m1
+ movu [px-1*32-%1*2], m2
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ pmovzxbw m1, [topq+strideq*0]
+ pmovzxbw m2, [topq+strideq*1]
+ mova [px-2*32+0], m1
+ mova [px-1*32+0], m2
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ jmp .top_done
+.top_no_left_right:
+%if %1 == 4
+ movd xm1, [topq+strideq*0]
+ pinsrd xm1, [topq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px-2*32+0], xm1
+ movhps [px-1*32+0], xm1
+%else
+ pmovzxbw xm1, [topq+strideq*0]
+ pmovzxbw xm2, [topq+strideq*1]
+ mova [px-2*32+0], xm1
+ mova [px-1*32+0], xm2
+%endif
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.no_top:
+ movu [px-2*32-%1], m14
+ movu [px-1*32-%1], m14
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ pmovzxbw xm1, [leftq+ 0]
+%if %2 == 8
+ pmovzxbw xm2, [leftq+ 8]
+%endif
+ movd [px+0*32-4], xm1
+ pextrd [px+1*32-4], xm1, 1
+ pextrd [px+2*32-4], xm1, 2
+ pextrd [px+3*32-4], xm1, 3
+%if %2 == 8
+ movd [px+4*32-4], xm2
+ pextrd [px+5*32-4], xm2, 1
+ pextrd [px+6*32-4], xm2, 2
+ pextrd [px+7*32-4], xm2, 3
+%endif
+ jmp .left_done
+.no_left:
+ movd [px+0*32-4], xm14
+ movd [px+1*32-4], xm14
+ movd [px+2*32-4], xm14
+ movd [px+3*32-4], xm14
+%if %2 == 8
+ movd [px+4*32-4], xm14
+ movd [px+5*32-4], xm14
+ movd [px+6*32-4], xm14
+ movd [px+7*32-4], xm14
+%endif
+.left_done:
+
+ ; bottom
+ DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ pmovzxbw m1, [botq+strideq*0-(%1/2)]
+ pmovzxbw m2, [botq+strideq*1-(%1/2)]
+ movu [px+(%2+0)*32-%1], m1
+ movu [px+(%2+1)*32-%1], m2
+ jmp .bottom_done
+.bottom_no_right:
+ pmovzxbw m1, [botq+strideq*0-%1]
+ pmovzxbw m2, [botq+strideq*1-%1]
+ movu [px+(%2+0)*32-%1*2], m1
+ movu [px+(%2+1)*32-%1*2], m2
+%if %1 == 8
+ movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
+%endif
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ pmovzxbw m1, [botq+strideq*0]
+ pmovzxbw m2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], m1
+ mova [px+(%2+1)*32+0], m2
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ jmp .bottom_done
+.bottom_no_left_right:
+%if %1 == 4
+ movd xm1, [botq+strideq*0]
+ pinsrd xm1, [botq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px+(%2+0)*32+0], xm1
+ movhps [px+(%2+1)*32+0], xm1
+%else
+ pmovzxbw xm1, [botq+strideq*0]
+ pmovzxbw xm2, [botq+strideq*1]
+ mova [px+(%2+0)*32+0], xm1
+ mova [px+(%2+1)*32+0], xm2
+%endif
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.no_bottom:
+ movu [px+(%2+0)*32-%1], m14
+ movu [px+(%2+1)*32-%1], m14
+.bottom_done:
+
+ ; actual filter
+ INIT_YMM avx2
+ DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
+%undef edged
+ ; register to shuffle values into after packing
+ vbroadcasti128 m12, [shufb_lohi]
+
+ mov dampingd, r8m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ test prid, prid
+ jz .border_sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ test secdmpd, secdmpd
+ jz .border_pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ BORDER_PREP_REGS %1, %2
+%if %1*%2*2/mmsize > 1
+.border_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2, 1
+.border_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
+ dec kq
+ jge .border_k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ BORDER_ADJUST_PIXEL %1, m10, 1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_v_loop
+%endif
+ RET
+
+.border_pri_only:
+ DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m1, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_pri_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+ dec kq
+ jge .border_pri_k_loop
+ BORDER_ADJUST_PIXEL %1, m1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_pri_v_loop
+%endif
+ RET
+
+.border_sec_only:
+ DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m0, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_sec_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
+ dec kq
+ jge .border_sec_k_loop
+ BORDER_ADJUST_PIXEL %1, m0
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_sec_v_loop
+%endif
+ RET
+%endmacro
+
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+
+INIT_YMM avx2
+cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
+ lea stride3q, [strideq*3]
+ movq xm0, [srcq+strideq*0]
+ movq xm1, [srcq+strideq*1]
+ movq xm2, [srcq+strideq*2]
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+stride3q ]
+ vpbroadcastq m5, [srcq+strideq*2]
+ vpblendd m0, m4, 0xf0
+ vpblendd m1, m5, 0xf0
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m5, [srcq+strideq*0]
+ vpblendd m2, m4, 0xf0
+ vpblendd m3, m5, 0xf0
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+cglobal_label .main
+ vpbroadcastd m4, [pw_128]
+ PROLOGUE 3, 4, 15
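+ ; center the pixels around zero (px - 128) before computing the partial sums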
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+
+ ; shuffle registers to generate partial_sum_diag[0-1] together
+ vperm2i128 m7, m0, m0, 0x01
+ vperm2i128 m6, m1, m1, 0x01
+ vperm2i128 m5, m2, m2, 0x01
+ vperm2i128 m4, m3, m3, 0x01
+
+ ; start with partial_sum_hv[0-1]
+ paddw m8, m0, m1
+ paddw m9, m2, m3
+ phaddw m10, m0, m1
+ phaddw m11, m2, m3
+ paddw m8, m9
+ phaddw m10, m11
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ paddw xm8, xm9 ; partial_sum_hv[1]
+ phaddw xm10, xm11 ; partial_sum_hv[0]
+ vinserti128 m8, xm10, 1
+ vpbroadcastd m9, [div_table+44]
+ pmaddwd m8, m8
+ pmulld m8, m9 ; cost6[2a-d] | cost2[a-d]
+
+ ; create aggregates [lower half]:
+ ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
+ ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
+ ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
+ ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
+ ; and [upper half]:
+ ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
+ ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
+ ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
+ ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
+ ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero]
+ vbroadcasti128 m14, [shufw_6543210x]
+ vbroadcasti128 m13, [div_table+16]
+ vbroadcasti128 m12, [div_table+0]
+ paddw m9, m0 ; partial_sum_diag[0/1][0-7]
+ pshufb m10, m14
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ pmulld m11, m13
+ pmulld m9, m12
+ paddd m9, m11 ; cost0[a-d] | cost4[a-d]
+
+ ; merge horizontally and vertically for partial_sum_alt[0-3]
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; create aggregates [lower half]:
+ ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
+ ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
+ ; and [upper half]:
+ ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
+ ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
+ ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m4, m11, 2
+ psrldq m11, 14
+ pslldq m5, m12, 4
+ psrldq m12, 12
+ pslldq m6, m13, 6
+ psrldq m13, 10
+ paddw m4, m10
+ paddw m11, m12
+ vpbroadcastd m12, [div_table+44]
+ paddw m5, m6
+ paddw m11, m13 ; partial_sum_alt[3/2] right
+ vbroadcasti128 m13, [div_table+32]
+ paddw m4, m5 ; partial_sum_alt[3/2] left
+ pshuflw m5, m11, q3012
+ punpckhwd m6, m11, m4
+ punpcklwd m4, m5
+ pmaddwd m6, m6
+ pmaddwd m4, m4
+ pmulld m6, m12
+ pmulld m4, m13
+ paddd m4, m6 ; cost7[a-d] | cost5[a-d]
+
+ ; create aggregates [lower half]:
+ ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
+ ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
+ ; and [upper half]:
+ ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
+ ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
+ ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m5, m1, 2
+ psrldq m1, 14
+ pslldq m6, m2, 4
+ psrldq m2, 12
+ pslldq m7, m3, 6
+ psrldq m3, 10
+ paddw m5, m0
+ paddw m1, m2
+ paddw m6, m7
+ paddw m1, m3 ; partial_sum_alt[0/1] right
+ paddw m5, m6 ; partial_sum_alt[0/1] left
+ pshuflw m0, m1, q3012
+ punpckhwd m1, m5
+ punpcklwd m5, m0
+ pmaddwd m1, m1
+ pmaddwd m5, m5
+ pmulld m1, m12
+ pmulld m5, m13
+ paddd m5, m1 ; cost1[a-d] | cost3[a-d]
+
+ mova xm0, [pd_47130256+ 16]
+ mova m1, [pd_47130256]
+ phaddd m9, m8
+ phaddd m5, m4
+ phaddd m9, m5
+ vpermd m0, m9 ; cost[0-3]
+ vpermd m1, m9 ; cost[4-7] | cost[0-3]
+
+ ; now find the best cost
+ pmaxsd xm2, xm0, xm1
+ pshufd xm3, xm2, q1032
+ pmaxsd xm2, xm3
+ pshufd xm3, xm2, q2301
+ pmaxsd xm2, xm3 ; best cost
+
+ ; find the idx using minpos
+ ; make everything other than the best cost negative via subtraction
+ ; find the min of unsigned 16-bit ints to sort out the negative values
+ psubd xm4, xm1, xm2
+ psubd xm3, xm0, xm2
+ packssdw xm3, xm4
+ phminposuw xm3, xm3
+
+ ; convert idx to 32-bits
+ psrld xm3, 16
+ movd eax, xm3
+
+ ; get idx^4 complement
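+ ; i.e. the cost of the orthogonal direction: var = (best_cost - cost[dir ^ 4]) >> 10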
+ vpermd m3, m1
+ psubd xm2, xm3
+ psrld xm2, 10
+ movd [varq], xm2
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef_avx512.asm b/third_party/dav1d/src/x86/cdef_avx512.asm
new file mode 100644
index 0000000000..b4f9c008ca
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_avx512.asm
@@ -0,0 +1,860 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro DUP4 1-*
+ %rep %0
+ times 4 db %1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DIRS 16 ; cdef_directions[]
+ %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1
+ ; masking away unused bits allows us to use a single vpaddd {1to16}
+ ; instruction instead of having to do vpbroadcastd + paddb
+ db %13 & 0x3f, -%13 & 0x3f
+ %rotate 1
+ %endrep
+%endmacro
+
+SECTION_RODATA 64
+
+lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13
+ db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
+ db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
+lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13
+lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
+ db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
+ db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
+ db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
+pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7
+lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55
+ db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
+lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21
+ db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25
+ db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53
+ db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57
+end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
+ db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
+ db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
+ db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
+edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
+ dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
+ dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
+ dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
+ dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
+ dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
+ dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
+ dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
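+ ; edge_mask[have_left | have_right<<1 | have_top<<2 | have_bottom<<3] is a
+ ; 64-bit mask of the valid lut byte positions; vpshufbitqmb tests the permute
+ ; indices against it so out-of-bounds taps fall back to the centre pixel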
+px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
+cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15
+gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
+ dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
+ dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
+ dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
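+ ; bit matrices for (v)gf2p8affineqb implementing a logical right shift of
+ ; every byte by 0-6; used to emulate a per-byte variable shift, which has no
+ ; native SIMD instruction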
+pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
+sec_tap: db 32, 32, 16, 16
+pd_268435568: dd 268435568
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 4
+%else
+DECLARE_REG_TMP 8
+%endif
+
+; lut:
+; t0 t1 t2 t3 t4 t5 t6 t7
+; T0 T1 T2 T3 T4 T5 T6 T7
+; L0 L1 00 01 02 03 04 05
+; L2 L3 10 11 12 13 14 15
+; L4 L5 20 21 22 23 24 25
+; L6 L7 30 31 32 33 34 35
+; b0 b1 b2 b3 b4 b5 b6 b7
+; B0 B1 B2 B3 B4 B5 B6 B7
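+;
+; the block and its 2-pixel border are gathered into a single zmm (the lut), so
+; each filter tap is fetched with one vpermb whose indices are px_idx plus a
+; per-direction byte offset from cdef_dirs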
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r7-edge_mask
+ movq xmm0, [dstq+strideq*0]
+ movhps xmm0, [dstq+strideq*1]
+ lea r7, [edge_mask]
+ movq xmm1, [topq+strideq*0-2]
+ movhps xmm1, [topq+strideq*1-2]
+ mov r6d, edgem
+ vinserti32x4 ym0, ymm0, [leftq], 1
+ lea r2, [strideq*3]
+ vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1
+ mova m5, [base+lut_perm_4x4]
+ vinserti32x4 m0, [dstq+r2], 2
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 m1, [botq+strideq*0-4], 2
+ vinserti32x4 m0, [botq+strideq*1-4], 3
+.main:
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m3, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m5, m0, m1 ; lut
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m7, m7
+ lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m6, m3, m5 ; px
+ cmp r6d, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1
+%macro CDEF_FILTER_4x4_PRI 0
+ vpcmpub k1, m6, m1, 6 ; px > pN
+ psubb m2, m1, m6
+ lzcnt r6d, prid
+ vpsubb m2{k1}, m6, m1 ; abs(diff)
+ vpbroadcastb m4, prid
+ and prid, 1
+ vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
+ movifnidn secd, secm
+ vpbroadcastd m10, [base+pri_tap+priq*4]
+ vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap)
+ psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift)))
+ pminub m2, m4
+ vpdpbusd m0, m2, m10 ; sum
+%endmacro
+ CDEF_FILTER_4x4_PRI
+ test secd, secd
+ jz .end_no_clip
+ call .sec
+.end_clip:
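+ ; clamp the filtered pixel to [min, max] of px and all taps that were read;
+ ; this clipping variant is used when both pri and sec strengths are nonzero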
+ pminub m4, m6, m1
+ pmaxub m1, m6
+ pminub m5, m2, m3
+ pmaxub m2, m3
+ pminub m4, m5
+ pmaxub m2, m1
+ psrldq m1, m4, 2
+ psrldq m3, m2, 2
+ pminub m1, m4
+ vpcmpw k1, m0, m7, 1
+ vpshldd m6, m0, 8
+ pmaxub m2, m3
+ pslldq m3, m1, 1
+ psubw m7, m0
+ paddusw m0, m6 ; clip >0xff
+ vpsubusw m0{k1}, m6, m7 ; clip <0x00
+ pslldq m4, m2, 1
+ pminub m1, m3
+ pmaxub m2, m4
+ pmaxub m0, m1
+ pminub m0, m2
+ jmp .end
+.sec_only:
+ movifnidn secd, secm
+ call .sec
+.end_no_clip:
+ vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+.end:
+ mova xm1, [base+end_perm]
+ vpermb m0, m1, m0 ; output in bits 8-15 of each dword
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ RET
+.mask_edges_sec_only:
+ movifnidn secd, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ vpbroadcastq m8, [base+edge_mask+r6*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m8, m2 ; index in-range
+ mova m1, m6
+ vpermb m1{k1}, m2, m5
+ CDEF_FILTER_4x4_PRI
+ test secd, secd
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m8, m4
+ mova m2, m6
+ vpermb m2{k1}, m4, m5
+ vpshufbitqmb k1, m8, m9
+ mova m3, m6
+ vpermb m3{k1}, m9, m5
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
+ vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
+.sec_main:
+ vpbroadcastd m8, [base+sec_tap]
+ vpcmpub k1, m6, m2, 6
+ psubb m4, m2, m6
+ vpbroadcastb m12, secd
+ lzcnt secd, secd
+ vpsubb m4{k1}, m6, m2
+ vpcmpub k2, m6, m3, 6
+ vpbroadcastq m11, [r3+secq*8]
+ gf2p8affineqb m10, m4, m11, 0
+ psubb m5, m3, m6
+ mova m9, m8
+ vpsubb m8{k1}, m7, m8
+ psubusb m10, m12, m10
+ vpsubb m5{k2}, m6, m3
+ pminub m4, m10
+ vpdpbusd m0, m4, m8
+ gf2p8affineqb m11, m5, m11, 0
+ vpsubb m9{k2}, m7, m9
+ psubusb m12, m11
+ pminub m5, m12
+ vpdpbusd m0, m5, m9
+ ret
+
+DECLARE_REG_TMP 2, 7
+
+; lut top lut bottom
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55
+; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ vpbroadcastd ym21, strided
+ mov r6d, edgem
+ lea r8, [edge_mask]
+ movq xm1, [topq+strideq*0-2]
+ pmulld ym21, [base+pd_01234567]
+ kxnorb k1, k1, k1
+ movq xm2, [topq+strideq*1-2]
+ vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
+ mova m14, [base+lut_perm_4x8a]
+ movu m15, [base+lut_perm_4x8b]
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 ym1, [botq+strideq*0-2], 1
+ vinserti32x4 ym2, [botq+strideq*1-2], 1
+.main:
+ punpcklqdq ym1, ym2
+ vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m16, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m14, m0, m1 ; lut top
+ vpermi2b m15, m0, m1 ; lut bottom
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m20, m20
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m2, m16, m14 ; pxt
+ vpermb m3, m16, m15 ; pxb
+ mova m1, m0
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
+ vpermb m5, m6, m15 ; pNb
+%macro CDEF_FILTER_4x8_PRI 0
+ vpcmpub k1, m2, m4, 6 ; pxt > pNt
+ vpcmpub k2, m3, m5, 6 ; pxb > pNb
+ psubb m6, m4, m2
+ psubb m7, m5, m3
+ lzcnt r6d, prid
+ vpsubb m6{k1}, m2, m4 ; abs(diff_top)
+ vpsubb m7{k2}, m3, m5 ; abs(diff_bottom)
+ vpbroadcastb m13, prid
+ vpbroadcastq m9, [r3+r6*8]
+ and prid, 1
+ vpbroadcastd m11, [base+pri_tap+priq*4]
+ vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
+ vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
+ mova m10, m11
+ movifnidn t1d, secm
+ vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
+ vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
+ psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift)))
+ psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift)))
+ pminub m6, m12
+ pminub m7, m13
+ vpdpbusd m0, m6, m10 ; sum top
+ vpdpbusd m1, m7, m11 ; sum bottom
+%endmacro
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m10, m4, m2
+ pminub m12, m6, m8
+ pminub m11, m5, m3
+ pminub m13, m7, m9
+ pmaxub m4, m2
+ pmaxub m6, m8
+ pmaxub m5, m3
+ pmaxub m7, m9
+ pminub m10, m12
+ pminub m11, m13
+ pmaxub m4, m6
+ pmaxub m5, m7
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
+ kxnorb k2, k2, k2 ; hw lw
+ vpshrdd m12, m0, m1, 16 ; m1lw m0hw
+ vpshrdd m6, m10, m11, 16 ; m11lw m10hw
+ vpshrdd m8, m4, m5, 16 ; m5lw m4hw
+ vpblendmw m7{k1}, m10, m11 ; m11hw m10lw
+ vpblendmw m9{k1}, m4, m5 ; m5hw m4lw
+ vpblendmw m4{k1}, m0, m12 ; m1lw m0lw
+ vpblendmw m5{k1}, m12, m1 ; m1hw m0hw
+ vpshrdd m2, m3, 16
+ pminub m6, m7
+ pmaxub m8, m9
+ mova ym14, [base+end_perm]
+ vpcmpw k1, m4, m20, 1
+ vpshldw m2, m5, 8
+ pslldq m7, m6, 1
+ pslldq m9, m8, 1
+ psubw m5, m20, m4
+ paddusw m0, m4, m2 ; clip >0xff
+ pminub m6, m7
+ pmaxub m8, m9
+ psubusw m0{k1}, m2, m5 ; clip <0x00
+ pmaxub m0, m6
+ pminub m0, m8
+ vpermb m0, m14, m0
+ vpscatterdd [dstq+ym21]{k2}, ym0
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova ym4, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m3, m1, 8
+ paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m3
+ pslld m0, 16
+ vpshrdd m0, m1, 16
+ vpermb m0, m4, m0 ; output in bits 8-15 of each word
+ vpscatterdd [dstq+ym21]{k1}, ym0
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t1d, r6d
+ or r6d, 8 ; top 4x4 has bottom
+ or t1d, 4 ; bottom 4x4 has top
+ vpbroadcastq m17, [base+edge_mask+r6*8]
+ vpbroadcastq m18, [base+edge_mask+t1*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m17, m6 ; index in-range
+ vpshufbitqmb k2, m18, m6
+ mova m4, m2
+ mova m5, m3
+ vpermb m4{k1}, m6, m14
+ vpermb m5{k2}, m6, m15
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m17, m10
+ vpshufbitqmb k2, m18, m10
+ vpshufbitqmb k3, m17, m11
+ vpshufbitqmb k4, m18, m11
+ mova m6, m2
+ mova m7, m3
+ mova m8, m2
+ mova m9, m3
+ vpermb m6{k1}, m10, m14
+ vpermb m7{k2}, m10, m15
+ vpermb m8{k3}, m11, m14
+ vpermb m9{k4}, m11, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
+ vpermb m7, m8, m15 ; pNb
+ vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
+ vpermb m9, m9, m15 ; pNb
+.sec_main:
+ vpbroadcastb m18, t1d
+ lzcnt t1d, t1d
+ vpcmpub k1, m2, m6, 6
+ vpcmpub k2, m3, m7, 6
+ vpcmpub k3, m2, m8, 6
+ vpcmpub k4, m3, m9, 6
+ vpbroadcastq m17, [r3+t1*8]
+ psubb m10, m6, m2
+ psubb m11, m7, m3
+ psubb m12, m8, m2
+ psubb m13, m9, m3
+ vpsubb m10{k1}, m2, m6 ; abs(dt0)
+ vpsubb m11{k2}, m3, m7 ; abs(db0)
+ vpsubb m12{k3}, m2, m8 ; abs(dt1)
+ vpsubb m13{k4}, m3, m9 ; abs(db1)
+ vpbroadcastd m19, [base+sec_tap]
+ gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
+ gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
+ gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
+ gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
+ psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift)))
+ psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift)))
+ psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift)))
+ psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift)))
+ pminub m10, m14
+ pminub m11, m15
+ pminub m12, m16
+ pminub m13, m17
+ mova m14, m19
+ mova m15, m19
+ mova m16, m19
+ vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
+ vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
+ vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
+ vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
+ vpdpbusd m0, m10, m14
+ vpdpbusd m1, m11, m15
+ vpdpbusd m0, m12, m16
+ vpdpbusd m1, m13, m19
+ ret
+
+; lut tl lut tr
+; t0 t1 t2 t3 t4 t5 t6 t7 t4 t5 t6 t7 t8 t9 ta tb
+; T0 T1 T2 T3 T4 T5 T6 T7 T4 T5 T6 T7 T8 T9 Ta Tb
+; L0 L1 00 01 02 03 04 05 02 03 04 05 06 07 08 09
+; L2 L3 10 11 12 13 14 15 12 13 14 15 16 17 18 19
+; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29
+; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39
+; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49
+; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59
+; lut bl lut br
+; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29
+; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39
+; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49
+; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59
+; Lc Ld 60 61 62 63 64 65 62 63 64 65 66 67 68 69
+; Le Lf 70 71 72 73 74 75 72 73 74 75 76 77 78 79
+; b0 b1 b2 b3 b4 b5 b6 b7 b4 b5 b6 b7 b8 b9 ba bb
+; B0 B1 B2 B3 B4 B5 B6 B7 B4 B5 B6 B7 B8 B9 Ba Bb
+
+cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ movu xm16, [dstq+strideq*0]
+ pinsrd xm16, [leftq+4*0], 3
+ mov r6d, edgem
+ vinserti128 ym16, [dstq+strideq*1], 1
+ lea r10, [dstq+strideq*4]
+ movu xm17, [dstq+strideq*2]
+ vinserti32x4 m16, [topq+strideq*0-2], 2
+ lea r9, [strideq*3]
+ pinsrd xm17, [leftq+4*1], 3
+ vinserti32x4 m16, [topq+strideq*1-2], 3 ; 0 1 t T
+ lea r8, [edge_mask]
+ vinserti128 ym17, [dstq+r9 ], 1
+ vpbroadcastd ym18, [leftq+4*2]
+ vpblendd ym17, ym18, 0x80
+ movu xm18, [r10 +strideq*2]
+ vinserti32x4 m17, [r10 +strideq*0], 2
+ pinsrd xm18, [leftq+4*3], 3
+ vinserti32x4 m17, [r10 +strideq*1], 3 ; 2 3 4 5
+ vinserti128 ym18, [r10 +r9 ], 1
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 m18, [botq+strideq*0-2], 2
+ vinserti32x4 m18, [botq+strideq*1-2], 3 ; 6 7 b B
+.main:
+ mova m0, [base+lut_perm_8x8a]
+ movu m1, [base+lut_perm_8x8b]
+ mova m30, [base+px_idx]
+ vpermb m16, m0, m16
+ movifnidn prid, prim
+ vpermb m17, m1, m17
+ mov t0d, dirm
+ vpermb m18, m0, m18
+ mov r3d, dampingm
+ vshufi32x4 m12, m16, m17, q2020 ; lut tl
+ vshufi32x4 m13, m16, m17, q3131 ; lut tr
+ vshufi32x4 m14, m17, m18, q0220 ; lut bl
+ vshufi32x4 m15, m17, m18, q1331 ; lut br
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m31, m31
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m4, m30, m12 ; pxtl
+ mova m1, m0
+ vpermb m5, m30, m13 ; pxtr
+ mova m2, m0
+ vpermb m6, m30, m14 ; pxbl
+ mova m3, m0
+ vpermb m7, m30, m15 ; pxbr
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
+ vpermb m9, m11, m13 ; pNtr
+ vpermb m10, m11, m14 ; pNbl
+ vpermb m11, m11, m15 ; pNbr
+%macro CDEF_FILTER_8x8_PRI 0
+ vpcmpub k1, m4, m8, 6 ; pxtl > pNtl
+ vpcmpub k2, m5, m9, 6 ; pxtr > pNtr
+ vpcmpub k3, m6, m10, 6 ; pxbl > pNbl
+ vpcmpub k4, m7, m11, 6 ; pxbr > pNbr
+ psubb m16, m8, m4
+ psubb m17, m9, m5
+ psubb m18, m10, m6
+ psubb m19, m11, m7
+ lzcnt r6d, prid
+ vpsubb m16{k1}, m4, m8 ; abs(diff_tl)
+ vpsubb m17{k2}, m5, m9 ; abs(diff_tr)
+ vpsubb m18{k3}, m6, m10 ; abs(diff_bl)
+ vpsubb m19{k4}, m7, m11 ; abs(diff_br)
+ vpbroadcastq m28, [r3+r6*8]
+ vpbroadcastb m29, prid
+ and prid, 1
+ vpbroadcastd m27, [base+pri_tap+priq*4]
+ vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
+ vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
+ vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
+ vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
+ mova m24, m27
+ mova m25, m27
+ mova m26, m27
+ movifnidn t1d, secm
+ vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
+ vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
+ vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_bl)
+ vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_br)
+ psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift)))
+ psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift)))
+ psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift)))
+ psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift)))
+ pminub m16, m20
+ pminub m17, m21
+ pminub m18, m22
+ pminub m19, m23
+ vpdpbusd m0, m16, m24 ; sum tl
+ vpdpbusd m1, m17, m25 ; sum tr
+ vpdpbusd m2, m18, m26 ; sum bl
+ vpdpbusd m3, m19, m27 ; sum br
+%endmacro
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m20, m8, m4
+ pminub m24, m12, m16
+ pminub m21, m9, m5
+ pminub m25, m13, m17
+ pminub m22, m10, m6
+ pminub m26, m14, m18
+ pminub m23, m11, m7
+ pminub m27, m15, m19
+ pmaxub m8, m4
+ pmaxub m12, m16
+ pmaxub m9, m5
+ pmaxub m13, m17
+ pmaxub m10, m6
+ pmaxub m14, m18
+ pmaxub m11, m7
+ pmaxub m15, m19
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ pmaxub m8, m12
+ pmaxub m9, m13
+ pmaxub m10, m14
+ pmaxub m11, m15
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
+ vpshrdd m24, m0, m1, 16
+ vpshrdd m25, m2, m3, 16
+ vpshrdd m12, m20, m21, 16
+ vpshrdd m14, m22, m23, 16
+ vpshrdd m16, m8, m9, 16
+ vpshrdd m18, m10, m11, 16
+ vpblendmw m13{k1}, m20, m21
+ vpblendmw m15{k1}, m22, m23
+ vpblendmw m17{k1}, m8, m9
+ vpblendmw m19{k1}, m10, m11
+ vpblendmw m20{k1}, m0, m24
+ vpblendmw m21{k1}, m24, m1
+ vpblendmw m22{k1}, m2, m25
+ vpblendmw m23{k1}, m25, m3
+ vpshrdd m4, m5, 16
+ vpshrdd m6, m7, 16
+ pminub m12, m13
+ pminub m14, m15
+ pmaxub m16, m17
+ pmaxub m18, m19
+ mova m8, [base+end_perm_clip]
+ vpcmpw k2, m20, m31, 1
+ vpcmpw k3, m22, m31, 1
+ vpshldw m4, m21, 8
+ vpshldw m6, m23, 8
+ kunpckdq k1, k1, k1
+ kxnorb k4, k4, k4
+ vpshrdw m11, m12, m14, 8
+ vpshrdw m15, m16, m18, 8
+ vpblendmb m13{k1}, m12, m14
+ vpblendmb m17{k1}, m16, m18
+ psubw m21, m31, m20
+ psubw m23, m31, m22
+ paddusw m0, m20, m4 ; clip >0xff
+ paddusw m1, m22, m6
+ pminub m11, m13
+ pmaxub m15, m17
+ psubusw m0{k2}, m4, m21 ; clip <0x00
+ psubusw m1{k3}, m6, m23
+ psrlw m0, 8
+ vmovdqu8 m0{k1}, m1
+ pmaxub m0, m11
+ pminub m0, m15
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, m0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movq [r10 +strideq*0], xm2
+ movq [r10 +strideq*2], xm3
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+r9 ], xm1
+ movhps [r10 +strideq*1], xm2
+ movhps [r10 +r9 ], xm3
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova xm8, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m5, m1, 8
+ vpshldd m6, m2, 8
+ vpshldd m7, m3, 8
+ paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ vpermb m0, m8, m0
+ vpermb m1, m8, m1
+ vpermb m2, m8, m2
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm0
+ movq [r10 +strideq*0], xm5
+ movq [r10 +strideq*2], xm2
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r9 ], xm0
+ movhps [r10 +strideq*1], xm5
+ movhps [r10 +r9 ], xm2
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t0d, r6d
+ mov t1d, r6d
+ or t0d, 0xA ; top-left 4x4 has bottom and right
+ or t1d, 0x9 ; top-right 4x4 has bottom and left
+ vpbroadcastq m26, [base+edge_mask+t0*8]
+ vpbroadcastq m27, [base+edge_mask+t1*8]
+ mov t1d, r6d
+ or r6d, 0x6 ; bottom-left 4x4 has top and right
+ or t1d, 0x5 ; bottom-right 4x4 has top and left
+ vpbroadcastq m28, [base+edge_mask+r6*8]
+ vpbroadcastq m29, [base+edge_mask+t1*8]
+ mov t0d, dirm
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m26, m20 ; index in-range
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m8, m4
+ mova m9, m5
+ mova m10, m6
+ mova m11, m7
+ vpermb m8{k1}, m20, m12
+ vpermb m9{k2}, m20, m13
+ vpermb m10{k3}, m20, m14
+ vpermb m11{k4}, m20, m15
+ mova [rsp+0x00], m26
+ mova [rsp+0x40], m27
+ mova [rsp+0x80], m28
+ mova [rsp+0xC0], m29
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ mova m26, [rsp+0x00]
+ mova m27, [rsp+0x40]
+ mova m28, [rsp+0x80]
+ mova m29, [rsp+0xC0]
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m26, m20
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m16, m4
+ mova m17, m5
+ mova m18, m6
+ mova m19, m7
+ vpermb m16{k1}, m20, m12
+ vpermb m17{k2}, m20, m13
+ vpermb m18{k3}, m20, m14
+ vpermb m19{k4}, m20, m15
+ vpshufbitqmb k1, m26, m21
+ vpshufbitqmb k2, m27, m21
+ vpshufbitqmb k3, m28, m21
+ vpshufbitqmb k4, m29, m21
+ vpermb m12, m21, m12
+ vpermb m13, m21, m13
+ vpermb m14, m21, m14
+ vpermb m15, m21, m15
+ vpblendmb m12{k1}, m4, m12
+ vpblendmb m13{k2}, m5, m13
+ vpblendmb m14{k3}, m6, m14
+ vpblendmb m15{k4}, m7, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
+ vpermb m17, m20, m13 ; pNtr
+ vpermb m18, m20, m14 ; pNbl
+ vpermb m19, m20, m15 ; pNbr
+ vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
+ vpermb m13, m21, m13 ; pNtr
+ vpermb m14, m21, m14 ; pNbl
+ vpermb m15, m21, m15 ; pNbr
+.sec_main:
+%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
+ vpcmpub k1, m4, %1, 6
+ vpcmpub k2, m5, %2, 6
+ vpcmpub k3, m6, %3, 6
+ vpcmpub k4, m7, %4, 6
+ psubb m20, %1, m4
+ psubb m21, %2, m5
+ psubb m22, %3, m6
+ psubb m23, %4, m7
+%if %5
+ vpbroadcastb m28, t1d
+ lzcnt t1d, t1d
+ vpbroadcastq m29, [r3+t1*8]
+%endif
+ vpsubb m20{k1}, m4, %1
+ vpsubb m21{k2}, m5, %2
+ vpsubb m22{k3}, m6, %3
+ vpsubb m23{k4}, m7, %4
+ gf2p8affineqb m24, m20, m29, 0
+ gf2p8affineqb m25, m21, m29, 0
+ gf2p8affineqb m26, m22, m29, 0
+ gf2p8affineqb m27, m23, m29, 0
+%if %5
+ vpbroadcastd m30, [base+sec_tap]
+%endif
+ psubusb m24, m28, m24
+ psubusb m25, m28, m25
+ psubusb m26, m28, m26
+ psubusb m27, m28, m27
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ mova m24, m30
+ mova m25, m30
+ mova m26, m30
+ mova m27, m30
+ vpsubb m24{k1}, m31, m30
+ vpsubb m25{k2}, m31, m30
+ vpsubb m26{k3}, m31, m30
+ vpsubb m27{k4}, m31, m30
+ vpdpbusd m0, m20, m24
+ vpdpbusd m1, m21, m25
+ vpdpbusd m2, m22, m26
+ vpdpbusd m3, m23, m27
+%endmacro
+ CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
+ CDEF_FILTER_8x8_SEC m12, m13, m14, m15
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/cdef_sse.asm b/third_party/dav1d/src/x86/cdef_sse.asm
new file mode 100644
index 0000000000..1b353121f4
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef_sse.asm
@@ -0,0 +1,1357 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2019, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro DUP8 1-*
+ %rep %0
+ times 8 db %1
+ %rotate 1
+ %endrep
+%endmacro
+
+div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
+ dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
+ dw 168, 168, 140, 140, 120, 120, 105, 105
+ dw 420, 420, 210, 210, 140, 140, 105, 105
+ dw 105, 105, 105, 105, 105, 105, 105, 105
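+; (informally: each entry is ~840/k, k being the number of pixels summed along
+; the corresponding line, so squared partial sums over lines of different
+; lengths are normalized before the direction costs are compared)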
+const shufw_6543210x, \
+ db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_8: times 8 dw 8
+pw_128: times 8 dw 128
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+pw_0x7FFF: times 8 dw 0x7FFF
+pw_0x8000: times 8 dw 0x8000
+tap_table: ; masks for 8-bit shift emulation
+ DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
+ ; weights
+ DUP8 4, 2, 3, 3, 2, 1
+ ; taps indices
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
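+; (the mask row at the top of tap_table emulates a per-byte shift: bytes are
+; ANDed with the entry for shift s, which clears the low s bits, so a 16-bit
+; psrlw afterwards cannot leak bits from the neighbouring byte; e.g. for s=2,
+; (x & 0xFC) >> 2 == (uint8_t)x >> 2)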
+
+SECTION .text
+
+%macro movif32 2
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro PMOVZXBW 2-3 0 ; %3 = half
+ %if cpuflag(sse4) && %3 == 0
+ pmovzxbw %1, %2
+ %else
+ %if %3 == 1
+ movd %1, %2
+ %else
+ movq %1, %2
+ %endif
+ punpcklbw %1, m7
+ %endif
+%endmacro
+
+%macro PSHUFB_0 2
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ punpcklbw %1, %1
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+ %endif
+%endmacro
+
+%macro MOVDDUP 2
+%if cpuflag(ssse3)
+ movddup %1, %2
+%else
+ movq %1, %2
+ punpcklqdq %1, %1
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1+14*8] ; off1
+ %if %6 == 4
+ movq m5, [stkq+offq*2+32*0] ; p0
+ movhps m5, [stkq+offq*2+32*1]
+ %else
+ movu m5, [stkq+offq*2+32*0] ; p0
+ %endif
+ neg offq ; -off1
+ %if %6 == 4
+ movq m6, [stkq+offq*2+32*0] ; p1
+ movhps m6, [stkq+offq*2+32*1]
+ %else
+ movu m6, [stkq+offq*2+32*0] ; p1
+ %endif
+ %if %7
+ %if cpuflag(sse4)
+ ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
+ pmaxsw m7, m5
+ pminuw m8, m5
+ pmaxsw m7, m6
+ pminuw m8, m6
+ %else
+ pcmpeqw m3, m14, m5
+ pminsw m8, m5 ; min after p0
+ pandn m3, m5
+ pmaxsw m7, m3 ; max after p0
+ pcmpeqw m3, m14, m6
+ pminsw m8, m6 ; min after p1
+ pandn m3, m6
+ pmaxsw m7, m3 ; max after p1
+ %endif
+ %endif
+
+ ; accumulate sum[m13] over p0/p1
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+ packsswb m5, m6 ; convert pixel diff to 8-bit
+ %if cpuflag(ssse3)
+ pshufb m5, m13 ; group diffs p0 and p1 into pairs
+ pabsb m6, m5
+ psignb m3, %5, m5
+ %else
+ movlhps m6, m5
+ punpckhbw m6, m5
+ pxor m5, m5
+ pcmpgtb m5, m6
+ paddb m6, m5
+ pxor m6, m5
+ paddb m3, %5, m5
+ pxor m3, m5
+ %endif
+ pand m9, %3, m6 ; emulate 8-bit shift
+ psrlw m9, %2
+ psubusb m5, %4, m9
+ pminub m5, m6 ; constrain(diff_p)
+ %if cpuflag(ssse3)
+ pmaddubsw m5, m3 ; constrain(diff_p) * taps
+ %else
+ psrlw m9, m5, 8
+ psraw m6, m3, 8
+ psllw m5, 8
+ psllw m3, 8
+ pmullw m9, m6
+ pmulhw m5, m3
+ paddw m5, m9
+ %endif
+ paddw m0, m5
+%endmacro
+
+%macro LOAD_BODY 3 ; dst, src, block_width
+ %if %3 == 4
+ PMOVZXBW m0, [%2+strideq*0]
+ PMOVZXBW m1, [%2+strideq*1]
+ PMOVZXBW m2, [%2+strideq*2]
+ PMOVZXBW m3, [%2+stride3q]
+ mova [%1+32*0], m0
+ mova [%1+32*1], m1
+ mova [%1+32*2], m2
+ mova [%1+32*3], m3
+ %else
+ movu m0, [%2+strideq*0]
+ movu m1, [%2+strideq*1]
+ movu m2, [%2+strideq*2]
+ movu m3, [%2+stride3q]
+ punpcklbw m4, m0, m7
+ punpckhbw m0, m7
+ mova [%1+32*0+ 0], m4
+ mova [%1+32*0+16], m0
+ punpcklbw m4, m1, m7
+ punpckhbw m1, m7
+ mova [%1+32*1+ 0], m4
+ mova [%1+32*1+16], m1
+ punpcklbw m4, m2, m7
+ punpckhbw m2, m7
+ mova [%1+32*2+ 0], m4
+ mova [%1+32*2+16], m2
+ punpcklbw m4, m3, m7
+ punpckhbw m3, m7
+ mova [%1+32*3+ 0], m4
+ mova [%1+32*3+16], m3
+ %endif
+%endmacro
+
+%macro CDEF_FILTER_END 2 ; w, minmax
+ pxor m6, m6
+ pcmpgtw m6, m0
+ paddw m0, m6
+ %if cpuflag(ssse3)
+ pmulhrsw m0, m15
+ %else
+ paddw m0, m15
+ psraw m0, 4
+ %endif
+ paddw m4, m0
+ %if %2
+ pminsw m4, m7
+ pmaxsw m4, m8
+ %endif
+ packuswb m4, m4
+ %if %1 == 4
+ movd [dstq+strideq*0], m4
+ psrlq m4, 32
+ movd [dstq+strideq*1], m4
+ add stkq, 32*2
+ lea dstq, [dstq+strideq*2]
+ %else
+ movq [dstq], m4
+ add stkq, 32
+ add dstq, strideq
+ %endif
+%endmacro
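+; (the final rounding above computes dst = px + ((sum + 8 - (sum < 0)) >> 4);
+; on SSSE3 the +8 and >>4 come for free from pmulhrsw with 2048)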
+
+%macro CDEF_FILTER 2 ; w, h
+ %if ARCH_X86_64
+cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \
+ dst, stride, left, top, bot, pri, dst4, edge, \
+ stride3
+ %define px rsp+3*16+2*32
+ %define base 0
+ %else
+cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
+ dst, stride, left, edge, stride3
+ %define topq r2
+ %define botq r2
+ %define dst4q r2
+ LEA r5, tap_table
+ %define px esp+7*16+2*32
+ %define base r5-tap_table
+ %endif
+ mov edged, r9m
+ %if cpuflag(sse4)
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
+ %else
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
+ %endif
+ mova m6, OUT_OF_BOUNDS_MEM
+ pxor m7, m7
+
+ ; prepare pixel buffers - body/right
+ %if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+ %endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ LOAD_BODY px, dstq, %1
+ %if %2 == 8
+ LOAD_BODY px+4*32, dst4q, %1
+ %endif
+ jmp .body_done
+.no_right:
+ PMOVZXBW m0, [dstq+strideq*0], %1 == 4
+ PMOVZXBW m1, [dstq+strideq*1], %1 == 4
+ PMOVZXBW m2, [dstq+strideq*2], %1 == 4
+ PMOVZXBW m3, [dstq+stride3q ], %1 == 4
+ mova [px+32*0], m0
+ mova [px+32*1], m1
+ mova [px+32*2], m2
+ mova [px+32*3], m3
+ movd [px+32*0+%1*2], m6
+ movd [px+32*1+%1*2], m6
+ movd [px+32*2+%1*2], m6
+ movd [px+32*3+%1*2], m6
+ %if %2 == 8
+ PMOVZXBW m0, [dst4q+strideq*0], %1 == 4
+ PMOVZXBW m1, [dst4q+strideq*1], %1 == 4
+ PMOVZXBW m2, [dst4q+strideq*2], %1 == 4
+ PMOVZXBW m3, [dst4q+stride3q ], %1 == 4
+ mova [px+32*4], m0
+ mova [px+32*5], m1
+ mova [px+32*6], m2
+ mova [px+32*7], m3
+ movd [px+32*4+%1*2], m6
+ movd [px+32*5+%1*2], m6
+ movd [px+32*6+%1*2], m6
+ movd [px+32*7+%1*2], m6
+ %endif
+.body_done:
+
+ ; top
+ movifnidn topq, r3mp
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0-2]
+ PMOVZXBW m1, [topq+strideq*1-2]
+ %else
+ movu m0, [topq+strideq*0-4]
+ movu m1, [topq+strideq*1-4]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px-32*2+8], m2
+ movu [px-32*1+8], m3
+ %endif
+ movu [px-32*2-%1], m0
+ movu [px-32*1-%1], m1
+ jmp .top_done
+.top_no_right:
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0-%1]
+ PMOVZXBW m1, [topq+strideq*1-%1]
+ movu [px-32*2-8], m0
+ movu [px-32*1-8], m1
+ %else
+ movu m0, [topq+strideq*0-%1]
+ movu m1, [topq+strideq*1-%2]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px-32*2-16], m0
+ mova [px-32*2+ 0], m2
+ mova [px-32*1-16], m1
+ mova [px-32*1+ 0], m3
+ %endif
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ %if %1 == 4
+ PMOVZXBW m0, [topq+strideq*0]
+ PMOVZXBW m1, [topq+strideq*1]
+ %else
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movd [px-32*2+16], m2
+ movd [px-32*1+16], m3
+ %endif
+ movd [px-32*2- 4], m6
+ movd [px-32*1- 4], m6
+ mova [px-32*2+ 0], m0
+ mova [px-32*1+ 0], m1
+ jmp .top_done
+.top_no_left_right:
+ PMOVZXBW m0, [topq+strideq*0], %1 == 4
+ PMOVZXBW m1, [topq+strideq*1], %1 == 4
+ movd [px-32*2-4], m6
+ movd [px-32*1-4], m6
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ movd [px-32*2+%1*2], m6
+ movd [px-32*1+%1*2], m6
+ jmp .top_done
+.no_top:
+ movu [px-32*2- 4], m6
+ movu [px-32*1- 4], m6
+ %if %1 == 8
+ movq [px-32*2+12], m6
+ movq [px-32*1+12], m6
+ %endif
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ movifnidn leftq, leftmp
+ %if %2 == 4
+ movq m0, [leftq]
+ %else
+ movu m0, [leftq]
+ %endif
+ %if %2 == 4
+ punpcklbw m0, m7
+ %else
+ punpckhbw m1, m0, m7
+ punpcklbw m0, m7
+ movhlps m3, m1
+ movd [px+32*4-4], m1
+ movd [px+32*6-4], m3
+ psrlq m1, 32
+ psrlq m3, 32
+ movd [px+32*5-4], m1
+ movd [px+32*7-4], m3
+ %endif
+ movhlps m2, m0
+ movd [px+32*0-4], m0
+ movd [px+32*2-4], m2
+ psrlq m0, 32
+ psrlq m2, 32
+ movd [px+32*1-4], m0
+ movd [px+32*3-4], m2
+ jmp .left_done
+.no_left:
+ movd [px+32*0-4], m6
+ movd [px+32*1-4], m6
+ movd [px+32*2-4], m6
+ movd [px+32*3-4], m6
+ %if %2 == 8
+ movd [px+32*4-4], m6
+ movd [px+32*5-4], m6
+ movd [px+32*6-4], m6
+ movd [px+32*7-4], m6
+ %endif
+.left_done:
+
+ ; bottom
+ movifnidn botq, r4mp
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0-(%1/2)]
+ PMOVZXBW m1, [botq+strideq*1-(%1/2)]
+ %else
+ movu m0, [botq+strideq*0-4]
+ movu m1, [botq+strideq*1-4]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px+32*(%2+0)+8], m2
+ movu [px+32*(%2+1)+8], m3
+ %endif
+ movu [px+32*(%2+0)-%1], m0
+ movu [px+32*(%2+1)-%1], m1
+ jmp .bottom_done
+.bottom_no_right:
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0-4]
+ PMOVZXBW m1, [botq+strideq*1-4]
+ movu [px+32*(%2+0)-8], m0
+ movu [px+32*(%2+1)-8], m1
+ %else
+ movu m0, [botq+strideq*0-8]
+ movu m1, [botq+strideq*1-8]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)-16], m0
+ mova [px+32*(%2+0)+ 0], m2
+ mova [px+32*(%2+1)-16], m1
+ mova [px+32*(%2+1)+ 0], m3
+ movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
+ %endif
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ %if %1 == 4
+ PMOVZXBW m0, [botq+strideq*0]
+ PMOVZXBW m1, [botq+strideq*1]
+ %else
+ movu m0, [botq+strideq*0]
+ movu m1, [botq+strideq*1]
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)+16], m2
+ mova [px+32*(%2+1)+16], m3
+ %endif
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
+ jmp .bottom_done
+.bottom_no_left_right:
+ PMOVZXBW m0, [botq+strideq*0], %1 == 4
+ PMOVZXBW m1, [botq+strideq*1], %1 == 4
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)+%1*2], m6
+ movd [px+32*(%2+1)+%1*2], m6
+ movd [px+32*(%2+0)- 4], m6
+ movd [px+32*(%2+1)- 4], m6
+ jmp .bottom_done
+.no_bottom:
+ movu [px+32*(%2+0)- 4], m6
+ movu [px+32*(%2+1)- 4], m6
+ %if %1 == 8
+ movq [px+32*(%2+0)+12], m6
+ movq [px+32*(%2+1)+12], m6
+ %endif
+.bottom_done:
+
+ ; actual filter
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec
+ mova m13, [shufb_lohi]
+ %if cpuflag(ssse3)
+ mova m15, [pw_2048]
+ %else
+ mova m15, [pw_8]
+ %endif
+ mova m14, m6
+ %else
+ DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
+ %xdefine m8 m1
+ %xdefine m9 m2
+ %xdefine m10 m0
+ %xdefine m13 [base+shufb_lohi]
+ %xdefine m14 OUT_OF_BOUNDS_MEM
+ %if cpuflag(ssse3)
+ %xdefine m15 [base+pw_2048]
+ %else
+ %xdefine m15 [base+pw_8]
+ %endif
+ %endif
+ movifnidn prid, r5m
+ movifnidn secd, r6m
+ mov dampingd, r8m
+ movif32 [esp+0x3C], r1d
+ test prid, prid
+ jz .sec_only
+ movd m1, r5m
+ bsr pridmpd, prid
+ test secd, secd
+ jz .pri_only
+ movd m10, r6m
+ tzcnt secd, secd
+ and prid, 1
+ sub pridmpd, dampingd
+ sub secd, dampingd
+ xor dampingd, dampingd
+ add prid, prid
+ neg pridmpd
+ cmovs pridmpd, dampingd
+ neg secd
+ PSHUFB_0 m1, m7
+ PSHUFB_0 m10, m7
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec
+ lea tapq, [tap_table]
+ MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask
+ MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask
+ mov [rsp+0x00], pridmpq ; pri_shift
+ mov [rsp+0x10], secq ; sec_shift
+ DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off
+ %else
+ MOVDDUP m2, [tapq+pridmpq*8]
+ MOVDDUP m3, [tapq+secq*8]
+ mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw
+ mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP
+ mov [esp+0x00], pridmpd
+ mov [esp+0x30], secd
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %define offq dstq
+ %define kd strided
+ %define kq strideq
+ mova [esp+0x10], m2
+ mova [esp+0x40], m3
+ mova [esp+0x20], m1
+ mova [esp+0x50], m10
+ %endif
+ mov dird, r7m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8] ; pri_taps
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.v_loop:
+ movif32 [esp+0x38], dstd
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0] ; px
+ %endif
+ pxor m0, m0 ; sum
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+.k_loop:
+ MOVDDUP m2, [priq+kq*8]
+ %if ARCH_X86_64
+ ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
+ ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
+ %else
+ ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ %endif
+ dec kd
+ jge .k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 1
+ dec hd
+ jg .v_loop
+ RET
+
+.pri_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap
+ lea tapq, [tap_table]
+ %else
+ DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
+ %endif
+ and prid, 1
+ xor zerod, zerod
+ sub dampingd, pridmpd
+ cmovs dampingd, zerod
+ add prid, prid
+ PSHUFB_0 m1, m7
+ MOVDDUP m7, [tapq+dampingq*8]
+ mov [rsp+0x00], dampingq
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off
+ %else
+ mov [rsp+0x04], zerod
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %endif
+ mov dird, r7m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8]
+ mov hd, %1*%2/8
+ lea dirq, [tapq+dirq*2]
+.pri_v_loop:
+ movif32 [esp+0x38], dstd
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0]
+ %endif
+ pxor m0, m0
+.pri_k_loop:
+ MOVDDUP m2, [priq+kq*8]
+ ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .pri_k_loop
+ movif32 dstq, [esp+0x38]
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .pri_v_loop
+ RET
+
+.sec_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec
+%else
+ DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
+%endif
+ movd m1, r6m
+ tzcnt secd, secd
+ mov dird, r7m
+ xor zerod, zerod
+ sub dampingd, secd
+ cmovs dampingd, zerod
+ PSHUFB_0 m1, m7
+ %if ARCH_X86_64
+ lea tapq, [tap_table]
+ %else
+ mov [rsp+0x04], zerod
+ %endif
+ mov [rsp+0x00], dampingq
+ MOVDDUP m7, [tapq+dampingq*8]
+ lea dirq, [tapq+dirq*2]
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k
+ %else
+ DEFINE_ARGS dst, stride, off, stk, dir, tap, h
+ %endif
+ lea stkq, [px]
+ mov hd, %1*%2/8
+.sec_v_loop:
+ mov kd, 1
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0]
+ %endif
+ pxor m0, m0
+.sec_k_loop:
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
+ %if ARCH_X86_32
+ MOVDDUP m2, [tapq+12*8+kq*8]
+ %endif
+ ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .sec_k_loop
+ movif32 strideq, [esp+0x3C]
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .sec_v_loop
+ RET
+%endmacro
+
+%macro MULLD 2
+ %if cpuflag(sse4)
+ pmulld %1, %2
+ %else
+ %if ARCH_X86_32
+ %define m15 m1
+ %endif
+ pmulhuw m15, %1, %2
+ pmullw %1, %2
+ pslld m15, 16
+ paddd %1, m15
+ %endif
+%endmacro
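+; (pmulld is SSE4.1-only; the SSE2/SSSE3 fallback above emulates the 32-bit
+; multiply with pmullw/pmulhuw, which is why div_table_ssse3 stores every
+; 16-bit constant duplicated into both halves of each dword)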
+
+%macro CDEF_DIR 0
+ %if ARCH_X86_64
+cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var
+ lea r6, [strideq*3]
+ movq m1, [srcq+strideq*0]
+ movhps m1, [srcq+strideq*1]
+ movq m3, [srcq+strideq*2]
+ movhps m3, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+r6 ]
+
+ pxor m8, m8
+ psadbw m9, m1, m8
+ psadbw m2, m3, m8
+ psadbw m4, m5, m8
+ psadbw m6, m7, m8
+ packssdw m9, m2
+ packssdw m4, m6
+ packssdw m9, m4
+
+ punpcklbw m0, m1, m8
+ punpckhbw m1, m8
+ punpcklbw m2, m3, m8
+ punpckhbw m3, m8
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ punpcklbw m6, m7, m8
+ punpckhbw m7, m8
+cglobal_label .main
+ mova m8, [pw_128]
+ psubw m0, m8
+ psubw m1, m8
+ psubw m2, m8
+ psubw m3, m8
+ psubw m4, m8
+ psubw m5, m8
+ psubw m6, m8
+ psubw m7, m8
+ psllw m8, 3
+ psubw m9, m8 ; partial_sum_hv[0]
+
+ paddw m8, m0, m1
+ paddw m10, m2, m3
+ paddw m8, m4
+ paddw m10, m5
+ paddw m8, m6
+ paddw m10, m7
+ paddw m8, m10 ; partial_sum_hv[1]
+
+ pmaddwd m8, m8
+ pmaddwd m9, m9
+ phaddd m9, m8
+ SWAP m8, m9
+ MULLD m8, [div_table%+SUFFIX+48]
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m0
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14 ; partial_sum_diag[0] top/right half
+ paddw m9, m11 ; partial_sum_diag[0] top/left half
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13 ; partial_sum_diag[0][0-7]
+ paddw m10, m14 ; partial_sum_diag[0][8-14,zero]
+ pshufb m10, [shufw_6543210x]
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ MULLD m11, [div_table%+SUFFIX+16]
+ MULLD m9, [div_table%+SUFFIX+0]
+ paddd m9, m11 ; cost[0a-d]
+
+ pslldq m10, m0, 14
+ psrldq m11, m0, 2
+ pslldq m12, m1, 12
+ psrldq m13, m1, 4
+ pslldq m14, m2, 10
+ psrldq m15, m2, 6
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m3, 8
+ psrldq m13, m3, 8
+ pslldq m14, m4, 6
+ psrldq m15, m4, 10
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m5, 4
+ psrldq m13, m5, 12
+ pslldq m14, m6, 2
+ psrldq m15, m6, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15 ; partial_sum_diag[1][8-14,zero]
+ paddw m10, m7 ; partial_sum_diag[1][0-7]
+ pshufb m11, [shufw_6543210x]
+ punpckhwd m12, m10, m11
+ punpcklwd m10, m11
+ pmaddwd m12, m12
+ pmaddwd m10, m10
+ MULLD m12, [div_table%+SUFFIX+16]
+ MULLD m10, [div_table%+SUFFIX+0]
+ paddd m10, m12 ; cost[4a-d]
+ phaddd m9, m10 ; cost[0a/b,4a/b]
+
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
+ pslldq m4, m11, 2
+ psrldq m5, m11, 14
+ pslldq m6, m12, 4
+ psrldq m7, m12, 12
+ pslldq m14, m13, 6
+ psrldq m15, m13, 10
+ paddw m4, m10
+ paddw m5, m7
+ paddw m4, m6
+ paddw m5, m15 ; partial_sum_alt[3] right
+ paddw m4, m14 ; partial_sum_alt[3] left
+ pshuflw m6, m5, q3012
+ punpckhwd m5, m4
+ punpcklwd m4, m6
+ pmaddwd m5, m5
+ pmaddwd m4, m4
+ MULLD m5, [div_table%+SUFFIX+48]
+ MULLD m4, [div_table%+SUFFIX+32]
+ paddd m4, m5 ; cost[7a-d]
+
+ pslldq m5, m10, 6
+ psrldq m6, m10, 10
+ pslldq m7, m11, 4
+ psrldq m10, m11, 12
+ pslldq m11, m12, 2
+ psrldq m12, 14
+ paddw m5, m7
+ paddw m6, m10
+ paddw m5, m11
+ paddw m6, m12
+ paddw m5, m13
+ pshuflw m7, m6, q3012
+ punpckhwd m6, m5
+ punpcklwd m5, m7
+ pmaddwd m6, m6
+ pmaddwd m5, m5
+ MULLD m6, [div_table%+SUFFIX+48]
+ MULLD m5, [div_table%+SUFFIX+32]
+ paddd m5, m6 ; cost[5a-d]
+
+ pslldq m6, m1, 2
+ psrldq m7, m1, 14
+ pslldq m10, m2, 4
+ psrldq m11, m2, 12
+ pslldq m12, m3, 6
+ psrldq m13, m3, 10
+ paddw m6, m0
+ paddw m7, m11
+ paddw m6, m10
+ paddw m7, m13 ; partial_sum_alt[3] right
+ paddw m6, m12 ; partial_sum_alt[3] left
+ pshuflw m10, m7, q3012
+ punpckhwd m7, m6
+ punpcklwd m6, m10
+ pmaddwd m7, m7
+ pmaddwd m6, m6
+ MULLD m7, [div_table%+SUFFIX+48]
+ MULLD m6, [div_table%+SUFFIX+32]
+ paddd m6, m7 ; cost[1a-d]
+
+ pshufd m0, m0, q1032
+ pshufd m1, m1, q1032
+ pshufd m2, m2, q1032
+ pshufd m3, m3, q1032
+
+ pslldq m10, m0, 6
+ psrldq m11, m0, 10
+ pslldq m12, m1, 4
+ psrldq m13, m1, 12
+ pslldq m14, m2, 2
+ psrldq m2, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m2
+ paddw m10, m3
+ pshuflw m12, m11, q3012
+ punpckhwd m11, m10
+ punpcklwd m10, m12
+ pmaddwd m11, m11
+ pmaddwd m10, m10
+ MULLD m11, [div_table%+SUFFIX+48]
+ MULLD m10, [div_table%+SUFFIX+32]
+ paddd m10, m11 ; cost[3a-d]
+
+ phaddd m9, m8 ; cost[0,4,2,6]
+ phaddd m6, m10
+ phaddd m5, m4
+ phaddd m6, m5 ; cost[1,3,5,7]
+ pshufd m4, m9, q3120
+
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m9, m6
+ pshufd m0, m9, q1032
+ pmaxsd m0, m9
+ pshufd m1, m0, q2301
+ pmaxsd m0, m1 ; best cost
+ %else
+ pcmpgtd m0, m9, m6
+ pand m9, m0
+ pandn m0, m6
+ por m9, m0
+ pshufd m1, m9, q1032
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m9, m0
+ pshufd m1, m9, q2301
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m0, m9
+ %endif
+
+ ; get direction and variance
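+ ; (variance = (best_cost - cost[dir ^ 4]) >> 10, as in the C reference)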
+ punpckhdq m1, m4, m6
+ punpckldq m4, m6
+ psubd m2, m0, m1
+ psubd m3, m0, m4
+%if WIN64
+ WIN64_RESTORE_XMM
+ %define tmp rsp+stack_offset+8
+%else
+ %define tmp rsp-40
+%endif
+ mova [tmp+0x00], m2 ; emulate ymm in stack
+ mova [tmp+0x10], m3
+ pcmpeqd m1, m0 ; compute best cost mask
+ pcmpeqd m4, m0
+ packssdw m4, m1
+ pmovmskb eax, m4 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [varq], r1d
+ %else
+cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3
+%define base r2-shufw_6543210x
+ LEA r2, shufw_6543210x
+ pxor m0, m0
+ lea stride3q, [strideq*3]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ mova m1, [base+pw_128]
+ psadbw m2, m5, m0
+ psadbw m3, m7, m0
+ packssdw m2, m3
+ punpcklbw m4, m5, m0
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+ psubw m4, m1
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+
+ mova [esp+0x00], m4
+ mova [esp+0x10], m5
+ mova [esp+0x20], m6
+ mova [esp+0x50], m7
+
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0]
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ psadbw m3, m5, m0
+ psadbw m0, m7
+ packssdw m3, m0
+ pxor m0, m0
+ punpcklbw m4, m5, m0
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+cglobal_label .main
+ psubw m4, m1
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+ packssdw m2, m3
+ psllw m1, 3
+ psubw m2, m1 ; partial_sum_hv[0]
+ pmaddwd m2, m2
+
+ mova m3, [esp+0x50]
+ mova m0, [esp+0x00]
+ paddw m0, [esp+0x10]
+ paddw m1, m3, [esp+0x20]
+ paddw m0, m4
+ paddw m1, m5
+ paddw m0, m6
+ paddw m1, m7
+ paddw m0, m1 ; partial_sum_hv[1]
+ pmaddwd m0, m0
+
+ phaddd m2, m0
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ mova [esp+0x30], m2
+
+ mova m1, [esp+0x10]
+ pslldq m0, m1, 2
+ psrldq m1, 14
+ paddw m0, [esp+0x00]
+ pslldq m2, m3, 6
+ psrldq m3, 10
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20]
+ pslldq m2, m3, 4
+ psrldq m3, 12
+ paddw m0, m2 ; partial_sum_diag[0] top/left half
+ paddw m1, m3 ; partial_sum_diag[0] top/right half
+ pslldq m2, m4, 8
+ psrldq m3, m4, 8
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 10
+ psrldq m3, m5, 6
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 12
+ psrldq m3, m6, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m7, 14
+ psrldq m3, m7, 2
+ paddw m0, m2 ; partial_sum_diag[0][0-7]
+ paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
+ mova m3, [esp+0x50]
+ pshufb m1, [base+shufw_6543210x]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
+ paddd m0, m2 ; cost[0a-d]
+ mova [esp+0x40], m0
+
+ mova m1, [esp+0x00]
+ pslldq m0, m1, 14
+ psrldq m1, 2
+ paddw m0, m7
+ pslldq m2, m3, 8
+ psrldq m3, 8
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20]
+ pslldq m2, m3, 10
+ psrldq m3, 6
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x10]
+ pslldq m2, m3, 12
+ psrldq m3, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m4, 6
+ psrldq m3, m4, 10
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 4
+ psrldq m3, m5, 12
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 2
+ psrldq m3, m6, 14
+ paddw m0, m2 ; partial_sum_diag[1][0-7]
+ paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
+ mova m3, [esp+0x50]
+ pshufb m1, [base+shufw_6543210x]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
+ paddd m0, m2 ; cost[4a-d]
+ phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
+ phaddd m1, [esp+0x30] ; cost[0,4,2,6]
+ mova [esp+0x30], m1
+
+ phaddw m0, [esp+0x00], m4
+ phaddw m1, [esp+0x10], m5
+ paddw m4, m5
+ mova m2, [esp+0x20]
+ paddw m5, m2, m3
+ phaddw m2, m6
+ paddw m6, m7
+ phaddw m3, m7
+ mova m7, [esp+0x00]
+ paddw m7, [esp+0x10]
+ mova [esp+0x00], m0
+ mova [esp+0x10], m1
+ mova [esp+0x20], m2
+
+ pslldq m1, m4, 4
+ pslldq m2, m6, 6
+ pslldq m0, m5, 2
+ paddw m1, m2
+ paddw m0, m7
+ psrldq m2, m5, 14
+ paddw m0, m1 ; partial_sum_alt[3] left
+ psrldq m1, m4, 12
+ paddw m1, m2
+ psrldq m2, m6, 10
+ paddw m1, m2 ; partial_sum_alt[3] right
+ pshuflw m1, m1, q3012
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m2 ; cost[7a-d]
+ mova [esp+0x40], m0
+
+ pslldq m0, m7, 6
+ psrldq m7, 10
+ pslldq m1, m5, 4
+ psrldq m5, 12
+ pslldq m2, m4, 2
+ psrldq m4, 14
+ paddw m0, m6
+ paddw m7, m5
+ paddw m0, m1
+ paddw m7, m4
+ paddw m0, m2
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
+ pmaddwd m0, m0
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m7 ; cost[5a-d]
+ mova [esp+0x50], m0
+
+ mova m7, [esp+0x10]
+ mova m2, [esp+0x20]
+ pslldq m0, m7, 2
+ psrldq m7, 14
+ pslldq m4, m2, 4
+ psrldq m2, 12
+ pslldq m5, m3, 6
+ psrldq m6, m3, 10
+ paddw m0, [esp+0x00]
+ paddw m7, m2
+ paddw m4, m5
+ paddw m7, m6 ; partial_sum_alt[3] right
+ paddw m0, m4 ; partial_sum_alt[3] left
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
+ pmaddwd m0, m0
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
+ paddd m0, m7 ; cost[1a-d]
+ SWAP m0, m4
+
+ pshufd m0, [esp+0x00], q1032
+ pshufd m1, [esp+0x10], q1032
+ pshufd m2, [esp+0x20], q1032
+ pshufd m3, m3, q1032
+ mova [esp+0x00], m4
+
+ pslldq m4, m0, 6
+ psrldq m0, 10
+ pslldq m5, m1, 4
+ psrldq m1, 12
+ pslldq m6, m2, 2
+ psrldq m2, 14
+ paddw m4, m3
+ paddw m0, m1
+ paddw m5, m6
+ paddw m0, m2
+ paddw m4, m5
+ pshuflw m2, m0, q3012
+ punpckhwd m0, m4
+ punpcklwd m4, m2
+ pmaddwd m0, m0
+ pmaddwd m4, m4
+ MULLD m0, [base+div_table%+SUFFIX+48]
+ MULLD m4, [base+div_table%+SUFFIX+32]
+ paddd m4, m0 ; cost[3a-d]
+
+ mova m1, [esp+0x00]
+ mova m2, [esp+0x50]
+ mova m0, [esp+0x30] ; cost[0,4,2,6]
+ phaddd m1, m4
+ phaddd m2, [esp+0x40] ; cost[1,3,5,7]
+ phaddd m1, m2
+ pshufd m2, m0, q3120
+
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m0, m1
+ pshufd m3, m0, q1032
+ pmaxsd m3, m0
+ pshufd m0, m3, q2301
+ pmaxsd m0, m3
+ %else
+ pcmpgtd m3, m0, m1
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+ pshufd m4, m0, q1032
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ pshufd m4, m0, q2301
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ %endif
+
+ ; get direction and variance
+ mov vard, varm
+ punpckhdq m3, m2, m1
+ punpckldq m2, m1
+ psubd m1, m0, m3
+ psubd m4, m0, m2
+ mova [esp+0x00], m1 ; emulate ymm in stack
+ mova [esp+0x10], m4
+ pcmpeqd m3, m0 ; compute best cost mask
+ pcmpeqd m2, m0
+ packssdw m2, m3
+ pmovmskb eax, m2 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [vard], r1d
+ %endif
+
+ RET
+%endmacro
+
+INIT_XMM sse4
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+CDEF_DIR
+
+INIT_XMM ssse3
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+CDEF_DIR
+
+INIT_XMM sse2
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
diff --git a/third_party/dav1d/src/x86/cpu.c b/third_party/dav1d/src/x86/cpu.c
new file mode 100644
index 0000000000..764d8be8ef
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpu.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/x86/cpu.h"
+
+typedef struct {
+ uint32_t eax, ebx, edx, ecx;
+} CpuidRegisters;
+
+void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf);
+uint64_t dav1d_cpu_xgetbv(unsigned xcr);
+
+#define X(reg, mask) (((reg) & (mask)) == (mask))
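+/* e.g. X(r.edx, 0x06008000) requires CMOV (bit 15), SSE (bit 25) and SSE2
+ * (bit 26) to all be set, so one masked compare checks a whole feature group */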
+
+COLD unsigned dav1d_get_cpu_flags_x86(void) {
+ union {
+ CpuidRegisters r;
+ struct {
+ uint32_t max_leaf;
+ char vendor[12];
+ };
+ } cpu;
+ dav1d_cpu_cpuid(&cpu.r, 0, 0);
+ unsigned flags = 0;
+
+ if (cpu.max_leaf >= 1) {
+ CpuidRegisters r;
+ dav1d_cpu_cpuid(&r, 1, 0);
+ const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0);
+ const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff);
+
+ if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ {
+ flags |= DAV1D_X86_CPU_FLAG_SSE2;
+ if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ {
+ flags |= DAV1D_X86_CPU_FLAG_SSSE3;
+ if (X(r.ecx, 0x00080000)) /* SSE4.1 */
+ flags |= DAV1D_X86_CPU_FLAG_SSE41;
+ }
+ }
+#if ARCH_X86_64
+ /* We only support >128-bit SIMD on x86-64. */
+ if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ {
+ const uint64_t xcr0 = dav1d_cpu_xgetbv(0);
+ if (X(xcr0, 0x00000006)) /* XMM/YMM */ {
+ if (cpu.max_leaf >= 7) {
+ dav1d_cpu_cpuid(&r, 7, 0);
+ if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ {
+ flags |= DAV1D_X86_CPU_FLAG_AVX2;
+ if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ {
+ if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42))
+ flags |= DAV1D_X86_CPU_FLAG_AVX512ICL;
+ }
+ }
+ }
+ }
+ }
+#endif
+ if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) {
+ if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 ||
+ (family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60)))))
+ {
+ /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */
+ flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER;
+ }
+ }
+ }
+
+ return flags;
+}
diff --git a/third_party/dav1d/src/x86/cpu.h b/third_party/dav1d/src/x86/cpu.h
new file mode 100644
index 0000000000..8529c77c9b
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpu.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_CPU_H
+#define DAV1D_SRC_X86_CPU_H
+
+enum CpuFlags {
+ DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0,
+ DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1,
+ DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2,
+ DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3,
+ DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
+ * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
+ DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough
+ * to cause performance regressions. */
+};
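+/* Informal usage sketch: callers test individual bits of the combined flag
+ * set, e.g. (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2), as done by the
+ * per-module dsp init functions elsewhere in src/x86/. */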
+
+unsigned dav1d_get_cpu_flags_x86(void);
+
+#endif /* DAV1D_SRC_X86_CPU_H */
diff --git a/third_party/dav1d/src/x86/cpuid.asm b/third_party/dav1d/src/x86/cpuid.asm
new file mode 100644
index 0000000000..e1d9228660
--- /dev/null
+++ b/third_party/dav1d/src/x86/cpuid.asm
@@ -0,0 +1,55 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION .text
+
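+; cpuid clobbers ebx/rbx, which is callee-saved in the standard calling
+; conventions, so it is saved and restored around the instruction below.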
+cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
+ mov r4, regsmp
+ mov eax, leafm
+ mov ecx, subleafm
+%if ARCH_X86_64
+ mov r5, rbx
+%endif
+ cpuid
+ mov [r4+4*0], eax
+ mov [r4+4*1], ebx
+ mov [r4+4*2], edx
+ mov [r4+4*3], ecx
+%if ARCH_X86_64
+ mov rbx, r5
+%endif
+ RET
+
+cglobal cpu_xgetbv, 0, 0, 0, xcr
+ movifnidn ecx, xcrm
+ xgetbv
+%if ARCH_X86_64
+ shl rdx, 32
+ or rax, rdx
+%endif
+ RET
diff --git a/third_party/dav1d/src/x86/filmgrain.h b/third_party/dav1d/src/x86/filmgrain.h
new file mode 100644
index 0000000000..eeaa328d1e
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+
+#define decl_fg_fns(ext) \
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext))
+
+decl_fg_fns(ssse3);
+decl_fg_fns(avx2);
+decl_fg_fns(avx512icl);
+
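+/* Function pointers are assigned in increasing ISA order: each later block
+ * simply overwrites the slower implementation once its CPU flag is present. */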
+static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+#endif
+}
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
new file mode 100644
index 0000000000..a1d4c41f27
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm
@@ -0,0 +1,2248 @@
+; Copyright © 2021-2022, VideoLAN and dav1d authors
+; Copyright © 2021-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
+gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_27_17_17_27: dw 27, 17, 17, 27
+pw_23_22: dw 23, 22, 0, 32
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+gen_ar0_shift: times 4 db 128
+ times 4 db 64
+ times 4 db 32
+ times 4 db 16
+pd_16: dd 16
+pd_m65536: dd -65536
+pb_1: times 4 db 1
+grain_max: times 2 dw 511
+ times 2 dw 2047
+grain_min: times 2 dw -512
+ times 2 dw -2048
+fg_max: times 2 dw 1023
+ times 2 dw 4095
+ times 2 dw 960
+ times 2 dw 3840
+ times 2 dw 940
+ times 2 dw 3760
+fg_min: times 2 dw 0
+ times 2 dw 64
+ times 2 dw 256
+uv_offset_mul: dd 256
+ dd 1024
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16, 8
+round_vals: dw 32, 64, 128, 256, 512, 1024
+pb_8_9_0_1: db 8, 9, 0, 1
+
+%macro JMP_TABLE 1-*
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
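+; Each table entry is the offset of an .ar0-.ar3 label relative to the table
+; itself; the dispatch code reloads the table base and adds it back before
+; jumping (e.g. movsxd r5, [r4+r5*4] / add r5, r4 / jmp r5 below).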
+
+JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+INIT_YMM avx2
+cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
+%define base r4-generate_grain_y_16bpc_avx2_table
+ lea r4, [generate_grain_y_16bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ mov r3, -73*82*2
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ lea r7d, [bdmaxq+1]
+ movq xm4, [base+mul_bits]
+ shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc
+ movq xm5, [base+hmul_bits]
+ sub r6, r7
+ mova xm6, [base+pb_mask]
+ sub bufq, r3
+ vpbroadcastw xm7, [base+round+r6*2-2]
+ lea r6, [gaussian_sequence]
+ movsxd r5, [r4+r5*4]
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm3, xm2 ; 4 next output seeds
+ pshuflw xm0, xm3, q3333
+ psrlw xm3, 5
+ pand xm2, xm0, xm1
+ movq r7, xm3
+ psrlw xm3, xm2, 10
+ por xm2, xm3
+ pmullw xm2, xm4
+ pmulhuw xm0, xm5
+ movzx r8d, r7w
+ pshufb xm3, xm6, xm2
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm0, xm2
+ movd xm2, [r6+r8*2]
+ rorx r8, r7, 32
+ por xm3, xm0
+ shr r7d, 16
+ pinsrw xm2, [r6+r7*2], 1
+ pshuflw xm0, xm3, q3333
+ movzx r7d, r8w
+ psrlw xm3, 5
+ pinsrw xm2, [r6+r7*2], 2
+ shr r8d, 16
+ movq r7, xm3
+ pinsrw xm2, [r6+r8*2], 3
+ movzx r8d, r7w
+ pinsrw xm2, [r6+r8*2], 4
+ rorx r8, r7, 32
+ shr r7d, 16
+ pinsrw xm2, [r6+r7*2], 5
+ movzx r7d, r8w
+ pinsrw xm2, [r6+r7*2], 6
+ shr r8d, 16
+ pinsrw xm2, [r6+r8*2], 7
+ paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
+ pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
+ mova [bufq+r3], xm2
+ add r3, 8*2
+ jl .loop
+
+ ; auto-regression code
+ add r5, r4
+ jmp r5
+
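+; AR(1) below, roughly: grain[y][x] = gauss[y][x] + ((cf0*g[y-1][x-1] +
+; cf1*g[y-1][x] + cf2*g[y-1][x+1] + cf3*g[y][x-1] + rounding) >> shift),
+; clamped to the bit-depth grain range (an informal sketch of the recurrence).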
+.ar1:
+ DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_y]
+ DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
+ pinsrb xm4, [base+pb_1], 3
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu xm0, [bufq+xq*2-82*2-2] ; top/left
+ psrldq xm2, xm0, 2 ; top
+ psrldq xm1, xm0, 4 ; top/right
+ punpcklwd xm0, xm2
+ punpcklwd xm1, xm3
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11
+ vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
+ vpbroadcastw xm10, [base+round_vals-12+shiftq*2]
+ pxor m1, m1
+ punpcklwd xm10, xm1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1 ; cf5-11,0-4
+ vpermq m1, m0, q3333 ; cf4
+ vbroadcasti128 m11, [base+gen_shufA]
+ pshufd m6, m0, q0000 ; cf[5,6], cf[0-1]
+ vbroadcasti128 m12, [base+gen_shufB]
+ pshufd m7, m0, q1111 ; cf[7,8], cf[2-3]
+ punpckhwd xm1, xm0
+ pshufhw xm9, xm0, q2121
+ pshufd xm8, xm1, q0000 ; cf[4,9]
+ sar bdmaxd, 1
+ punpckhqdq xm9, xm9 ; cf[10,11]
+ movd xm4, bdmaxd ; max_grain
+ pcmpeqd xm5, xm5
+ sub bufq, 2*(82*73-(82*3+79))
+ pxor xm5, xm4 ; min_grain
+ DEFINE_ARGS buf, fg_data, h, x
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+.x_loop_ar2:
+ vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
+ pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ pmaddwd m0, m6
+ punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5]
+ pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, m7
+ pmaddwd xm2, xm8
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm10
+ paddd xm2, xm0
+ movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ paddd xm2, xm1
+ pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3]
+.x_loop_ar2_inner:
+ pmaddwd xm3, xm9, xm0
+ psrldq xm0, 2
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; skip packssdw because we only care about one value
+ paddd xm3, xm1
+ pminsd xm3, xm4
+ psrldq xm1, 4
+ pmaxsd xm3, xm5
+ pextrw [bufq+xq*2], xm3, 0
+ punpcklwd xm3, xm3
+ pblendw xm0, xm3, 0010b
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ sar bdmaxd, 1
+ movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6
+ movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
+ pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1
+ movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23
+ vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
+ vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
+ vpbroadcastw xm11, [base+round_vals+shiftq*2-12]
+ movd xm12, bdmaxd ; max_grain
+ punpcklbw m7, m7 ; sign-extension
+ punpcklbw m0, m0 ; sign-extension
+ punpcklbw xm1, xm1
+ REPX {psraw x, 8}, m7, m0, xm1
+ pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8]
+ pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10]
+ pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12]
+ pshufd xm7, xm7, q3333 ; cf[6,13]
+ pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18]
+ pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20]
+ paddw xm0, xm11, xm11
+ pcmpeqd xm13, xm13
+ pblendw xm10, xm1, xm0, 00001000b
+ pxor xm13, xm12 ; min_grain
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+.x_loop_ar3:
+ movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
+ movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
+ vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
+ palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5]
+ palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6]
+ punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ pmaddwd m0, m4
+ pmaddwd m2, m6
+ pmaddwd m3, m5
+ paddd m0, m2
+ movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
+ paddd m0, m3
+ psrldq m3, m2, 2
+ punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ paddd m0, m3
+ psrldq m3, m2, 4
+ psrldq m2, 6
+ vpblendd m2, m11, 0x0f ; rounding constant
+ punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
+ pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ vextracti128 xm2, m1, 1
+ punpcklwd xm1, xm2
+ pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
+ paddd m0, m3
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm1
+ movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmaddwd xm2, xm1, xm10
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; skip packssdw because we only care about one value
+ pminsd xm2, xm12
+ pmaxsd xm2, xm13
+ pextrw [bufq+xq*2], xm2, 0
+ pslldq xm2, 4
+ psrldq xm1, 2
+ pblendw xm1, xm2, 0100b
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
+INIT_XMM avx2
+cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
+%define base r8-generate_grain_uv_%1_16bpc_avx2_table
+ lea r8, [generate_grain_uv_%1_16bpc_avx2_table]
+ movifnidn bdmaxd, bdmaxm
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ lea r6d, [bdmaxq+1]
+ movq xm4, [base+mul_bits]
+ shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
+ movq xm5, [base+hmul_bits]
+ sub r5, r6
+ mova xm6, [base+pb_mask]
+ vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
+ vpbroadcastw xm7, [base+round+r5*2-2]
+ pxor xm0, xm2
+ lea r6, [gaussian_sequence]
+%if %2
+ mov r7d, 73-35*%3
+ add bufq, 44*2
+.loop_y:
+ mov r5, -44*2
+%else
+ mov r5, -82*73*2
+ sub bufq, r5
+%endif
+.loop_x:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm2, xm3 ; 4 next output seeds
+ pshuflw xm0, xm2, q3333
+ psrlw xm2, 5
+ movq r10, xm2
+ movzx r9d, r10w
+ movd xm2, [r6+r9*2]
+ rorx r9, r10, 32
+ shr r10d, 16
+ pinsrw xm2, [r6+r10*2], 1
+ movzx r10d, r9w
+ pinsrw xm2, [r6+r10*2], 2
+ shr r9d, 16
+ pinsrw xm2, [r6+r9*2], 3
+ paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
+ pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
+ movq [bufq+r5], xm2
+ add r5, 8
+ jl .loop_x
+%if %2
+ add bufq, 82*2
+ dec r7d
+ jg .loop_y
+%endif
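+ ; The loop above steps the 16-bit film grain LFSR four positions per
+ ; iteration (feedback bit = parity of bits 0, 1, 3 and 12, shifted in at
+ ; bit 15) and indexes gaussian_sequence[] with the top 11 bits of each new
+ ; state; the final pmulhrsw folds grain_scale_shift and the 10/12bpc
+ ; adjustment into a single rounded shift.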
+
+ ; auto-regression code
+ movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r6, [r8+r6*4]
+ add r6, r8
+ jmp r6
+
+INIT_YMM avx2
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ sar bdmaxd, 1
+ vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4]
+ movd xm6, bdmaxd
+ pcmpeqw m7, m7
+ pmaddubsw m4, m0 ; ar_coeff << (14 - shift)
+ vpbroadcastw m6, xm6 ; max_grain
+ pxor m7, m6 ; min_grain
+ DEFINE_ARGS buf, bufy, h, x
+%if %2
+ vpbroadcastw m5, [base+hmul_bits+2+%3*2]
+ sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
+%else
+ sub bufq, 2*(82*70-3)
+%endif
+ add bufyq, 2*(3+82*3)
+ mov hd, 70-35*%3
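+ ; Lag-0 chroma filtering: each chroma grain sample receives a single scaled
+ ; contribution from the collocated luma grain (averaged down to chroma
+ ; resolution when subsampled), roughly
+ ;   g_uv[y][x] = clamp(g_uv[y][x] + round2(cf * luma_avg, ar_coeff_shift))
+ ; using the pmulhrsw-friendly coefficient prepared above.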
+.y_loop_ar0:
+%if %2
+ ; first 32 pixels
+ movu xm0, [bufyq+16*0]
+ vinserti128 m0, [bufyq+16*2], 1
+ movu xm1, [bufyq+16*1]
+ vinserti128 m1, [bufyq+16*3], 1
+%if %3
+ movu xm2, [bufyq+82*2+16*0]
+ vinserti128 m2, [bufyq+82*2+16*2], 1
+ movu xm3, [bufyq+82*2+16*1]
+ vinserti128 m3, [bufyq+82*2+16*3], 1
+ paddw m0, m2
+ paddw m1, m3
+%endif
+ phaddw m0, m1
+ movu xm1, [bufyq+16*4]
+ vinserti128 m1, [bufyq+16*6], 1
+ movu xm2, [bufyq+16*5]
+ vinserti128 m2, [bufyq+16*7], 1
+%if %3
+ movu xm3, [bufyq+82*2+16*4]
+ vinserti128 m3, [bufyq+82*2+16*6], 1
+ paddw m1, m3
+ movu xm3, [bufyq+82*2+16*5]
+ vinserti128 m3, [bufyq+82*2+16*7], 1
+ paddw m2, m3
+%endif
+ phaddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+%else
+ xor xd, xd
+.x_loop_ar0:
+ movu m0, [bufyq+xq*2]
+ movu m1, [bufyq+xq*2+32]
+%endif
+ paddw m0, m0
+ paddw m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+%if %2
+ paddw m0, [bufq+ 0]
+ paddw m1, [bufq+32]
+%else
+ paddw m0, [bufq+xq*2+ 0]
+ paddw m1, [bufq+xq*2+32]
+%endif
+ pminsw m0, m6
+ pminsw m1, m6
+ pmaxsw m0, m7
+ pmaxsw m1, m7
+%if %2
+ movu [bufq+ 0], m0
+ movu [bufq+32], m1
+
+ ; last 6 pixels
+ movu xm0, [bufyq+32*4]
+ movu xm1, [bufyq+32*4+16]
+%if %3
+ paddw xm0, [bufyq+32*4+82*2]
+ paddw xm1, [bufyq+32*4+82*2+16]
+%endif
+ phaddw xm0, xm1
+ movu xm1, [bufq+32*2]
+ pmulhrsw xm0, xm5
+ paddw xm0, xm0
+ pmulhrsw xm0, xm4
+ paddw xm0, xm1
+ pminsw xm0, xm6
+ pmaxsw xm0, xm7
+ vpblendd xm0, xm1, 0x08
+ movu [bufq+32*2], xm0
+%else
+ movu [bufq+xq*2+ 0], m0
+ movu [bufq+xq*2+32], m1
+ add xd, 32
+ cmp xd, 64
+ jl .x_loop_ar0
+
+ ; last 12 pixels
+ movu m0, [bufyq+64*2]
+ movu m1, [bufq+64*2]
+ paddw m0, m0
+ pmulhrsw m0, m4
+ paddw m0, m1
+ pminsw m0, m6
+ pmaxsw m0, m7
+ vpblendd m0, m1, 0xc0
+ movu [bufq+64*2], m0
+%endif
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
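+ ; For lag >= 1 the chroma AR filters carry one extra coefficient weighting
+ ; the collocated luma grain (averaged to chroma resolution when
+ ; subsampled); the .ar1/.ar2/.ar3 paths below fold that luma term into the
+ ; same dot products as the spatial neighbours.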
+INIT_XMM avx2
+.ar1:
+ DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+ DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
+ vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
+ vpbroadcastd xm3, xm3
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu xm0, [bufq+xq*2-82*2-2] ; top/left
+%if %2
+ movu xm2, [bufyq+xq*4]
+%else
+ movq xm2, [bufyq+xq*2]
+%endif
+%if %2
+%if %3
+ phaddw xm2, [bufyq+xq*4+82*2]
+ punpckhqdq xm1, xm2, xm2
+ paddw xm2, xm1
+%else
+ phaddw xm2, xm2
+%endif
+ pmulhrsw xm2, xm6
+%endif
+ psrldq xm1, xm0, 4 ; top/right
+ punpcklwd xm1, xm2
+ psrldq xm2, xm0, 2 ; top
+ punpcklwd xm0, xm2
+ pmaddwd xm1, xm5
+ pmaddwd xm0, xm4
+ paddd xm1, xm3
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar1
+ RET
+
+INIT_YMM avx2
+.ar2:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign xmm_regs_used 13 + %2
+ %assign stack_size_padded 136
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+%if %2
+ movaps [rsp+16*7], xmm13
+%endif
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vbroadcasti128 m10, [base+gen_shufA]
+ sar bdmaxd, 1
+ vbroadcasti128 m11, [base+gen_shufB]
+ movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
+ pinsrb xm7, [base+pb_1], 5
+ pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
+ movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
+ pmovsxbw m7, xm7
+ movd xm8, bdmaxd ; max_grain
+ pshufd m4, m7, q0000
+ vpbroadcastw xm12, [base+round_vals-12+shiftq*2]
+ pshufd m5, m7, q1111
+ pcmpeqd xm9, xm9
+ pshufd m6, m7, q2222
+ pxor xm9, xm8 ; min_grain
+ pshufd xm7, xm7, q3333
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%if %2
+ vpbroadcastw xm13, [base+hmul_bits+2+%3*2]
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+.x_loop_ar2:
+ vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
+ pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ pmaddwd m0, m4
+ pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, m5
+ punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5]
+%if %2
+ movu xm3, [bufyq+xq*4]
+%if %3
+ paddw xm3, [bufyq+xq*4+82*2]
+%endif
+ phaddw xm3, xm3
+ pmulhrsw xm3, xm13
+%else
+ movq xm3, [bufyq+xq*2]
+%endif
+ punpcklwd xm3, xm12 ; luma, round interleaved
+ vpblendd m2, m3, 0x0f
+ pmaddwd m2, m6
+ paddd m1, m0
+ movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ paddd m2, m1
+ vextracti128 xm1, m2, 1
+ paddd xm2, xm1
+ pshufd xm1, xm0, q3321
+ pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword
+.x_loop_ar2_inner:
+ pmaddwd xm3, xm7, xm0
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; we do not need to packssdw since we only care about one value
+ paddd xm3, xm1
+ psrldq xm1, 4
+ pminsd xm3, xm8
+ pmaxsd xm3, xm9
+ pextrw [bufq+xq*2], xm3, 0
+ psrldq xm0, 2
+ pslldq xm3, 2
+ pblendw xm0, xm3, 00000010b
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign stack_offset 32
+ %assign xmm_regs_used 14 + %2
+ %assign stack_size_padded 152
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+ movaps [rsp+16*7], xmm13
+%if %2
+ movaps [rsp+16*8], xmm14
+%endif
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vpbroadcastw xm11, [base+round_vals-12+shiftq*2]
+ sar bdmaxd, 1
+ movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
+ movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
+ pmovsxbw m7, xm7
+%if %2
+ vpbroadcastw xm14, [base+hmul_bits+2+%3*2]
+%endif
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
+ pinsrb xm0, [base+pb_1], 3
+ pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
+ pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
+ pmovsxbw m0, xm0
+ movd xm12, bdmaxd ; max_grain
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pcmpeqd xm13, xm13
+ punpckhqdq xm10, xm0, xm0
+ pxor xm13, xm12 ; min_grain
+ pinsrw xm10, [base+round_vals-10+shiftq*2], 3
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+.x_loop_ar3:
+ movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
+ movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
+ vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
+ palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5]
+ palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6]
+ punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ pmaddwd m0, m4
+ pmaddwd m2, m6
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m0, m3
+ movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
+%if %2
+ movu xm3, [bufyq+xq*4]
+%if %3
+ paddw xm3, [bufyq+xq*4+82*2]
+%endif
+ phaddw xm3, xm3
+ pmulhrsw xm3, xm14
+%else
+ movq xm3, [bufyq+xq*2]
+%endif
+ punpcklwd m1, m3
+ pmaddwd m1, m7
+ paddd m0, m1
+ psrldq m1, m2, 4
+ psrldq m3, m2, 6
+ vpblendd m3, m11, 0x0f ; rounding constant
+ punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
+ pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6]
+ psrldq m3, m2, 2
+ punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ paddd m0, m1
+ movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+ paddd m0, m2
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmaddwd xm2, xm1, xm10
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ psrldq xm1, 2
+ ; no need to packssdw since we only care about one value
+ pminsd xm2, xm12
+ pmaxsd xm2, xm13
+ pextrw [bufq+xq*2], xm2, 0
+ pslldq xm2, 4
+ pblendw xm1, xm2, 00000100b
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, unused, sby, see
+%define base r11-grain_min
+ lea r11, [grain_min]
+ mov r6d, r9m ; bdmax
+ mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov sbyd, sbym
+ vpbroadcastd m8, r9m
+ shr r6d, 11 ; is_12bpc
+ vpbroadcastd m9, [base+grain_min+r6*4]
+ shlx r10d, r9d, r6d
+ vpbroadcastd m10, [base+grain_max+r6*4]
+ lea r9d, [r6+r9*4]
+ vpbroadcastw m11, [base+mul_bits+r7*2-12]
+ vpbroadcastd m12, [base+fg_min+r10*4]
+ vpbroadcastd m13, [base+fg_max+r9*4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m14, [base+pd_16]
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz .vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak
+
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
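+ ; dstq now holds dst-src, so the stores below can address [dstq+srcq] and
+ ; only srcq needs to be advanced per row.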
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
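+ ; The scaling[] lookups below gather a dword per pixel: one gather for the
+ ; even 16-bit lanes and one at offset -2 for the odd lanes, so that after
+ ; pblendw the wanted byte sits in the low byte of its word; pmaddubsw with
+ ; mul_bits then turns it into a pmulhrsw factor implementing
+ ; round2(scaling[src] * grain, scaling_shift). The mova m6,m9 / m9,m6 pair
+ ; preserves m9 (grain_min, whose set sign bits double as a full gather
+ ; mask), which vpgatherdd clears.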
+.loop_y:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m4, m2, 0x55
+ psrld m2, m1, 16
+ mova m9, m6
+ pand m2, m8
+ vpgatherdd m5, [scalingq+m2-2], m6
+ pblendw m5, m3, 0x55
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, [grain_lutq+offxyq*2]
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je .loop_x
+ movq xm7, [pw_27_17_17_27]
+ cmp dword r8m, 0 ; sby
+ jne .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
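+ ; The first two grain samples of each overlapped row are blended with the
+ ; previous column's grain: g0 = round2(27*left0+17*cur0, 5) and
+ ; g1 = round2(17*left1+27*cur1, 5), clamped to the grain range before the
+ ; scaling step is applied.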
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy
+
+ lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m4, m2, 0x55
+ psrld m2, m1, 16
+ mova m9, m6
+ pand m2, m8
+ vpgatherdd m5, [scalingq+m2-2], m6
+ pblendw m5, m3, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+ movd xm6, [grain_lutq+left_offxyq*2]
+ punpcklwd xm6, xm3
+ pmaddwd xm6, xm7
+ paddd xm6, xm14
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm9
+ pminsw xm6, xm10
+ vpblendd m3, m6, 0x01
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, m3
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp dword r8m, 0 ; sby
+ jne .loop_x_hv_overlap
+ jmp .loop_x_h_overlap
+
+.vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see, src_bak
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x_v_overlap:
+ vpbroadcastd m15, [pw_27_17_17_27]
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, unused, top_offxy
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, unused, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_v_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m9, m6
+ pand m4, m8
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq*2]
+ movu m5, [grain_lutq+top_offxyq*2]
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ pmaddwd m4, m15
+ pmaddwd m5, m15
+ movu m7, [grain_lutq+offxyq*2+32]
+ movu m6, [grain_lutq+top_offxyq*2+32]
+ paddd m4, m14
+ paddd m5, m14
+ psrad m4, 5
+ psrad m5, 5
+ packssdw m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ paddd m5, m14
+ paddd m6, m14
+ psrad m5, 5
+ psrad m6, 5
+ packssdw m5, m6
+ pmaxsw m4, m9
+ pmaxsw m5, m9
+ pminsw m4, m10
+ pminsw m5, m10
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m11
+ pmaddubsw m3, m11
+ paddw m2, m2
+ paddw m3, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hb
+ jz .end_y_v_overlap
+ vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+.end_y_v_overlap:
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+.loop_x_hv_overlap:
+ vpbroadcastd m15, [pw_27_17_17_27]
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+32]
+ lea left_offxyd, [offyq+32]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_hv_overlap:
+ ; scaling[src]
+ mova m0, [srcq+ 0]
+ mova m1, [srcq+32]
+ pand m4, m8, m0
+ psrld m3, m0, 16
+ mova m6, m9
+ vpgatherdd m2, [scalingq+m4-0], m9
+ pand m3, m8
+ mova m9, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pand m5, m8, m1
+ mova m6, m9
+ vpgatherdd m3, [scalingq+m5-0], m9
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m9, m6
+ pand m4, m8
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m7, [grain_lutq+offxyq*2]
+ movd xm6, [grain_lutq+left_offxyq*2]
+ movu m5, [grain_lutq+top_offxyq*2]
+ movd xm4, [grain_lutq+topleft_offxyq*2]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklwd xm6, xm7
+ punpcklwd xm4, xm5
+ punpcklqdq xm6, xm4
+ movddup xm4, [pw_27_17_17_27]
+ pmaddwd xm6, xm4
+ paddd xm6, xm14
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm9
+ pminsw xm6, xm10
+ pshuflw xm4, xm6, q1032
+ vpblendd m6, m7, 0xfe
+ vpblendd m4, m5, 0xfe
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhwd m5, m7
+ pmaddwd m5, m15
+ punpcklwd m4, m6
+ pmaddwd m4, m15
+ movu m7, [grain_lutq+offxyq*2+32]
+ movu m6, [grain_lutq+top_offxyq*2+32]
+ paddd m5, m14
+ paddd m4, m14
+ psrad m5, 5
+ psrad m4, 5
+ packssdw m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ paddd m5, m14
+ paddd m6, m14
+ psrad m5, 5
+ psrad m6, 5
+ packssdw m5, m6
+ pmaxsw m4, m9
+ pmaxsw m5, m9
+ pminsw m4, m10
+ pminsw m5, m10
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m11
+ pmaddubsw m3, m11
+ paddw m2, m2
+ paddw m3, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+32], m1
+
+ add srcq, strideq
+ add grain_lutq, 82*2
+ dec hb
+ jz .end_y_hv_overlap
+ vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_hv_overlap
+ movq xm7, [pw_27_17_17_27]
+ jmp .loop_y_h_overlap
+.end_y_hv_overlap:
+ add wq, 32
+ lea srcq, [src_bakq+wq*2]
+ jl .loop_x_hv_overlap
+.end:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r12-grain_min
+ lea r12, [grain_min]
+ mov r9d, r13m ; bdmax
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov r11d, is_idm
+ mov sbyd, sbym
+ vpbroadcastw m11, [base+mul_bits+r7*2-12]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ shr r9d, 11 ; is_12bpc
+ vpbroadcastd m8, [base+grain_min+r9*4]
+ shlx r10d, r6d, r9d
+ vpbroadcastd m9, [base+grain_max+r9*4]
+ vpbroadcastw m10, r13m
+ shlx r6d, r6d, r11d
+ vpbroadcastd m12, [base+fg_min+r10*4]
+ lea r6d, [r9+r6*2]
+ vpbroadcastd m13, [base+fg_max+r6*4]
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused, sby, see, overlap
+
+%if %1
+ mov r6d, r11m
+ vpbroadcastd m0, [base+pb_8_9_0_1]
+ vpbroadcastd m1, [base+uv_offset_mul+r9*4]
+ vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m0 ; { uv_luma_mult, uv_mult }
+ pmaddwd m15, m1
+%else
+%if %2
+ vpbroadcastq m15, [base+pw_23_22]
+%else
+ vpbroadcastq m15, [base+pw_27_17_17_27]
+%endif
+ vpbroadcastd m14, [base+pd_16]
+%endif
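+ ; When chroma scaling is not taken from luma (%1 == 1) the scaling[] index
+ ; is a blend of the two planes, roughly
+ ;   clip((uv_luma_mult*luma + uv_mult*chroma + prescaled uv_offset) >> 6)
+ ; using the {uv_luma_mult, uv_mult} pairs and offset prepared above;
+ ; otherwise the (sub)sampled luma value indexes scaling[] directly.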
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz %%vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused2, unused3, see, unused4, unused5, unused6, luma, lstride
+
+ mov lumaq, r9mp
+ mov lstrideq, r10mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r9mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, unused2, unused3, luma, lstride
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, unused2, unused3, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq+ 0]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10
+ pminuw m3, m10 ; clip_pixel()
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m4, [grain_lutq+offxyq*2]
+%if %2
+ pmulhrsw m5, [grain_lutq+offxyq*2+82*2]
+%else
+ pmulhrsw m5, [grain_lutq+offxyq*2+32]
+%endif
+
+ ; dst = clip_pixel(src, noise)
+%if %1
+ paddw m0, m4
+ paddw m1, m5
+%else
+ paddw m0, m4, [srcq]
+%if %2
+ paddw m1, m5, [srcq+strideq]
+%else
+ paddw m1, m5, [srcq+32]
+%endif
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+%else
+ dec hb
+%endif
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, luma, lstride
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m2, [grain_lutq+offxyq*2]
+%if %2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+%else
+ movu m3, [grain_lutq+offxyq*2+32]
+%endif
+ movd xm6, [grain_lutq+left_offxyq*2]
+%if %2
+ pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
+ punpckldq xm7, xm2, xm3 ; {cur0, cur1}
+ punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1}
+%else
+ punpcklwd xm6, xm2
+%endif
+%if %1
+%if %2
+ vpbroadcastq xm7, [pw_23_22]
+%else
+ movq xm7, [pw_27_17_17_27]
+%endif
+ pmaddwd xm6, xm7
+ vpbroadcastd xm7, [pd_16]
+ paddd xm6, xm7
+%else
+ pmaddwd xm6, xm15
+ paddd xm6, xm14
+%endif
+ psrad xm6, 5
+ packssdw xm6, xm6
+ pmaxsw xm6, xm8
+ pminsw xm6, xm9
+ vpblendd m2, m6, 0x01
+%if %2
+ pshuflw xm6, xm6, q1032
+ vpblendd m3, m6, 0x01
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+%if %1
+ paddw m0, m2
+ paddw m1, m3
+%else
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, r10mp
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+%else
+ dec hb
+%endif
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp dword r8m, 0 ; sby
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, unused1, unused2, unused3, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov lumaq, r9mp
+ mov lstrideq, r10mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r9mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x_v_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ lea r10, [pw_27_17_17_27]
+%endif
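+ ; Without chroma subsampling two rows get vertical overlap, so r10 starts
+ ; at the 27/17 weight pair and is advanced to the 17/27 pair for the
+ ; second overlapped row further down.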
+%%loop_y_v_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq*2]
+ movu m3, [grain_lutq+top_offxyq*2]
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6 ; { top, cur }
+%if %3
+ vpbroadcastd m0, [pw_23_22]
+%elif %2
+ vpbroadcastd m0, [pw_27_17_17_27]
+%else
+ vpbroadcastd m0, [r10]
+%endif
+ REPX {pmaddwd x, m0}, m2, m3
+%if %1
+ vpbroadcastd m1, [pd_16]
+ REPX {paddd x, m1}, m2, m3
+%else
+ REPX {paddd x, m14}, m2, m3
+%endif
+ REPX {psrad x, 5}, m2, m3
+ packssdw m2, m3
+%if %2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+%else
+ movu m3, [grain_lutq+offxyq*2+32]
+%endif
+%if %3
+ pmaxsw m2, m8
+ pminsw m2, m9
+%else
+%if %2
+ movu m7, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m6, m3, m7 ; { cur, top }
+ punpcklwd m3, m7
+%else
+ movu m7, [grain_lutq+top_offxyq*2+32]
+ punpckhwd m6, m7, m3
+ punpcklwd m3, m7, m3 ; { top, cur }
+%endif
+ pmaddwd m6, m0
+ pmaddwd m3, m0
+%if %1
+ paddd m6, m1
+ paddd m3, m1
+%else
+ paddd m6, m14
+ paddd m3, m14
+%endif
+ psrad m6, 5
+ psrad m3, 5
+ packssdw m3, m6
+ pmaxsw m2, m8
+ pmaxsw m3, m8
+ pminsw m2, m9
+ pminsw m3, m9
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ sub hb, 2
+%else
+ mova [dstq+32], m1
+ dec hb
+%endif
+ jle %%end_y_v_overlap
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ jmp %%loop_y
+%else
+ add hd, 0x80000000
+ jc %%loop_y
+ add r10, 4
+ jmp %%loop_y_v_overlap
+%endif
+%%end_y_v_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+%%loop_x_hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+%if %2 == 0
+ lea r14, [pw_27_17_17_27]
+%endif
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%%loop_y_hv_overlap:
+ ; luma_src
+%if %2
+ mova xm2, [lumaq+lstrideq*0+ 0]
+ vinserti128 m2, [lumaq+lstrideq*0+32], 1
+ mova xm4, [lumaq+lstrideq*0+16]
+ vinserti128 m4, [lumaq+lstrideq*0+48], 1
+ mova xm3, [lumaq+lstrideq*(1<<%3)+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1
+ mova xm5, [lumaq+lstrideq*(1<<%3)+16]
+ vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1
+ phaddw m2, m4
+ phaddw m3, m5
+ pxor m4, m4
+ pavgw m2, m4
+ pavgw m3, m4
+%elif %1
+ mova m2, [lumaq]
+ mova m3, [lumaq+32]
+%endif
+%if %1
+ mova m0, [srcq]
+%if %2
+ mova m1, [srcq+strideq]
+%else
+ mova m1, [srcq+32]
+%endif
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m4, m2, m5, m3
+ REPX {paddd x, m15}, m4, m2, m5, m3
+ REPX {psrad x, 6 }, m4, m2, m5, m3
+ packusdw m2, m4
+ packusdw m3, m5
+ pminuw m2, m10 ; clip_pixel()
+ pminuw m3, m10
+%elif %2
+ pand m2, m10
+ pand m3, m10
+%else
+ pand m2, m10, [lumaq+ 0]
+ pand m3, m10, [lumaq+32]
+%endif
+
+ ; scaling[luma_src]
+ vpbroadcastd m7, [pd_m65536]
+ pandn m4, m7, m2
+ mova m6, m7
+ vpgatherdd m5, [scalingq+m4-0], m7
+ psrld m2, 16
+ mova m7, m6
+ vpgatherdd m4, [scalingq+m2-2], m6
+ pblendw m4, m5, 0x55
+ pandn m5, m7, m3
+ mova m6, m7
+ vpgatherdd m2, [scalingq+m5-0], m7
+ psrld m3, 16
+ vpgatherdd m5, [scalingq+m3-2], m6
+ pblendw m5, m2, 0x55
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m0, [grain_lutq+offxyq*2]
+ movd xm2, [grain_lutq+left_offxyq*2]
+ movu m6, [grain_lutq+top_offxyq*2]
+%if %2
+ pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2
+ movu m3, [grain_lutq+offxyq*2+82*2]
+ punpckldq xm1, xm0, xm3 ; { cur0, cur1 }
+%if %3
+ vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
+ vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 }
+%else
+ vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1
+ vpbroadcastd m7, [grain_lutq+topleft_offxyq*2]
+ vpblendd m2, m7, 0x20
+ movd xm7, [grain_lutq+top_offxyq*2+82*2]
+ punpckldq xm7, xm6
+ vinserti128 m1, xm7, 1
+ movu m7, [grain_lutq+top_offxyq*2+82*2]
+%endif
+ punpcklwd m2, m1 ; { cur, left }
+%if %1
+ vpbroadcastq m1, [pw_23_22]
+ pmaddwd m2, m1
+ vpbroadcastd m1, [pd_16]
+ paddd m2, m1
+ psrad m2, 5
+ packssdw m2, m2
+ vpermq m2, m2, q3120
+%else
+ pmaddwd m2, m15
+ paddd m2, m14
+ psrad m2, 5
+ vextracti128 xm1, m2, 1
+ packssdw xm2, xm1
+%endif
+%else
+ pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1
+ movu m3, [grain_lutq+offxyq*2+32]
+ movu m7, [grain_lutq+top_offxyq*2+32]
+ punpckldq xm1, xm0, xm6
+ punpcklwd xm2, xm1 ; { cur, left }
+%if %1
+ movddup xm1, [pw_27_17_17_27]
+ pmaddwd xm2, xm1
+ vpbroadcastd m1, [pd_16]
+ paddd xm2, xm1
+%else
+ pmaddwd xm2, xm15
+ paddd xm2, xm14
+%endif
+ psrad xm2, 5
+ packssdw xm2, xm2
+%endif
+ pmaxsw xm2, xm8
+ pminsw xm2, xm9
+ vpblendd m0, m2, 0x01
+%if %2
+ pshufd xm2, xm2, q0321
+ vpblendd m3, m2, 0x01
+%if %3 == 0
+ pshufd xm2, xm2, q0321
+ vpblendd m7, m2, 0x01
+%endif
+%endif
+ pshuflw xm2, xm2, q1032
+ vpblendd m2, m6, 0xfe
+ punpckhwd m6, m0 ; { top, cur }
+ punpcklwd m2, m0
+%if %3
+ vpbroadcastd m0, [pw_23_22]
+%elif %2
+ vpbroadcastd m0, [pw_27_17_17_27]
+%else
+ vpbroadcastd m0, [r14]
+%endif
+ pmaddwd m6, m0
+ pmaddwd m2, m0
+%if %1
+ paddd m6, m1
+ paddd m2, m1
+%else
+ paddd m6, m14
+ paddd m2, m14
+%endif
+ psrad m6, 5
+ psrad m2, 5
+ packssdw m2, m6
+
+%if %3
+ pmaxsw m2, m8
+ pminsw m2, m9
+%else
+%if %2
+ punpckhwd m6, m3, m7
+ punpcklwd m3, m7 ; { cur, top }
+%else
+ punpckhwd m6, m7, m3
+ punpcklwd m3, m7, m3 ; { top, cur }
+%endif
+ REPX {pmaddwd x, m0}, m6, m3
+%if %1
+ REPX {paddd x, m1}, m6, m3
+%else
+ REPX {paddd x, m14}, m6, m3
+%endif
+ REPX {psrad x, 5}, m6, m3
+ packssdw m3, m6
+ pmaxsw m2, m8
+ pmaxsw m3, m8
+ pminsw m2, m9
+ pminsw m3, m9
+%endif
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m4, m11
+ pmaddubsw m5, m11
+ paddw m4, m4
+ paddw m5, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2, [srcq]
+%if %2
+ paddw m1, m3, [srcq+strideq]
+%else
+ paddw m1, m3, [srcq+32]
+%endif
+ pmaxsw m0, m12
+ pmaxsw m1, m12
+ pminsw m0, m13
+ pminsw m1, m13
+ mova [dstq], m0
+%if %2
+ mova [dstq+strideq], m1
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ mova [dstq+32], m1
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, r10mp
+%endif
+ add grain_lutq, 82*(2<<%2)
+%if %2
+ sub hb, 2
+ jg %%loop_y_h_overlap
+%else
+ dec hb
+ jle %%end_y_hv_overlap
+ add hd, 0x80000000
+ jc %%loop_y_h_overlap
+ add r14, 4
+ jmp %%loop_y_hv_overlap
+%endif
+%%end_y_hv_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ jmp %%loop_x_hv_overlap
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+GEN_GRAIN_UV_FN 420, 1, 1
+FGUV_FN 420, 1, 1
+GEN_GRAIN_UV_FN 422, 1, 0
+FGUV_FN 422, 1, 0
+GEN_GRAIN_UV_FN 444, 0, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx512.asm b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
new file mode 100644
index 0000000000..00dd6af599
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
@@ -0,0 +1,932 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
+ db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
+scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1
+scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4
+pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27
+pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32
+fg_min: times 2 dw 0
+ times 2 dw 64
+ times 2 dw 256
+fg_max: times 2 dw 1023
+ times 2 dw 4095
+ times 2 dw 960
+ times 2 dw 3840
+ times 2 dw 940
+ times 2 dw 3760
+scale_rnd: dd 64
+ dd 16
+uv_offset_mul: dd 256
+ dd 1024
+pb_8_9_0_1: db 8, 9, 0, 1
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, offx, sby, see, offy, src_bak
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, r9m ; bdmax
+ mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov sbyd, sbym
+ vpbroadcastd m6, r9m
+ shr r6d, 11 ; is_12bpc
+ vbroadcasti32x4 m7, [base+scale_mask]
+ shlx r10d, r9d, r6d
+ vpbroadcastd m10, [base+scale_shift+r7*4-32]
+ lea r9d, [r6+r9*4]
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m9, [base+fg_max+r9*4]
+ mov r12, 0xeeeeeeeeeeeeeeee
+ vpbroadcastd m19, [base+scale_rnd+r6*4]
+ kshiftrb k2, k1, 4 ; 0xf
+ vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8]
+ kmovq k3, r12
+ vpbroadcastd m11, [base+scale_shift+r6*8+4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0]
+ vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4]
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ call .add_noise
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy
+
+ lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1
+ punpckldq xm16, xm4, xm5
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ psrad xm16, 1
+ packssdw xm16, xm16
+ vpsravw xm16, xm11
+ vmovdqu8 m4{k2}, m16
+ vpalignr m5{k2}, m16, m16, 4
+ call .add_noise
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, _, top_offxy
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m16, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movu m17, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m4, m0, m16
+ punpcklwd m0, m16
+ punpckhwd m5, m1, m17
+ punpcklwd m1, m17
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to .v_overlap, and instead always fall-through to .hv_overlap
+.hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+73]
+ lea left_offxyd, [offyq+73]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m5, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ movd xm18, [grain_lutq+left_offxyq*2+82*1]
+ pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
+ punpckldq xm16, xm5, xm0
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ punpckldq xm17, xm2, xm1
+ punpcklwd xm18, xm17
+ mova xm17, xm19
+ vpdpwssd xm17, xm20, xm18
+ punpckhwd m4, m0, m5
+ punpcklwd m0, m5
+ punpckhwd m5, m1, m2
+ punpcklwd m1, m2
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm11
+ vpshuflw m0{k2}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m1{k2}, m16, q1302
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq*2]
+ jl .hv_overlap
+.end:
+ RET
+ALIGN function_align
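+ ; Shared tail for each row pair: .add_noise_v blends current and top grain
+ ; rows with the vertical overlap weights (vpdpwssd against the rounding
+ ; constant, then a variable right shift) and falls through into .add_noise,
+ ; which gathers scaling[src] per pixel (vpcmpud masks lanes whose index
+ ; exceeds bdmax), forms round2(scaling[src]*grain, scaling_shift) via
+ ; vpsllvw+pmulhrsw, and stores two clipped output rows.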
+.add_noise_v:
+ mova m2, m19
+ vpdpwssd m2, m12, m4
+ mova m3, m19
+ vpdpwssd m3, m13, m5
+ mova m4, m19
+ vpdpwssd m4, m12, m0
+ mova m5, m19
+ vpdpwssd m5, m13, m1
+ REPX {psrad x, 1}, m2, m3, m4, m5
+ packssdw m4, m2
+ packssdw m5, m3
+ vpsravw m4, m11
+ vpsravw m5, m11
+.add_noise:
+ mova m0, [srcq+strideq*0]
+ mova m1, [srcq+strideq*1]
+ kmovw k4, k1
+ pand m16, m6, m0
+ psrld m3, m0, 16
+ vpgatherdd m2{k4}, [scalingq+m16]
+ vpcmpud k4, m3, m6, 2 ; px <= bdmax
+ vpgatherdd m16{k4}, [scalingq+m3]
+ kmovw k4, k1
+ pand m17, m6, m1
+ vpgatherdd m3{k4}, [scalingq+m17]
+ vpshufb m2{k3}, m16, m7
+ psrld m16, m1, 16
+ vpcmpud k4, m16, m6, 2
+ vpgatherdd m17{k4}, [scalingq+m16]
+ vpshufb m3{k3}, m17, m7
+ vpsllvw m2, m10
+ vpsllvw m3, m10
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+ add grain_lutq, 82*4
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m8
+ pmaxsw m1, m8
+ pminsw m0, m9
+ pminsw m1, m9
+ mova [dstq+srcq], m0
+ add srcq, strideq
+ mova [dstq+srcq], m1
+ add srcq, strideq
+ ret
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r12-fg_min
+ lea r12, [fg_min]
+ mov r9d, r13m ; bdmax
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r11d, is_idm
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m5, r13m
+ mov r13, 0xeeeeeeeeeeeeeeee
+ vbroadcasti32x4 m6, [base+scale_mask]
+ shr r9d, 11 ; is_12bpc
+ vpbroadcastd m7, [base+scale_shift+r7*4-32]
+ shlx r10d, r6d, r9d
+ mov sbyd, sbym
+ shlx r6d, r6d, r11d
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ lea r6d, [r9+r6*2]
+ vpbroadcastd m9, [base+fg_max+r6*4]
+ kmovq k2, r13
+ vpbroadcastd m20, [base+scale_rnd+r9*4]
+ packssdw m4, m5, m5
+ vpbroadcastd m21, [base+scale_shift+r9*8+4]
+%if %2
+ mova m12, [base+pb_0to63] ; pw_even
+ mov r13d, 0x0101
+ vpbroadcastq m10, [base+pw_23_22+r9*8]
+ kmovw k3, r13d
+%if %3
+ pshufd m11, m10, q0000
+%else
+ vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0]
+ vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4]
+ vmovdqu16 m11{k1}, m16
+%endif
+ psrlw m13, m12, 8 ; pw_odd
+%else
+ vpbroadcastq m10, [base+pw_27_17_17_27+r9*8]
+ kshiftrb k3, k1, 7 ; 0x01
+ kshiftrb k4, k1, 4 ; 0x0f
+ pshufd m11, m10, q0000
+%endif
+ mov lstrideq, r10mp
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+%if %1
+ mov r6d, r11m
+ vpbroadcastd m0, [base+uv_offset_mul+r9*4]
+ vpbroadcastd m1, [base+pb_8_9_0_1]
+ vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4]
+ vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
+ pmaddwd m14, m0
+ pshufb m15, m1 ; { uv_luma_mult, uv_mult }
+%endif
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz %%v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+%endif
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ mova m17, m20
+ vpdpwssd m17, m16, m10
+ psrad m17, 1
+ packssdw m17, m17
+ vpsravw m17, m21
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1
+ punpckldq xm17, xm18, xm19
+ punpcklwd xm16, xm17
+ mova xm17, xm20
+ vpdpwssd xm17, xm16, xm10
+ psrad xm17, 1
+ packssdw xm17, xm17
+ vpsravw xm17, xm21
+%endif
+ vmovdqa32 m18{k3}, m17
+ vpshufd m19{k3}, m17, q0321
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, _, top_offxy
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %3
+ movu ym16, [grain_lutq+offxyq*2+82*0]
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd ym17, ym1, ym16
+ punpckhwd ym1, ym16
+%elif %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym17, [grain_lutq+top_offxyq*2+82*0]
+ vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd m16, m17, m18
+ punpckhwd m17, m18
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m16, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m16
+ punpcklwd m2, m16
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to %%v_overlap, and instead always fall through to %%hv_overlap
+%%hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+topleft_offxyq*2+82*0]
+ mova m0, m20
+ vpdpwssd m0, m16, m10
+%if %3
+ punpcklwd xm17, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ psrad xm16, 1
+%else
+ vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2
+ punpcklwd m17, m1
+ mova m16, m20
+ vpdpwssd m16, m17, m10
+ psrad m16, 1
+%endif
+ psrad m0, 1
+ packssdw m0, m16
+ vpsravw m0, m21
+ vmovdqa32 m18{k3}, m0
+ vpshufd m19{k3}, m0, q0321
+%if %3
+ vpunpckhdq ym1{k3}, ym0, ym0
+ punpcklwd ym17, ym1, ym18
+ punpckhwd ym1, ym18
+%else
+ vpunpckhdq m1{k3}, m0, m0
+ punpcklwd m16, m1, m18
+ punpckhwd m17, m1, m18
+%endif
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
+ punpckldq xm16, xm18, xm19
+ punpcklwd xm17, xm16
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m0, [grain_lutq+top_offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*2]
+ pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
+ punpckldq xm1, xm2, xm0
+ punpcklwd xm1, xm16, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ mova xm17, xm20
+ vpdpwssd xm17, xm1, xm10
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m0
+ punpcklwd m2, m0
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm21
+ vpshuflw m19{k4}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m2{k4}, m16, q3120
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ jmp %%hv_overlap
+
+ALIGN function_align
+%%add_noise_v:
+%if %3
+ mova ym16, ym20
+ vpdpwssd ym16, ym17, ym11
+ mova ym17, ym20
+ vpdpwssd ym17, ym1, ym11
+ psrad ym16, 1
+ psrad ym17, 1
+ packssdw ym16, ym17
+ vpsravw m18{k1}, m16, m21
+%elif %2
+ mova m18, m20
+ vpdpwssd m18, m16, m11
+ mova m16, m20
+ vpdpwssd m16, m17, m11
+ psrad m18, 1
+ psrad m16, 1
+ packssdw m18, m16
+ vpsravw m18, m21
+%else
+ mova m16, m20
+ vpdpwssd m16, m1, m11
+ mova m17, m20
+ vpdpwssd m17, m18, m11
+ mova m18, m20
+ vpdpwssd m18, m19, m11
+ mova m19, m20
+ vpdpwssd m19, m2, m11
+ REPX {psrad x, 1}, m16, m17, m18, m19
+ packssdw m18, m16
+ packssdw m19, m17
+ vpsravw m18, m21
+ vpsravw m19, m21
+%endif
+%%add_noise:
+%if %2
+ mova m2, [lumaq+lstrideq*(0<<%3)]
+ mova m0, [lumaq+lstrideq*(1<<%3)]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova m3, [lumaq+lstrideq*(0<<%3)]
+ mova m1, [lumaq+lstrideq*(1<<%3)]
+ mova m16, m12
+ vpermi2w m16, m2, m0
+ vpermt2w m2, m13, m0
+ mova m17, m12
+ vpermi2w m17, m3, m1
+ vpermt2w m3, m13, m1
+ pavgw m2, m16
+ pavgw m3, m17
+%elif %1
+ mova m2, [lumaq+lstrideq*0]
+ mova m3, [lumaq+lstrideq*1]
+%endif
+%if %2
+ mova ym16, [srcq+strideq*0]
+ vinserti32x8 m16, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+%else
+ mova m16, [srcq+strideq*0]
+%endif
+%if %1
+ punpckhwd m17, m2, m16
+ mova m0, m14
+ vpdpwssd m0, m17, m15
+ punpcklwd m17, m2, m16
+ mova m2, m14
+ vpdpwssd m2, m17, m15
+%endif
+%if %2
+ mova ym17, [srcq+strideq*0]
+ vinserti32x8 m17, [srcq+strideq*1], 1
+%else
+ mova m17, [srcq+strideq*1]
+%endif
+%if %1
+ psrad m0, 6
+ psrad m2, 6
+ packusdw m2, m0
+ punpckhwd m0, m3, m17
+ mova m1, m14
+ vpdpwssd m1, m15, m0
+ punpcklwd m0, m3, m17
+ mova m3, m14
+ vpdpwssd m3, m15, m0
+ psrad m1, 6
+ psrad m3, 6
+ packusdw m3, m1
+ pminuw m2, m4
+ pminuw m3, m4
+
+.add_noise_main:
+ ; scaling[luma_src]
+ kmovw k5, k1
+ pand m1, m5, m2
+ vpgatherdd m0{k5}, [scalingq+m1]
+ kmovw k5, k1
+ psrld m2, 16
+ vpgatherdd m1{k5}, [scalingq+m2]
+ vpshufb m0{k2}, m1, m6
+ kmovw k5, k1
+ psrld m1, m3, 16
+ vpgatherdd m2{k5}, [scalingq+m1]
+ kmovw k5, k1
+ pand m3, m5
+ vpgatherdd m1{k5}, [scalingq+m3]
+ vpshufb m1{k2}, m2, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ vpsllvw m0, m7
+ vpsllvw m1, m7
+ pmulhrsw m18, m0
+ pmulhrsw m19, m1
+ add grain_lutq, 82*(4<<%2)
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ lea srcq, [srcq+strideq*2]
+ paddw m16, m18
+ paddw m17, m19
+ pmaxsw m16, m8
+ pmaxsw m17, m8
+ pminsw m16, m9
+ pminsw m17, m9
+%if %2
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+ lea dstq, [dstq+strideq*2]
+ mova [dstq+strideq*0], ym17
+ vextracti32x8 [dstq+strideq*1], m17, 1
+%else
+ mova [dstq+strideq*0], m16
+ mova [dstq+strideq*1], m17
+%endif
+ lea dstq, [dstq+strideq*2]
+ ret
+%else
+%if %2
+ pand m2, m4
+ pand m3, m4
+%else
+ pand m2, m4, [lumaq+lstrideq*0]
+ pand m3, m4, [lumaq+lstrideq*1]
+%endif
+ jmp .add_noise_main
+%endif
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif
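
A note on the per-pixel math implemented by the .add_noise/%%add_noise blocks above: the inline comments ("scaling[luma_src]", "noise = round2(scaling[luma_src] * grain, scaling_shift)", clip against fg_min/fg_max) all describe the same scalar operation. The sketch below is illustrative C under those comments' assumptions; the high-bit-depth scaling-table indexing and the vpsllvw/pmulhrsw shift trick are simplified into a plain round2, and apply_grain_px with its parameters is a stand-in name, not dav1d API.

    #include <stdint.h>

    /* Scalar model of one output pixel: look up the scaling factor for the
     * (possibly luma-derived) source sample, scale the grain with rounding,
     * add it to the source and clamp to the selected pixel range. */
    static inline int apply_grain_px(int src, int scaling_idx, int grain,
                                     const uint8_t *scaling, int scaling_shift,
                                     int mn, int mx)
    {
        const int noise = (scaling[scaling_idx] * grain +
                           (1 << (scaling_shift - 1))) >> scaling_shift; /* round2 */
        const int dst = src + noise;
        return dst < mn ? mn : dst > mx ? mx : dst;
    }
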
diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm
new file mode 100644
index 0000000000..6b0daaac0b
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm
@@ -0,0 +1,3421 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+SECTION_RODATA 16
+pd_16: times 4 dd 16
+pw_1: times 8 dw 1
+pw_16384: times 8 dw 16384
+pw_8192: times 8 dw 8192
+pw_23_22: dw 23, 22
+ times 3 dw 0, 32
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+pw_27_17_17_27: dw 27, 17, 17, 27
+ times 2 dw 0, 32
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512, 1024
+max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
+min: dw 0, 16*4, 16*16
+; these two should be next to each other
+pw_4: times 2 dw 4
+pw_16: times 2 dw 16
+
+%macro JMP_TABLE 1-*
+ %xdefine %1_table %%table
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .ar%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3
+
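The JMP_TABLE macro above emits, for each listed function, a table of 32-bit offsets from the table base to that function's .ar0-.ar3 labels; after the white-noise loops the generators load FGData.ar_coeff_lag, index the table and jump (the movsxd/jmp pairs below). A hedged C analogue of that dispatch, with ar_fn and table as illustrative names:

    #include <stdint.h>

    typedef void (*ar_fn)(int16_t *buf); /* stands in for the .ar0-.ar3 paths */

    /* ar_coeff_lag (0..3) selects one of four auto-regression routines,
     * exactly like indexing the offset table and jumping in the asm. */
    static void dispatch_ar(const ar_fn table[4], int ar_coeff_lag, int16_t *buf)
    {
        table[ar_coeff_lag & 3](buf);
    }
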
+SECTION .text
+
+%if ARCH_X86_32
+%undef base
+%define PIC_ptr(a) base+a
+%else
+%define PIC_ptr(a) a
+%endif
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 8
+%define %%tmp %8
+%endif
+%rep (%6/2)
+%if %%idx == 0
+ movd %5 %+ d, %2
+ pshuflw %%tmp, %2, q3232
+%else
+ movd %5 %+ d, %%tmp
+%if %6 == 8
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp
+%elif %%idx == 4
+ psrlq %%tmp, 32
+%endif
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w
+ shr %5 %+ d, 16
+
+%if %%idx == 0
+ movd %1, [%3+%4*%7]
+%else
+ pinsrw %1, [%3+%4*%7], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5*%7], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
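
The vpgatherdw macro above emulates a word gather on SSE targets: each 16-bit index is moved out to a GPR and used for a scalar table load, with the results re-inserted via pinsrw. A scalar C model of what it computes (function and parameter names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* dst[i] = 16-bit load from 'table' at byte offset idx[i]*stride.
     * 'cnt' and 'stride' correspond to the macro's %6 and %7 arguments;
     * the scaling users pass scalingq-1 with stride 1 and then shift the
     * loaded word right by 8 to turn it into a byte lookup. */
    static void gatherdw(uint16_t *dst, const uint16_t *idx,
                         const uint8_t *table, int cnt, int stride)
    {
        for (int i = 0; i < cnt; i++)
            memcpy(&dst[i], &table[idx[i] * stride], sizeof(uint16_t));
    }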
+
+%macro SPLATD 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2 ; dst, src
+%ifnidn %1, %2
+ movd %1, %2
+%endif
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
+ lea r4, [pb_mask]
+%define base r4-pb_mask
+%else
+cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
+ LEA r4, $$
+%define base r4-$$
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r3d, [fg_dataq+FGData.grain_scale_shift]
+ lea r5d, [bdmaxq+1]
+ shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r3, r5
+ SPLATW m6, [base+round+r3*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+ mov r3, -73*82*2
+ sub bufq, r3
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+.loop:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r5, r7, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+r3], m3
+ add r3, 4*2
+ jl .loop
+
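The loop above steps four 16-bit grain seeds in parallel: rnd_next_upperbit_mask (0x100B, 0x2016, 0x402C, 0x8058) picks out the feedback taps at bits 0, 1, 3 and 12 for each lane, the parity of those taps becomes the new top bit, and the top 11 bits of each updated seed index gaussian_sequence. A scalar sketch of one such step (the grain_scale_shift / 12bpc scaling done by the paddw+pmulhrsw is left out):

    #include <stdint.h>

    /* One scalar PRNG step as used for film-grain generation: XOR the taps at
     * bits 0, 1, 3 and 12, shift the state right and feed the parity back in
     * as bit 15, then use the top 11 bits to index the Gaussian table. */
    static int16_t next_grain_sample(uint16_t *seed,
                                     const int16_t gaussian_sequence[2048])
    {
        unsigned r = *seed;
        unsigned bit = (r ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        r = ((r >> 1) | (bit << 15)) & 0xffff;
        *seed = (uint16_t)r;
        return gaussian_sequence[r >> 5]; /* 11-bit index */
    }
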
+ ; auto-regression code
+ movsxd r3, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
+ lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
+ jmp r3
+
+.ar1:
+%if WIN64
+ DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
+ lea bufq, [r0-2*(82*73-(82*3+79))]
+ PUSH r8
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+ DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
+ PUSH r6
+%define shiftd r1d
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_y]
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+%if WIN64
+ DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
+%elif ARCH_X86_64
+ DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
+%else ; x86-32
+%undef shiftd
+ DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
+%define hd dword r0m
+%define maxd dword minm
+%endif
+%if cpuflag(sse4)
+ pmovsxbw m4, m4
+%else
+ pxor m3, m3
+ pcmpgtb m3, m4
+ punpcklbw m4, m3
+%endif
+ pinsrw m4, [base+pw_1], 3
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd
+ mov hd, 70
+ sar maxd, 1
+ mov mind, maxd
+ xor mind, -1
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar1
+%if WIN64
+ POP r8
+%elif ARCH_X86_32
+ POP r6
+%undef maxd
+%undef hd
+%endif
+.ar0:
+ RET
+
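For orientation, the .ar1 block above applies a lag-1 auto-regression on top of the white noise: three coefficients weight the previous row (loaded as "top/left", "top", "top/right"), the fourth weights the sample just written (kept live in val3d as the left neighbour), and the weighted sum plus the rounding constant is shifted by ar_coeff_shift, added to the current sample and clamped. A hedged scalar sketch, assuming 82-sample-wide grain rows as in the asm:

    #include <stdint.h>

    /* Lag-1 AR filter over one 76-sample grain row; 'row' points at the first
     * sample produced in row y, the previous row sits 82 samples earlier.
     * c[0..2] weight row[x-82-1 .. x-82+1], c[3] the left neighbour; 'rnd' is
     * the round_vals entry for 'shift', mn/mx the grain clamp range. */
    static void ar1_row(int16_t *row, const int8_t c[4], int shift, int rnd,
                        int mn, int mx)
    {
        for (int x = 0; x < 76; x++) {
            const int s = c[0] * row[x - 82 - 1] + c[1] * row[x - 82] +
                          c[2] * row[x - 82 + 1] + c[3] * row[x - 1] + rnd;
            const int v = row[x] + (s >> shift);
            row[x] = (int16_t)(v < mn ? mn : v > mx ? mx : v);
        }
    }
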
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m0, [base+round_vals-12+shiftq*2]
+ pshuflw m0, m0, q0000
+ movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11
+ pxor m2, m2
+ punpcklwd m0, m2
+ pcmpgtb m2, m6
+ punpckhbw m3, m6, m2
+ punpcklbw m6, m2
+ pshufd m2, m6, q3333
+ pshufd m1, m6, q2222
+ pshufd m7, m6, q1111
+ pshufd m6, m6, q0000
+ pshufd m4, m3, q1111
+ pshufd m3, m3, q0000
+%if ARCH_X86_64
+ SWAP 0, 12
+ SWAP 1, 8
+ SWAP 2, 9
+ SWAP 3, 10
+ SWAP 4, 11
+%else
+%define m12 [rsp+0*16]
+%define m8 [rsp+1*16]
+%define m9 [rsp+2*16]
+%define m10 [rsp+3*16]
+%define m11 [rsp+4*16]
+ mova m12, m0
+ mova m8, m1
+ mova m9, m2
+ mova m10, m3
+ mova m11, m4
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m0, bdmaxd ; max_grain
+ pcmpeqw m1, m1
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ psrldq m2, 14
+ pslldq m2, 2
+ pxor m2, m1
+%endif
+ pxor m1, m0 ; min_grain
+%if ARCH_X86_64
+ SWAP 0, 13
+ SWAP 1, 14
+ SWAP 2, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+ mova m13, m0
+ mova m14, m1
+%if !cpuflag(sse4)
+%define m15 [rsp+7*16]
+ mova m15, m2
+%endif
+%endif
+ sub bufq, 2*(82*73-(82*3+79))
+ DEFINE_ARGS buf, fg_data, h, x
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m2, m0, 2
+ psrldq m3, m0, 4
+ psrldq m4, m0, 6
+ psrldq m5, m0, 8
+ punpcklwd m0, m2
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ psrldq m2, m1, 2
+ psrldq m4, m1, 4
+ punpcklwd m2, m4
+ psrldq m4, m1, 6
+ psrldq m1, 8
+ punpcklwd m4, m1
+ pmaddwd m0, m6
+ pmaddwd m3, m7
+ pmaddwd m5, m8
+ pmaddwd m2, m9
+ pmaddwd m4, m10
+ paddd m0, m3
+ paddd m5, m2
+ paddd m0, m4
+ paddd m0, m5 ; accumulated top 2 rows
+ paddd m0, m12
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m4, m1, q3321
+ pxor m2, m2
+ pcmpgtw m2, m4
+ punpcklwd m4, m2 ; in dwords, y=0,x=[0,3]
+.x_loop_ar2_inner:
+ pmaddwd m2, m1, m11
+ paddd m2, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ paddd m2, m4
+ packssdw m2, m2
+ pminsw m2, m13
+ pmaxsw m2, m14
+ psrldq m4, 4
+ pslldq m2, 2
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000010b
+%else
+ pand m1, m15
+ pandn m3, m15, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixel, this should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 64
+ %define tmp rsp
+%elif ARCH_X86_64
+ %define tmp rsp+stack_offset-72
+%else
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*12
+ %define tmp rsp
+ mov bdmaxd, bdmaxm
+%endif
+ sar bdmaxd, 1
+ SPLATW m7, bdmaxd ; max_grain
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m4, m4
+ psrldq m4, 14
+ pslldq m4, 4
+ pxor m4, m6
+%endif
+ pxor m6, m7 ; min_grain
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+
+%if ARCH_X86_64
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m14 [rsp+10*16]
+%define m15 [esp+11*16]
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; build cf0-1 until 18-19 in m5-12 and r0/1
+ pxor m1, m1
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+
+%if cpuflag(sse4)
+ pshufd m4, m2, q3333
+%else
+ pshufd m5, m2, q3333
+ mova [tmp+48], m5
+%endif
+ pshufd m3, m2, q2222
+ pshufd m1, m2, q0000
+ pshufd m2, m2, q1111
+ pshufd m7, m0, q2222
+ pshufd m6, m0, q1111
+ pshufd m5, m0, q0000
+ pshufd m0, m0, q3333
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+4*16]
+%define m9 [esp+5*16]
+%define m10 [rsp+6*16]
+%define m11 [esp+7*16]
+%define m12 [rsp+8*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+ ; build cf20,round in r2
+ ; build cf21-23,round*2 in m13
+ pxor m1, m1
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ mova [tmp+ 0], m1
+ mova [tmp+16], m2
+ psrldq m3, m0, 10
+ pinsrw m3, [base+round_vals+shiftq*2-10], 3
+
+%if ARCH_X86_64
+ SWAP 3, 13
+%else
+%define m13 [esp+9*16]
+ mova m13, m3
+%endif
+
+ pinsrw m0, [base+round_vals+shiftq*2-12], 5
+ pshufd m3, m0, q2222
+ mova [tmp+32], m3
+
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 2*(82*73-(82*3+79))
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3:
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ paddd m0, m2
+ paddd m0, m3
+ ; m0 = top line first 6 multiplied by cf, m1 = top line last entry
+
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+ palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
+ palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6]
+ punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+
+ pmaddwd m1, m8
+ pmaddwd m4, m9
+ pmaddwd m3, m10
+ pmaddwd m2, m11
+ paddd m1, m4
+ paddd m3, m2
+ paddd m0, m1
+ paddd m0, m3
+ ; m0 = top 2 lines multiplied by cf
+
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, [base+pw_1]
+
+%if cpuflag(sse4)
+ pmaddwd m1, m12
+%else
+ pmaddwd m1, [tmp+48]
+%endif
+ pmaddwd m3, [tmp+ 0]
+ pmaddwd m4, [tmp+16]
+ pmaddwd m2, [tmp+32]
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m1
+ paddd m0, m4
+ ; m0 = top 3 lines multiplied by cf plus rounding for downshift
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m15
+ pmaxsw m2, m14
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m12
+ pandn m3, m12, m2
+ por m1, m3
+%endif
+ ; overwrite a couple of pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+ movifnidn bdmaxd, bdmaxm
+ lea r6d, [bdmaxq+1]
+%else
+cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h
+%define base r2-$$
+ LEA r2, $$
+ mov fg_dataq, r2m
+ mov r6d, r4m
+ inc r6d
+%endif
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
+ sub r5, r6
+ SPLATW m6, [base+round+r5*2-2]
+ mova m5, [base+pb_mask]
+ SPLATW m0, [fg_dataq+FGData.seed]
+%if ARCH_X86_64
+ SPLATW m2, [base+pw_seed_xor+uvq*4]
+%else
+ mov r5d, r3m
+ SPLATW m2, [base+pw_seed_xor+r5*4]
+%endif
+ pxor m0, m2
+%if ARCH_X86_64
+ lea r6, [gaussian_sequence]
+%endif
+%if %2
+ mov hd, 73-35*%3
+ add bufq, 44*2
+.loop_y:
+ mov xq, -44
+%else
+ mov xq, -82*73
+ add bufq, 82*73*2
+%endif
+.loop_x:
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m2, m3
+ psllq m3, m2, 15
+ por m2, m3 ; aggregate each bit into next seed's high bit
+ pmulhuw m3, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ vpgatherdw m3, m2, r6, r9, r10, 4, 2
+%else
+ vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2
+%endif
+ paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
+ ; shifts by 0, which pmulhrsw does not support
+ pmulhrsw m3, m6
+ movq [bufq+xq*2], m3
+ add xq, 4
+ jl .loop_x
+%if %2
+ add bufq, 82*2
+ dec hd
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4]
+ lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table]
+ jmp r5
+
+.ar0:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*2
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ SPLATW m3, [base+hmul_bits+shiftq*2-10]
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m1, bdmaxd ; max_grain
+%else
+ SPLATW m1, r4m
+ psraw m1, 1
+%endif
+ pcmpeqw m7, m7
+ pxor m7, m1 ; min_grain
+%if ARCH_X86_64
+ SWAP 1, 14
+ DEFINE_ARGS buf, bufy, h, x
+%else
+%define m14 [rsp+0*16]
+ mova m14, m1
+ DEFINE_ARGS buf, bufy, pic_reg, h, x
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATW m4, m4
+ pxor m5, m5
+%if %2
+%if !cpuflag(sse4)
+ pcmpeqw m2, m2
+ pslldq m2, 12
+%if ARCH_X86_64
+ SWAP 2, 12
+%else
+%define m12 [rsp+1*16]
+ mova m12, m2
+%endif
+%endif
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
+%else
+ sub bufq, 2*(82*70-3)
+%endif
+ add bufyq, 2*(3+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar0:
+ ; first 32 pixels
+ xor xd, xd
+.x_loop_ar0:
+ movu m0, [bufyq+xq*(2<<%2)]
+%if %2
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ paddw m0, m2
+%endif
+ movu m1, [bufyq+xq*4 +16]
+%if %3
+ movu m2, [bufyq+xq*4+82*2+16]
+ paddw m1, m2
+%endif
+ phaddw m0, m1
+ pmulhrsw m0, m6
+%endif
+ punpckhwd m1, m0, m5
+ punpcklwd m0, m5
+ REPX {pmaddwd x, m4}, m0, m1
+ REPX {psrad x, 5}, m0, m1
+ packssdw m0, m1
+ pmulhrsw m0, m3
+ movu m1, [bufq+xq*2]
+ paddw m0, m1
+ pminsw m0, m14
+ pmaxsw m0, m7
+ cmp xd, 72-40*%2
+ je .end
+ movu [bufq+xq*2], m0
+ add xd, 8
+ jmp .x_loop_ar0
+
+ ; last 6/4 pixels
+.end:
+%if %2
+%if cpuflag(sse4)
+ pblendw m0, m1, 11000000b
+%else
+ pand m1, m12
+ pandn m2, m12, m0
+ por m0, m1, m2
+%endif
+ movu [bufq+xq*2], m0
+%else
+ movq [bufq+xq*2], m0
+%endif
+
+ add bufq, 82*2
+ add bufyq, 82*(2<<%3)
+ dec hd
+ jg .y_loop_ar0
+%if ARCH_X86_32
+%undef m12
+%undef m14
+%endif
+ RET
+
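In the chroma paths (the .ar0 block above and the .ar1-.ar3 blocks below), the co-located luma grain is first collapsed to chroma resolution: phaddw sums horizontal pairs, the optional +82*2 load adds the second luma row when vertically subsampled, and the pmulhrsw against hmul_bits (16384 or 8192) is a rounded divide by 2 or 4. A scalar sketch of that averaging, assuming the usual meaning of ss_hor/ss_ver (1,1 for 4:2:0; 1,0 for 4:2:2; 0,0 for 4:4:4):

    #include <stddef.h>
    #include <stdint.h>

    /* Rounded average of the 1, 2 or 4 luma grain samples covering one chroma
     * position (cx, cy); 'stride' is the luma grain row pitch in samples. */
    static int luma_avg(const int16_t *buf_y, int cx, int cy, ptrdiff_t stride,
                        int ss_hor, int ss_ver)
    {
        int sum = 0;
        for (int i = 0; i <= ss_ver; i++)
            for (int j = 0; j <= ss_hor; j++)
                sum += buf_y[((cy << ss_ver) + i) * stride + (cx << ss_hor) + j];
        return (sum + ((1 << (ss_hor + ss_ver)) >> 1)) >> (ss_hor + ss_ver);
    }
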
+.ar1:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x
+%else
+%assign stack_offset stack_offset_old
+%xdefine rstk rsp
+%assign stack_size_padded 0
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ imul uvd, 28
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+%if WIN64
+ DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0
+%if %2
+ lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))]
+%else
+ lea bufq, [r0-2*(82*69+3)]
+%endif
+%else
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0
+%else
+ DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3
+%define hd dword r1m
+%define mind dword r3m
+%define maxd dword r4m
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+%endif
+%if ARCH_X86_64
+ mov shiftd, [r2+FGData.ar_coeff_shift]
+%else
+ mov shiftd, [r3+FGData.ar_coeff_shift]
+%endif
+ pxor m5, m5
+ pcmpgtb m5, m4
+ punpcklbw m4, m5 ; cf0-4 in words
+ pshuflw m4, m4, q2100
+ psrldq m4, 2 ; cf0-3,4 in words
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pxor m6, m6
+ punpcklwd m3, m6
+%if %2
+ SPLATW m6, [base+hmul_bits+2+%3*2]
+%endif
+ SPLATD m3, m3
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+ sar maxd, 1
+%if ARCH_X86_64
+ mov mind, maxd
+ xor mind, -1
+%else
+ DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3
+ mov r2, maxd
+ xor r2, -1
+ mov mind, r2
+%endif
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, word [bufq+xq*2-2]
+.x_loop_ar1:
+ movu m0, [bufq+xq*2-82*2-2] ; top/left
+%if %2
+ movu m7, [bufyq+xq*4]
+%if %3
+ movu m1, [bufyq+xq*4+82*2]
+ phaddw m7, m1
+%else
+ phaddw m7, m7
+%endif
+%else
+ movq m7, [bufyq+xq*2]
+%endif
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+%if %2
+%if %3
+ pshufd m2, m7, q3232
+ paddw m7, m2
+%endif
+ pmulhrsw m7, m6
+%endif
+ punpcklwd m1, m7
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, word [bufq+xq*2]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovg val3d, maxd
+ cmp val3d, mind
+ cmovl val3d, mind
+ mov word [bufq+xq*2], val3w
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar1
+%if ARCH_X86_32
+%undef maxd
+%undef mind
+%undef hd
+%endif
+ RET
+
+.ar2:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+ ALLOC_STACK -16*8
+ mov bufyq, r1m
+ mov uvd, r3m
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m5, bdmaxd ; max_grain
+%else
+ SPLATW m5, r4m
+ psraw m5, 1
+%endif
+ pcmpeqw m6, m6
+%if !cpuflag(sse4)
+ pcmpeqw m7, m7
+ psrldq m7, 14
+ pslldq m7, 2
+ pxor m7, m6
+%endif
+ pxor m6, m5 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m7, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 5, 13
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m13 [rsp+5*16]
+%define m14 [rsp+6*16]
+%define m15 [rsp+7*16]
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; coef values
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pinsrw m2, [base+round_vals-12+shiftq*2], 5
+
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m1, m0, q3333
+ pshufd m0, m0, q2222
+ pshufd m3, m2, q1111
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+ SWAP 4, 12
+%else
+%define m8 [rsp+0*16]
+%define m9 [rsp+1*16]
+%define m10 [rsp+2*16]
+%define m11 [rsp+3*16]
+%define m12 [rsp+4*16]
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+%endif
+
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
+ movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
+ psrldq m4, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m1, m0, 4 ; y=-2,x=[-0,+5]
+ psrldq m3, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m2, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1]
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ pmaddwd m2, m8
+ paddd m0, m1
+ paddd m0, m2
+ psrldq m3, m5, 2 ; y=-1,x=[-1,+5]
+ psrldq m1, m5, 4 ; y=-1,x=[-0,+5]
+ psrldq m4, m5, 6 ; y=-1,x=[+1,+5]
+ psrldq m2, m5, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ punpcklwd m4, m2
+ pmaddwd m3, m9
+ pmaddwd m4, m10
+ paddd m3, m4
+ paddd m0, m3
+
+ ; luma component & rounding
+%if %2
+ movu m1, [bufyq+xq*4]
+%if %3
+ movu m2, [bufyq+xq*4+82*2]
+ phaddw m1, m2
+ pshufd m2, m1, q3232
+ paddw m1, m2
+%else
+ phaddw m1, m1
+%endif
+%if cpuflag(sse4)
+ pmulhrsw m1, m15
+%elif %3
+ pmulhrsw m1, [base+pw_8192]
+%else
+ pmulhrsw m1, [base+pw_16384]
+%endif
+%else
+ movq m1, [bufyq+xq*2]
+%endif
+ punpcklwd m1, [base+pw_1]
+ pmaddwd m1, m12
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
+ pshufd m2, m1, q3321
+ pxor m3, m3
+ pcmpgtw m3, m2
+ punpcklwd m2, m3 ; y=0,x=[0,3] in dword
+.x_loop_ar2_inner:
+ pmaddwd m3, m1, m11
+ paddd m3, m0
+ psrldq m0, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ ; we do not need to packssdw since we only care about one value
+ paddd m3, m2
+ packssdw m3, m3
+ pminsw m3, m13
+ pmaxsw m3, m14
+ psrldq m1, 2
+ pslldq m3, 2
+ psrldq m2, 4
+%if cpuflag(sse4)
+ pblendw m1, m3, 00000010b
+%else
+ pand m1, m15
+ pandn m4, m15, m3
+ por m1, m4
+%endif
+ ; overwrite previous pixel, should be ok
+ movd [bufq+xq*2-2], m1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar2
+%if ARCH_X86_32
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+
+.ar3:
+%if ARCH_X86_64
+ DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
+%if WIN64
+ mov r6, rsp
+ and rsp, ~15
+ sub rsp, 96
+ %define tmp rsp
+%else
+ %define tmp rsp+stack_offset-120
+%endif
+%else
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+ mov bufyq, r1m
+ mov uvd, r3m
+ %define tmp rsp
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ SPLATW m4, [base+round_vals-12+shiftq*2]
+ pxor m5, m5
+ pcmpgtw m5, m4
+ punpcklwd m4, m5
+%if ARCH_X86_64
+ sar bdmaxd, 1
+ SPLATW m6, bdmaxd ; max_grain
+%else
+ SPLATW m6, r4m
+ psraw m6, 1
+%endif
+ pcmpeqw m7, m7
+%if !cpuflag(sse4)
+ pcmpeqw m3, m3
+ psrldq m3, 14
+ pslldq m3, 4
+ pxor m3, m7
+%endif
+ pxor m7, m6 ; min_grain
+%if %2 && cpuflag(sse4)
+ SPLATW m3, [base+hmul_bits+2+%3*2]
+%endif
+
+%if ARCH_X86_64
+ SWAP 3, 11
+ SWAP 4, 12
+ SWAP 6, 14
+ SWAP 7, 15
+%else
+%define m11 [rsp+ 9*16]
+%define m12 [rsp+10*16]
+%define m14 [rsp+12*16]
+%define m15 [rsp+13*16]
+ mova m11, m3
+ mova m12, m4
+ mova m14, m6
+ mova m15, m7
+%endif
+
+ ; cf from y=-3,x=-3 until y=-3,x=-2
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pshufd m1, m0, q0000
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m0, m0, q3333
+ pshufd m5, m2, q0000
+ pshufd m6, m2, q1111
+ mova [tmp+16*0], m1
+ mova [tmp+16*1], m3
+ mova [tmp+16*2], m4
+ mova [tmp+16*3], m0
+ mova [tmp+16*4], m5
+ mova [tmp+16*5], m6
+ pshufd m6, m2, q2222
+ pshufd m7, m2, q3333
+
+ ; cf from y=-1,x=-1 to y=0,x=-1 + luma component
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpckhbw m2, m0, m1 ; luma
+ punpcklbw m0, m1
+ pshufd m3, m0, q3232
+ psrldq m5, m0, 10
+ ; y=0,x=[-3 to -1] + "1.0" for current pixel
+ pinsrw m5, [base+round_vals-10+shiftq*2], 3
+ ; y=-1,x=[-1 to +2]
+ pshufd m1, m0, q0000
+ pshufd m0, m0, q1111
+ ; y=-1,x=+3 + luma
+ punpcklwd m3, m2
+ pshufd m3, m3, q0000
+
+%if ARCH_X86_64
+ SWAP 1, 8
+ SWAP 0, 9
+ SWAP 3, 10
+ SWAP 5, 13
+ DEFINE_ARGS buf, bufy, fg_data, h, x
+%else
+%define m8 [rsp+ 6*16]
+%define m9 [rsp+ 7*16]
+%define m10 [rsp+ 8*16]
+%define m13 [rsp+11*16]
+ mova m8, m1
+ mova m9, m0
+ mova m10, m3
+ mova m13, m5
+ DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x
+%endif
+%if %2
+ sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
+%else
+ sub bufq, 2*(82*69+3)
+%endif
+ add bufyq, 2*(79+82*3)
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+
+.x_loop_ar3:
+ ; first line
+ movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
+ movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
+ palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
+ palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
+ punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+
+ pmaddwd m0, [tmp+0*16]
+ pmaddwd m2, [tmp+1*16]
+ pmaddwd m3, [tmp+2*16]
+ paddd m0, m2
+ paddd m0, m3 ; first 6 x of top y
+
+ ; second line [m0/1 are busy]
+ movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
+ movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
+ punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
+ palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
+ palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6]
+ punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
+ punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
+ shufps m3, m4, m5, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
+ pmaddwd m1, [tmp+3*16]
+ pmaddwd m4, [tmp+4*16]
+ pmaddwd m3, [tmp+5*16]
+ pmaddwd m5, m6
+ paddd m1, m4
+ paddd m3, m5
+ paddd m0, m1
+ paddd m0, m3 ; top 2 lines
+
+ ; third line [m0 is busy] & luma + round
+ movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
+ movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
+%if %2
+ movu m5, [bufyq+xq*4]
+%if %3
+ movu m4, [bufyq+xq*4+82*2]
+ phaddw m5, m4
+%else
+ phaddw m5, m5
+%endif
+%else
+ movq m5, [bufyq+xq*2]
+%endif
+ palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
+ palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
+%if %3
+ pshufd m4, m5, q3232
+ paddw m5, m4
+%endif
+%if %2
+%if cpuflag(sse4)
+ pmulhrsw m5, m11
+%elif %3
+ pmulhrsw m5, [base+pw_8192]
+%else
+ pmulhrsw m5, [base+pw_16384]
+%endif
+%endif
+ punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
+ punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
+ shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
+ punpcklwd m2, m5
+ pmaddwd m1, m7
+ pmaddwd m3, m8
+ pmaddwd m4, m9
+ pmaddwd m2, m10
+ paddd m1, m3
+ paddd m4, m2
+ paddd m0, m12 ; += round
+ paddd m1, m4
+ paddd m0, m1
+
+ movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pmaddwd m2, m1, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ packssdw m2, m2
+ pminsw m2, m14
+ pmaxsw m2, m15
+ pslldq m2, 4
+ psrldq m1, 2
+%if cpuflag(sse4)
+ pblendw m1, m2, 00000100b
+%else
+ pand m1, m11
+ pandn m3, m11, m2
+ por m1, m3
+%endif
+ ; overwrite previous pixels, should be ok
+ movq [bufq+xq*2-4], m1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82*2
+ add bufyq, 82*2<<%3
+ dec hd
+ jg .y_loop_ar3
+%if WIN64
+ mov rsp, r6
+%elif ARCH_X86_32
+%undef m8
+%undef m9
+%undef m10
+%undef m11
+%undef m12
+%undef m13
+%undef m14
+%undef m15
+%endif
+ RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+
+ mov r0m, r0
+ mov r2m, r1
+ mov r4m, r2
+ mov r6m, r3
+ mov r7m, r4
+ mov r8m, r5
+%else
+cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov scalingq, r5m
+ mov fg_dataq, r3m
+%if STACK_ALIGNMENT < mmsize
+ mov r6, r9m
+
+%define r9m [rsp+8*mmsize+ 4*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+
+ mov r9m, r6
+%endif
+ LEA r5, $$
+%define base r5-$$
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 3
+%else
+ DECLARE_REG_TMP 9, 10
+%endif
+ mov t0d, r9m ; bdmax
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t0d, [t0d*3]
+ lea t0d, [r6d*2+t0d]
+ SPLATW m4, [base+max+t0*2]
+ SPLATW m2, r9m
+
+ pcmpeqw m1, m1
+ psraw m7, m2, 1 ; max_grain
+ pxor m1, m7 ; min_grain
+ SPLATD m6, [base+pd_16]
+
+ SCRATCH 1, 9, 0
+ SCRATCH 2, 10, 1
+ SCRATCH 3, 11, 2
+ SCRATCH 4, 12, 3
+ SCRATCH 5, 13, 4
+ SCRATCH 6, 14, 5
+ SCRATCH 7, 15, 6
+
+ mova m6, [base+pw_27_17_17_27] ; for horizontal filter
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ movzx t0d, byte [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz .no_vertical_overlap
+ test sbyd, sbyd
+ jnz .vertical_overlap
+.no_vertical_overlap:
+ mov dword r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak
+%endif
+
+.loop_x_odd:
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4
+ vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m5, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp ; src += stride
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy
+%endif
+
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m4, [grain_lutq+r5*2]
+%else
+ movd m4, [grain_lutq+left_offxyq*2]
+%endif
+ punpcklwd m4, m5
+ pmaddwd m4, m6
+ paddd m4, m14
+ psrad m4, 5
+ packssdw m4, m4
+ pminsw m4, m15
+ pmaxsw m4, m9
+ shufps m4, m5, q3210
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5
+ vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5
+%endif
+ REPX {psrlw x, 8}, m2, m3
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ movu m5, [grain_lutq+offxyq*2+16]
+ REPX {pmullw x, m11}, m2, m3
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r12d, 16 ; top_offxy += 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
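Both the horizontal overlap handled above and the vertical overlap below blend two grain values with the weight pairs from pw_27_17_17_27, add the pd_16 bias, shift right by 5 and clamp to the grain range: horizontally the first two columns of a block are blended against the previous column's grain with (27,17) and (17,27), vertically the first two rows are blended against the top block's grain with (27,17) for row 0 and (17,27) for row 1, which is what the +4 offset into pw_27_17_17_27 selects; the subsampled chroma dimension uses the single (23,22) pair from pw_23_22 instead. A scalar sketch of the blend:

    #include <stdint.h>

    /* Blend one overlapping grain sample: (w0, w1) is one of the weight pairs
     * above, 16 is the rounding bias for the >>5, mn/mx the grain clamp range. */
    static int16_t blend_grain(int old_g, int new_g, int w0, int w1,
                               int mn, int mx)
    {
        const int g = (old_g * w0 + new_g * w1 + 16) >> 5;
        return (int16_t)(g < mn ? mn : g > mx ? mx : g);
    }
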
+.vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
+ sby, see
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, src_bak
+%endif
+
+ lea src_bakq, [srcq+wq*2]
+ mov r9mp, src_bakq
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ SPLATD m7, [base+pw_27_17_17_27]
+ mov seed, r3m
+%else
+ SPLATD m7, [pw_27_17_17_27]
+%endif
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+ mov hd, dword r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+1*gprsize]
+ movu m2, [grain_lutq+r5*2]
+%else
+ movu m2, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ REPX {pmaddwd x, m7}, m4, m2
+ REPX {paddd x, m14}, m4, m2
+ REPX {psrad x, 5}, m4, m2
+ packssdw m2, m4
+ pminsw m2, m15
+ pmaxsw m2, m9
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m3, [grain_lutq+r5*2+16]
+%else
+ movu m3, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m7}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m3, m5
+ pminsw m3, m15
+ pmaxsw m3, m9
+
+ ; src
+ pand m0, m10, [srcq+ 0] ; m0-1: src as word
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m4, m2
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2
+%else
+ vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m5, m3
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ btc dword r8m, 2
+ jc .next_blk_v
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ jmp .loop_x_odd_v_overlap
+
+.next_blk_v:
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r0, [rsp+8*mmsize+1*gprsize]
+ add r3, 16
+ add r0, 16
+ mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
+ mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy
+
+ mov seed, r3m
+ xor r0, r0
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
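+ ; each 16-bit seed advances one LFSR step: bits 0, 1, 3 and 12 are XORed into
+ ; a new top bit and the seed shifts right by one (the AV1 film grain PRNG),
+ ; done for the top and current seeds in a single 32-bit rotate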
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
+
+ movzx hd, word r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m2, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ movu m4, [grain_lutq+r0*2]
+ movd m5, [grain_lutq+r5*2]
+ mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
+ movd m3, [grain_lutq+r5*2]
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+ movd m5, [grain_lutq+left_offxyq*2]
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
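+ ; each blend below is round2(old*27 + new*17, 5), with the weights swapped
+ ; to 17/27 for the second overlapped row or column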
+ punpcklwd m5, m2
+ punpcklwd m3, m4
+ REPX {pmaddwd x, m6}, m5, m3
+ REPX {paddd x, m14}, m5, m3
+ REPX {psrad x, 5}, m5, m3
+ packssdw m5, m3
+ pminsw m5, m15
+ pmaxsw m5, m9
+ shufps m3, m5, m2, q3210
+ shufps m5, m4, q3232
+ ; followed by v interpolation (top | cur -> cur)
+ movu m0, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m1, [grain_lutq+r0*2+16]
+%else
+ movu m1, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpcklwd m2, m5, m3
+ punpckhwd m5, m3
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ REPX {pmaddwd x, m7}, m2, m5, m3, m1
+ REPX {paddd x, m14}, m2, m5, m3, m1
+ REPX {psrad x, 5}, m2, m5, m3, m1
+ packssdw m2, m5
+ packssdw m3, m1
+ REPX {pminsw x, m15}, m2, m3
+ REPX {pmaxsw x, m9}, m2, m3
+
+ ; src
+ pand m0, m10, [srcq+ 0]
+ pand m1, m10, [srcq+16] ; m0-1: src as word
+
+ ; scaling[src]
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
+%else
+ vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5
+%endif
+ psrlw m4, 8
+ pmullw m4, m11
+ pmulhrsw m2, m4
+%if ARCH_X86_32
+ vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4
+%else
+ vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4
+%endif
+ psrlw m5, 8
+ pmullw m5, m11
+ pmulhrsw m3, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+srcq+ 0], m0
+ mova [dstq+srcq+16], m1
+
+ add srcq, r2mp
+ add grain_lutq, 82*2
+ dec hw
+ jz .end_y_hv_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
+ xor hd, 0x10000
+ test hd, 0x10000
+ jnz .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+ or dword r8m, 4
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov r5, r5m
+ add offxyd, 16
+ add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
+ mov srcq, r9mp
+ add srcq, r4mp
+ add srcq, r4mp
+%else
+ add offxyd, 16
+ add top_offxyd, 16
+ mov src_bakq, r9mp
+ lea srcq, [src_bakq+wq*2]
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
+%if ARCH_X86_32
+ DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+%endif
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r4, r3m
+ mov r3, r4m
+ mov r5, r5m
+%define r0m [rsp+8*mmsize+ 3*gprsize]
+%define r1m [rsp+8*mmsize+ 4*gprsize]
+%define r2m [rsp+8*mmsize+ 5*gprsize]
+%define r3m [rsp+8*mmsize+ 6*gprsize]
+%define r4m [rsp+8*mmsize+ 7*gprsize]
+%define r5m [rsp+8*mmsize+ 8*gprsize]
+ mov r0m, r0
+ mov r2m, r2
+ mov r4m, r3
+ mov r5m, r5
+
+ mov r0, r6m
+ mov r2, r7m
+ mov r3, r8m
+ mov r5, r9m
+%define r6m [rsp+8*mmsize+ 9*gprsize]
+%define r7m [rsp+8*mmsize+10*gprsize]
+%define r8m [rsp+8*mmsize+11*gprsize]
+%define r9m [rsp+8*mmsize+12*gprsize]
+ mov r6m, r0
+ mov r7m, r2
+ mov r8m, r3
+ mov r9m, r5
+
+ mov r2, r10m
+ mov r3, r11m
+ mov r5, r12m
+ mov r0, r13m
+%define r10m [rsp+8*mmsize+13*gprsize]
+%define r11m [rsp+8*mmsize+14*gprsize]
+%define r12m [rsp+8*mmsize+15*gprsize]
+ mov r10m, r2
+ mov r11m, r3
+ mov r12m, r5
+
+ SPLATW m2, r13m
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov srcq, srcm
+ mov fg_dataq, r3m
+%endif
+ LEA r5, $$
+%define base r5-$$
+
+ DECLARE_REG_TMP 0, 2, 3
+%else
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r8-pb_mask
+ lea r8, [pb_mask]
+
+ DECLARE_REG_TMP 9, 10, 11
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ SPLATW m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+%if STACK_ALIGNMENT >= mmsize
+ mov t0d, r13m ; bdmax
+%endif
+ sar t0d, 11 ; is_12bpc
+ inc t0d
+ mov t1d, r6d
+ imul t1d, t0d
+ dec t0d
+ SPLATW m5, [base+min+t1*2]
+ lea t1d, [t0d*3]
+ mov t2d, r12m
+ inc t2d
+ imul r6d, t2d
+ add t1d, r6d
+ SPLATW m4, [base+max+t1*2]
+%if STACK_ALIGNMENT >= mmsize
+ SPLATW m2, r13m
+%endif
+
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+
+%define mzero m7
+
+%if %3
+ SPLATD m2, [base+pw_23_22]
+%endif
+
+%if ARCH_X86_32
+ mov scalingq, r5m
+ mov r5m, r5
+%else
+ mov r13mp, strideq
+%endif
+
+ pcmpeqw m0, m0
+ psraw m1, m10, 1
+ pxor m0, m1
+
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+
+ DECLARE_REG_TMP 0
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+
+ DECLARE_REG_TMP 9
+%endif
+
+%if %1
+ mov r6d, r11m
+ SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4]
+ SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklwd m6, m1, m0
+ SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4]
+ SPLATD m7, [base+pw_4+t0*4]
+ pmullw m5, m7
+%else
+ SPLATD m6, [base+pd_16]
+%if %2
+ mova m5, [base+pw_23_22]
+%else
+ mova m5, [base+pw_27_17_17_27]
+%endif
+%endif
+
+ SCRATCH 6, 14, 6
+ SCRATCH 5, 15, 7
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0
+%else
+ DECLARE_REG_TMP 7
+%endif
+
+ mov sbyd, r8m
+ mov t0d, [fg_dataq+FGData.overlap_flag]
+ test t0d, t0d
+ jz %%no_vertical_overlap
+ test sbyd, sbyd
+ jnz %%vertical_overlap
+
+%%no_vertical_overlap:
+ mov r8m, t0d
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused2, unused3, see, unused4, unused5, unused6, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4mp, wq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, unused2, unused3, luma, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, unused2, unused3, luma, lstride
+%endif
+
+%if %2 == 0
+%%loop_x_odd:
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16] ; m0-1: src as word
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
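+ ; with horizontal subsampling, each chroma sample is matched with the average
+ ; of the two covering luma samples, (l0 + l1 + 1) >> 1, via phaddw + pavgw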
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
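+ ; without chroma-scaling-from-luma the scaling-LUT index is
+ ; ((luma*uv_luma_mult + chroma*uv_mult) >> 6) + the bitdepth-scaled uv_offset,
+ ; clipped to [0, bdmax]; with csfl the (masked) luma value is used directly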
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m3, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2]
+ movu m6, [grain_lutq+offxyq*2+16]
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m3, m5
+ pmulhrsw m4, m3
+ pmulhrsw m6, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m4
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma
+
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0m, dstq
+ mov r9m, lumaq
+ mov r4m, wq
+%endif
+%if %2 == 0
+ btc dword r8m, 2
+ jc %%next_blk
+ add offxyd, 16
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%%next_blk:
+%endif
+ test dword r8m, 1
+ je %%loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jnz %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+ add offxyd, 16
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, luma, lstride
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, luma, lstride
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+ mov lumaq, r9m
+%endif
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m4, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9m, lumaq
+%endif
+%if %2
+ pavgw m4, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m3, m4, m0
+ punpcklwd m4, m0
+ punpckhwd m5, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m3, m4, m5, m6
+ REPX {psrad x, 6}, m3, m4, m5, m6
+ packssdw m4, m3
+ packssdw m6, m5
+ REPX {paddw x, m15}, m4, m6
+ REPX {pmaxsw x, mzero}, m4, m6
+ REPX {pminsw x, m10}, m4, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m4, m6
+%endif
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize]
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2+ 0]
+%endif
+ punpcklwd m5, m7 ; {left0, cur0}
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ pmaddwd m5, [PIC_ptr(pw_23_22)]
+%else
+ pmaddwd m5, [PIC_ptr(pw_27_17_17_27)]
+%endif
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ pmaddwd m5, m15
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m7, q3210
+ movu m3, [grain_lutq+offxyq*2+16]
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m4
+ pmulhrsw m5, m7
+ pmulhrsw m3, m4
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m5
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hd
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+ mov wq, r4mp
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; r8m = sbym
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jz %%loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end:
+ RET
+
+%%vertical_overlap:
+ or t0d, 2
+ mov r8m, t0d
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, unused1, unused2, unused3, lstride
+%endif
+
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul t0d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add t0d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and t0d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, t0d
+%if ARCH_X86_32
+ xor sbyd, seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, luma
+
+ mov r3m, seed
+ mov dstq, r0mp
+ mov lumaq, r9mp
+ mov wq, r4m
+ lea r3, [srcq+wq*2]
+ mov r1mp, r3
+ lea r3, [dstq+wq*2]
+ mov r11mp, r3
+ lea r3, [lumaq+wq*(2<<%2)]
+ mov r12mp, r3
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3, unused4, unused5, luma, lstride
+
+ mov lstrideq, r10mp
+%if %3
+ add lstrideq, lstrideq
+%endif
+ mov lumaq, r9mp
+ lea r10, [srcq+wq*2]
+ lea r11, [dstq+wq*2]
+ lea r12, [lumaq+wq*(2<<%2)]
+ mov r10mp, r10
+ mov r11mp, r11
+ mov r12mp, r12
+%endif
+ neg wq
+%if ARCH_X86_32
+ mov r4m, wq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, unused1, top_offxy, unused2, luma, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, unused1, top_offxy, unused2, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %2 == 0
+%%loop_x_odd_v_overlap:
+%endif
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_v_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ movu m5, [grain_lutq+r0*2]
+%else
+ movu m5, [grain_lutq+top_offxyq*2]
+%endif
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m5, [grain_lutq+r0*2+16]
+%else
+ movu m5, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m7, m5, m4
+ punpcklwd m5, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
+%else
+ REPX {paddd x, m14}, m7, m5
+%endif
+ REPX {psrad x, 5}, m7, m5
+ packssdw m4, m5, m7
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m5, [lumaq+ 0]
+ mova m6, [lumaq+(16<<%2)]
+%if %2
+ phaddw m5, [lumaq+16]
+ phaddw m6, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m5, mzero
+ pavgw m6, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m5, m0
+ punpcklwd m5, m0
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ punpckhwd m7, m6, m1
+ punpcklwd m6, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m5, m6
+ REPX {pmaxsw x, mzero}, m5, m6
+ REPX {pminsw x, m10}, m5, m6 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m5, m6
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1
+%else
+ vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1
+ vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m5
+ pmulhrsw m3, m7
+ pmulhrsw m4, m5
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+ dec hw
+ jle %%end_y_v_overlap
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+%if %3
+ jmp %%loop_y
+%else
+ btc hd, 16
+ jc %%loop_y
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_v_overlap
+%endif
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov r0mp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+
+%if %2
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+%else
+ btc dword r8m, 2
+ jc %%loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut
+
+ mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy
+ add offxyd, 16
+ add t0d, 16
+ mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd
+ mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+ xor t0d, t0d
+%else
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp t0b ; parity of top_seed
+ shr seed, 16
+ shl t0d, 16
+ test seeb, seeh
+ setp t0b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor t0d, r6d
+ mov seed, t0d
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride
+%endif
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+8*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+%if %3 == 0
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)]
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
+ mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
+ movd m5, [grain_lutq+r5*2]
+%else
+ movd m5, [grain_lutq+left_offxyq*2]
+%endif
+ movu m7, [grain_lutq+offxyq*2]
+%if ARCH_X86_32
+ mov r5, [rsp+8*mmsize+2*gprsize]
+ movu m4, [grain_lutq+r0*2]
+%if %2
+ pinsrw m5, [grain_lutq+r5*2], 2
+%else
+ movd m3, [grain_lutq+r5*2]
+%endif
+%else
+ movu m4, [grain_lutq+top_offxyq*2]
+%if %2
+ pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
+%else
+ movd m3, [grain_lutq+topleft_offxyq*2]
+%endif
+%endif
+%if %2 == 0
+ punpckldq m5, m3
+%endif
+ punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 }
+ punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
+%if %1
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %2
+ movddup m0, [PIC_ptr(pw_23_22)]
+%else
+ movddup m0, [PIC_ptr(pw_27_17_17_27)]
+%endif
+%else
+ pshufd m0, m15, q1010
+%endif
+ pmaddwd m5, m0
+%if %1
+ paddd m5, [PIC_ptr(pd_16)]
+%else
+ paddd m5, m14
+%endif
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, m9
+ shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3
+ shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter
+ shufps m5, m4, q3231 ; top0-7 post-h_filter
+
+ punpckhwd m7, m5, m3
+ punpcklwd m5, m3 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m7, m5
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
+%else
+ REPX {paddd x, m14}, m5, m7
+%endif
+ REPX {psrad x, 5}, m5, m7
+ packssdw m3, m5, m7
+ pmaxsw m3, m8
+ pminsw m3, m9
+
+ ; right half
+ movu m4, [grain_lutq+offxyq*2+16]
+%if ARCH_X86_32
+ movu m0, [grain_lutq+r0*2+16]
+%else
+ movu m0, [grain_lutq+top_offxyq*2+16]
+%endif
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4 ; {top/cur interleaved}
+ REPX {pmaddwd x, m2}, m1, m0
+%if %1
+ REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
+%else
+ REPX {paddd x, m14}, m1, m0
+%endif
+ REPX {psrad x, 5}, m1, m0
+ packssdw m4, m0, m1
+ pmaxsw m4, m8
+ pminsw m4, m9
+
+ ; src
+ mova m0, [srcq]
+ mova m1, [srcq+16]
+
+ ; luma_src
+ pxor mzero, mzero
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
+
+ mov lumaq, r9mp
+%endif
+ mova m6, [lumaq+ 0]
+ mova m5, [lumaq+(16<<%2)]
+%if %2
+ phaddw m6, [lumaq+16]
+ phaddw m5, [lumaq+48]
+%endif
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+%if %2
+ pavgw m6, mzero
+ pavgw m5, mzero
+%endif
+
+%if %1
+ punpckhwd m7, m6, m0
+ punpcklwd m6, m0
+ REPX {pmaddwd x, m14}, m7, m6
+ REPX {psrad x, 6}, m7, m6
+ packssdw m6, m7
+ punpckhwd m7, m5, m1
+ punpcklwd m5, m1 ; { luma, chroma }
+ REPX {pmaddwd x, m14}, m7, m5
+ REPX {psrad x, 6}, m7, m5
+ packssdw m5, m7
+ pxor mzero, mzero
+ REPX {paddw x, m15}, m6, m5
+ REPX {pmaxsw x, mzero}, m6, m5
+ REPX {pminsw x, m10}, m6, m5 ; clip_pixel()
+%else
+ REPX {pand x, m10}, m6, m5
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1
+%else
+%if %3 == 0
+ ; register shortage :)
+ push r12
+%endif
+ vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1
+ vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1
+%if %3 == 0
+ pop r12
+%endif
+%endif
+ REPX {psrlw x, 8}, m7, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ REPX {pmullw x, m11}, m7, m6
+ pmulhrsw m3, m7
+ pmulhrsw m4, m6
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ movifnidn dstq, dstmp
+ mova [dstq+ 0], m0
+ mova [dstq+16], m1
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ add dstq, r2mp
+ mov dstmp, dstq
+%else
+ add srcq, r13mp
+ add dstq, r13mp
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*2
+ dec hw
+%if %3
+ jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+ btc hd, 16
+ jc %%loop_y_h_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4]
+ jmp %%loop_y_hv_overlap
+%%end_y_hv_overlap:
+%endif
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+%else
+ mov srcq, r10mp
+%endif
+ mov dstq, r11mp
+ mov lumaq, r12mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+%if ARCH_X86_32
+ mov dstmp, dstq
+ mov r9mp, lumaq
+ mov r4m, wq
+%endif
+%if %2
+ jmp %%loop_x_hv_overlap
+%else
+ or dword r8m, 4
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxy += 16
+%endif
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+ RET
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm
new file mode 100644
index 0000000000..55445cf593
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm
@@ -0,0 +1,2107 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
+gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+; note: the order of (some of) the following constants matters
+pb_27_17: times 2 db 27, 17
+byte_blend: db 0, 0, 0, -1
+pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
+pb_17_27: times 2 db 17, 27
+pb_1: times 4 db 1
+pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32
+next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+fg_min: times 4 db 0
+ times 4 db 16
+fg_max: times 4 db 255
+ times 4 db 240
+ times 4 db 235
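+; fg_min/fg_max are the output clip bounds: 0..255 for full range,
+; 16..235 (luma) and 16..240 (chroma) for restricted range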
+pd_m65536: dd -65536
+pw_8: times 2 dw 8
+pw_1024: times 2 dw 1024
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+pw_1: dw 1
+
+%macro JMP_TABLE 2-*
+ %1_8bpc_%2_table:
+ %xdefine %%base %1_8bpc_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %rep %0 - 2
+ dd %%prefix %+ .ar%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
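+; each table entry is the offset of the matching .ar<N> entry point relative
+; to the table itself; ar_coeff_lag indexes the table and the generator jumps
+; to table base + offset after the initial grain pass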
+
+JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
+%define base r4-generate_grain_y_8bpc_avx2_table
+ lea r4, [generate_grain_y_8bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movq xm4, [base+mul_bits]
+ movq xm5, [base+hmul_bits]
+ mov r7, -73*82
+ mova xm6, [base+pb_mask]
+ sub bufq, r7
+ vpbroadcastw xm7, [base+round+r6*2]
+ lea r6, [gaussian_sequence]
+ movsxd r5, [r4+r5*4]
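+ ; each iteration advances the seed 8 times (two SIMD batches of 4), indexes
+ ; gaussian_sequence with the top 11 bits of every updated seed, applies the
+ ; grain_scale_shift rounding via pmulhrsw, and stores 8 packed int8 values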
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm3, xm2 ; 4 next output seeds
+ pshuflw xm0, xm3, q3333
+ psrlw xm3, 5
+ pand xm2, xm0, xm1
+ movq r2, xm3
+ psrlw xm3, xm2, 10
+ por xm2, xm3
+ pmullw xm2, xm4
+ pmulhuw xm0, xm5
+ movzx r3d, r2w
+ pshufb xm3, xm6, xm2
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm0, xm2
+ movd xm2, [r6+r3*2]
+ rorx r3, r2, 32
+ por xm3, xm0
+ shr r2d, 16
+ pinsrw xm2, [r6+r2*2], 1
+ pshuflw xm0, xm3, q3333
+ movzx r2d, r3w
+ psrlw xm3, 5
+ pinsrw xm2, [r6+r2*2], 2
+ shr r3d, 16
+ movq r2, xm3
+ pinsrw xm2, [r6+r3*2], 3
+ movzx r3d, r2w
+ pinsrw xm2, [r6+r3*2], 4
+ rorx r3, r2, 32
+ shr r2d, 16
+ pinsrw xm2, [r6+r2*2], 5
+ movzx r2d, r3w
+ pinsrw xm2, [r6+r2*2], 6
+ shr r3d, 16
+ pinsrw xm2, [r6+r3*2], 7
+ pmulhrsw xm2, xm7
+ packsswb xm2, xm2
+ movq [bufq+r7], xm2
+ add r7, 8
+ jl .loop
+
+ ; auto-regression code
+ add r5, r4
+ jmp r5
+
+.ar1:
+ DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd xm5, [fg_dataq+FGData.ar_coeffs_y]
+ mova xm2, [base+gen_shufC]
+ DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+ pinsrb xm5, [base+pb_1], 3
+ vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
+ pmovsxbw xm5, xm5
+ pshufd xm4, xm5, q0000
+ pshufd xm5, xm5, q1111
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+ mov mind, -128
+ mov maxd, 127
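+ ; the cf3*left term is a serial dependency: the three top-row taps (plus
+ ; rounding) are computed 4 pixels at a time with SIMD, while the left
+ ; neighbor is folded in one pixel per .x_loop_ar1_inner iteration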
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ pmovsxbw xm1, [bufq+xq-82-3]
+ pshufb xm0, xm1, xm2
+ punpckhwd xm1, xm3
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ movsx val0d, byte [bufq+xq]
+ sarx val3d, val3d, shiftd
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xb, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+.x_loop_ar1_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 168
+ SUB rsp, stack_size_padded
+ movaps [rsp+16*2], xmm8
+ movaps [rsp+16*3], xmm9
+ movaps [rsp+16*4], xmm10
+ movaps [rsp+16*5], xmm11
+ movaps [rsp+16*6], xmm12
+ movaps [rsp+16*7], xmm13
+ movaps [rsp+16*8], xmm14
+ movaps [rsp+16*9], xmm15
+%endif
+ DEFINE_ARGS buf, fg_data, h, x
+ mov r6d, [fg_dataq+FGData.ar_coeff_shift]
+ pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
+ movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
+ vpbroadcastd xm10, [base+round_vals-14+r6*2]
+ movd xm11, [base+byte_blend+1]
+ pmovsxbw xm9, xm9
+ pshufd xm4, xm7, q0000
+ mova xm12, [base+gen_shufA]
+ pshufd xm5, xm7, q3333
+ mova xm13, [base+gen_shufB]
+ pshufd xm6, xm7, q1111
+ mova xm14, [base+gen_shufC]
+ pshufd xm7, xm7, q2222
+ mova xm15, [base+gen_shufD]
+ pshufd xm8, xm9, q0000
+ psrld xm10, 16
+ pshufd xm9, xm9, q1111
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+.x_loop_ar2:
+ pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pshufb xm2, xm0, xm12
+ pmaddwd xm2, xm4
+ pshufb xm3, xm1, xm13
+ pmaddwd xm3, xm5
+ paddd xm2, xm3
+ pshufb xm3, xm0, xm14
+ pmaddwd xm3, xm6
+ punpckhqdq xm0, xm0
+ punpcklwd xm0, xm1
+ pmaddwd xm0, xm7
+ pshufb xm1, xm15
+ pmaddwd xm1, xm8
+ paddd xm2, xm10
+ paddd xm2, xm3
+ paddd xm0, xm1
+ paddd xm2, xm0
+ movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pmovsxbw xm1, xm0
+ pmaddwd xm3, xm9, xm1
+ psrldq xm1, 4 ; y=0,x=0
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ paddw xm3, xm1
+ packsswb xm3, xm3
+ pextrb [bufq+xq], xm3, 0
+ pslldq xm3, 2
+ vpblendvb xm0, xm3, xm11
+ psrldq xm0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+.x_loop_ar2_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+INIT_YMM avx2
+.ar3:
+%if WIN64
+ ; xmm6 and xmm7 already saved
+ %assign stack_offset 16
+ ALLOC_STACK 16*14
+ %assign stack_size stack_size - 16*4
+ %assign xmm_regs_used 12
+ movaps [rsp+16*12], xmm8
+ movaps [rsp+16*13], xmm9
+ movaps [rsp+16*14], xmm10
+ movaps [rsp+16*15], xmm11
+%else
+ ALLOC_STACK 16*12
+%endif
+ mov r6d, [fg_dataq+FGData.ar_coeff_shift]
+ movd xm11, [base+byte_blend]
+ pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pshufd m0, m1, q0000
+ mova [rsp+16* 0], m0
+ pshufd m0, m1, q1111
+ mova [rsp+16* 2], m0
+ pshufd m0, m1, q2222
+ mova [rsp+16* 4], m0
+ pshufd m1, m1, q3333
+ mova [rsp+16* 6], m1
+ pshufd xm0, xm2, q0000
+ mova [rsp+16* 8], xm0
+ pshufd xm0, xm2, q1111
+ mova [rsp+16* 9], xm0
+ psrldq xm7, xm2, 10
+ mova m8, [base+gen_shufA]
+ pinsrw xm2, [base+pw_1], 5
+ mova m9, [base+gen_shufC]
+ pshufd xm2, xm2, q2222
+ movu m10, [base+gen_shufE]
+ vpbroadcastw xm6, [base+round_vals-12+r6*2]
+ pinsrw xm7, [base+round_vals+r6*2-10], 3
+ mova [rsp+16*10], xm2
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+.x_loop_ar3:
+ movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
+ movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ punpcklbw m3, m5, m5
+ punpckhwd m5, m4
+ psraw m3, 8
+ punpcklbw m5, m5
+ psraw m5, 8
+ punpcklbw xm4, xm4
+ psraw xm4, 8
+ pshufb m0, m3, m8
+ pmaddwd m0, [rsp+16*0]
+ pshufb m1, m3, m9
+ pmaddwd m1, [rsp+16*2]
+ shufps m2, m3, m5, q1032
+ paddd m0, m1
+ pshufb m1, m2, m8
+ vperm2i128 m3, m4, 0x21
+ pmaddwd m1, [rsp+16*4]
+ shufps xm2, xm3, q1021
+ vpblendd m2, m3, 0xf0
+ pshufb m2, m10
+ paddd m0, m1
+ pmaddwd m2, [rsp+16*6]
+ pshufb xm1, xm4, xm9
+ pmaddwd xm1, [rsp+16*8]
+ shufps xm4, xm5, q1132
+ paddd m0, m2
+ pshufb xm2, xm4, xm8
+ pshufd xm4, xm4, q2121
+ pmaddwd xm2, [rsp+16*9]
+ punpcklwd xm4, xm6
+ pmaddwd xm4, [rsp+16*10]
+ vextracti128 xm3, m0, 1
+ paddd xm0, xm1
+ movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ paddd xm2, xm4
+ paddd xm0, xm2
+ paddd xm0, xm3
+.x_loop_ar3_inner:
+ pmovsxbw xm2, xm1
+ pmaddwd xm2, xm7
+ pshufd xm3, xm2, q1111
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ packsswb xm2, xm2
+ pextrb [bufq+xq], xm2, 0
+ pslldq xm2, 3
+ vpblendvb xm1, xm2, xm11
+ psrldq xm1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar3
+ RET
+
+%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
+INIT_XMM avx2
+cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
+%define base r4-generate_grain_uv_%1_8bpc_avx2_table
+ lea r4, [generate_grain_uv_%1_8bpc_avx2_table]
+ vpbroadcastw xm0, [fg_dataq+FGData.seed]
+ mov r6d, [fg_dataq+FGData.grain_scale_shift]
+ movq xm1, [base+next_upperbit_mask]
+ movq xm4, [base+mul_bits]
+ movq xm5, [base+hmul_bits]
+ mova xm6, [base+pb_mask]
+ vpbroadcastw xm7, [base+round+r6*2]
+ vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
+ pxor xm0, xm2
+ lea r6, [gaussian_sequence]
+%if %2
+ mov r7d, 73-35*%3
+ add bufq, 44
+.loop_y:
+ mov r5, -44
+%else
+ mov r5, -73*82
+ sub bufq, r5
+%endif
+.loop:
+ pand xm2, xm0, xm1
+ psrlw xm3, xm2, 10
+ por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw xm2, xm4 ; bits 0x0f00 are set
+ pmulhuw xm0, xm5
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
+ psllq xm2, xm3, 30
+ por xm2, xm3
+ psllq xm3, xm2, 15
+ por xm2, xm0 ; aggregate each bit into next seed's high bit
+ por xm2, xm3 ; 4 next output seeds
+ pshuflw xm0, xm2, q3333
+ psrlw xm2, 5
+ movq r8, xm2
+ movzx r9d, r8w
+ movd xm2, [r6+r9*2]
+ rorx r9, r8, 32
+ shr r8d, 16
+ pinsrw xm2, [r6+r8*2], 1
+ movzx r8d, r9w
+ pinsrw xm2, [r6+r8*2], 2
+ shr r9d, 16
+ pinsrw xm2, [r6+r9*2], 3
+ pmulhrsw xm2, xm7
+ packsswb xm2, xm2
+ movd [bufq+r5], xm2
+ add r5, 4
+ jl .loop
+%if %2
+ add bufq, 82
+ dec r7d
+ jg .loop_y
+%endif
+
+ ; auto-regression code
+ movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
+ add r6, r4
+ jmp r6
+
+INIT_YMM avx2
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ movd xm3, [base+hmul_bits+shiftq*2]
+ DEFINE_ARGS buf, bufy, h
+ pmovsxbw xm2, xm2
+%if %2
+ vpbroadcastd m7, [base+pb_1]
+ vpbroadcastw m6, [base+hmul_bits+2+%3*2]
+%endif
+ vpbroadcastw m2, xm2
+ vpbroadcastw m3, xm3
+ pxor m12, m12
+%if %2
+ sub bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+ sub bufq, 82*70-3
+%endif
+ add bufyq, 3+82*3
+ mov hd, 70-35*%3
+.y_loop_ar0:
+%if %2
+ ; first 32 pixels
+ movu xm4, [bufyq]
+ vinserti128 m4, [bufyq+32], 1
+%if %3
+ movu xm0, [bufyq+82]
+ vinserti128 m0, [bufyq+82+32], 1
+%endif
+ movu xm5, [bufyq+16]
+ vinserti128 m5, [bufyq+48], 1
+%if %3
+ movu xm1, [bufyq+82+16]
+ vinserti128 m1, [bufyq+82+48], 1
+%endif
+ pmaddubsw m4, m7, m4
+%if %3
+ pmaddubsw m0, m7, m0
+%endif
+ pmaddubsw m5, m7, m5
+%if %3
+ pmaddubsw m1, m7, m1
+ paddw m4, m0
+ paddw m5, m1
+%endif
+ pmulhrsw m4, m6
+ pmulhrsw m5, m6
+%else
+ xor r3d, r3d
+ ; first 32x2 pixels
+.x_loop_ar0:
+ movu m4, [bufyq+r3]
+ pcmpgtb m0, m12, m4
+ punpckhbw m5, m4, m0
+ punpcklbw m4, m0
+%endif
+ pmullw m4, m2
+ pmullw m5, m2
+ pmulhrsw m4, m3
+ pmulhrsw m5, m3
+%if %2
+ movu m1, [bufq]
+%else
+ movu m1, [bufq+r3]
+%endif
+ pcmpgtb m8, m12, m1
+ punpcklbw m0, m1, m8
+ punpckhbw m1, m8
+ paddw m0, m4
+ paddw m1, m5
+ packsswb m0, m1
+%if %2
+ movu [bufq], m0
+%else
+ movu [bufq+r3], m0
+ add r3d, 32
+ cmp r3d, 64
+ jl .x_loop_ar0
+%endif
+
+ ; last 6/12 pixels
+ movu xm4, [bufyq+32*2]
+%if %2
+%if %3
+ movu xm5, [bufyq+32*2+82]
+%endif
+ pmaddubsw xm4, xm7, xm4
+%if %3
+ pmaddubsw xm5, xm7, xm5
+ paddw xm4, xm5
+%endif
+ movq xm0, [bufq+32]
+ pmulhrsw xm4, xm6
+ pmullw xm4, xm2
+ pmulhrsw xm4, xm3
+ pcmpgtb xm5, xm12, xm0
+ punpcklbw xm5, xm0, xm5
+ paddw xm4, xm5
+ packsswb xm4, xm4
+ pblendw xm0, xm4, xm0, 1000b
+ movq [bufq+32], xm0
+%else
+ movu xm0, [bufq+64]
+ pcmpgtb xm1, xm12, xm4
+ punpckhbw xm5, xm4, xm1
+ punpcklbw xm4, xm1
+ pmullw xm5, xm2
+ pmullw xm4, xm2
+ vpblendd xm1, xm3, xm12, 0x0c
+ pmulhrsw xm5, xm1
+ pmulhrsw xm4, xm3
+ pcmpgtb xm1, xm12, xm0
+ punpckhbw xm8, xm0, xm1
+ punpcklbw xm0, xm1
+ paddw xm5, xm8
+ paddw xm0, xm4
+ packsswb xm0, xm5
+ movu [bufq+64], xm0
+%endif
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+INIT_XMM avx2
+.ar1:
+ DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
+ DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
+ pmovsxbw xm4, xm4
+ pshufd xm5, xm4, q1111
+ pshufd xm4, xm4, q0000
+ pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
+%if %2
+ vpbroadcastd xm7, [base+pb_1]
+ vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
+%endif
+ vpbroadcastd xm3, xm3
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ pmovsxbw xm0, [bufq+xq-82-1] ; top/left
+%if %2
+ movq xm8, [bufyq+xq*2]
+%if %3
+ movq xm9, [bufyq+xq*2+82]
+%endif
+%endif
+ psrldq xm2, xm0, 2 ; top
+ psrldq xm1, xm0, 4 ; top/right
+%if %2
+ pmaddubsw xm8, xm7, xm8
+%if %3
+ pmaddubsw xm9, xm7, xm9
+ paddw xm8, xm9
+%endif
+ pmulhrsw xm8, xm6
+%else
+ pmovsxbw xm8, [bufyq+xq]
+%endif
+ punpcklwd xm0, xm2
+ punpcklwd xm1, xm8
+ pmaddwd xm0, xm4
+ pmaddwd xm1, xm5
+ paddd xm0, xm1
+ paddd xm0, xm3
+.x_loop_ar1_inner:
+ movd val0d, xm0
+ psrldq xm0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sarx val3d, val3d, shiftd
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar1
+ RET
+
+.ar2:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ vpbroadcastw xm13, [base+round_vals-12+shiftq*2]
+ pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
+ pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
+ pinsrw xm0, [base+pw_1], 5
+%if %2
+ vpbroadcastw xm12, [base+hmul_bits+2+%3*2]
+ vpbroadcastd xm11, [base+pb_1]
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ pshufd xm4, xm7, q0000
+ pshufd xm5, xm7, q3333
+ pshufd xm6, xm7, q1111
+ pshufd xm7, xm7, q2222
+ pshufd xm8, xm0, q0000
+ pshufd xm9, xm0, q1111
+ pshufd xm10, xm0, q2222
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pshufb xm2, xm0, [base+gen_shufA]
+ pmaddwd xm2, xm4
+ pshufb xm3, xm1, [base+gen_shufB]
+ pmaddwd xm3, xm5
+ paddd xm2, xm3
+ pshufb xm3, xm0, [base+gen_shufC]
+ pmaddwd xm3, xm6
+ punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5]
+ punpcklwd xm0, xm1
+ pmaddwd xm0, xm7
+ pshufb xm1, [gen_shufD]
+ pmaddwd xm1, xm8
+ paddd xm2, xm3
+ paddd xm0, xm1
+ paddd xm2, xm0
+
+%if %2
+ movq xm0, [bufyq+xq*2]
+%if %3
+ movq xm3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw xm0, xm11, xm0
+%if %3
+ pmaddubsw xm3, xm11, xm3
+ paddw xm0, xm3
+%endif
+ pmulhrsw xm0, xm12
+%else
+ pmovsxbw xm0, [bufyq+xq]
+%endif
+ punpcklwd xm0, xm13
+ pmaddwd xm0, xm10
+ paddd xm2, xm0
+
+ movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pmovsxbw xm0, xm0
+ pmaddwd xm3, xm0, xm9
+ psrldq xm0, 2
+ paddd xm3, xm2
+ psrldq xm2, 4 ; shift top to next pixel
+ psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
+ pslldq xm3, 2
+ paddw xm3, xm0
+ pblendw xm0, xm3, 00000010b
+ packsswb xm0, xm0
+ pextrb [bufq+xq], xm0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xb, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+INIT_YMM avx2
+.ar3:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
+ pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
+ vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
+ movd xm13, [base+round_vals-10+shiftq*2]
+ vpbroadcastd xm14, [base+round_vals-14+shiftq*2]
+ pshufd m6, m0, q0000
+ pshufd m7, m0, q1111
+ pshufd m8, m0, q2222
+ pshufd m9, m0, q3333
+ pshufd xm10, xm1, q0000
+ pshufd xm11, xm1, q1111
+ pshufhw xm12, xm1, q0000
+ psraw xm2, 8
+ palignr xm13, xm1, 10
+ punpckhwd xm12, xm2 ; interleave luma cf
+ psrld xm14, 16
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ vpbroadcastw xm15, [base+hmul_bits+2+%3*2]
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*70-(82-3)
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+.x_loop_ar3:
+ vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
+ vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ vpblendd m3, m1, 0x0f
+ pxor m0, m0
+ pcmpgtb m2, m0, m3
+ pcmpgtb m0, m4
+ punpcklbw m1, m3, m2
+ punpckhbw m3, m2
+ punpcklbw m2, m4, m0
+ punpckhbw xm4, xm0
+ pshufb m0, m1, [base+gen_shufA]
+ pmaddwd m0, m6
+ pshufb m5, m1, [base+gen_shufC]
+ pmaddwd m5, m7
+ shufps m1, m3, q1032
+ paddd m0, m5
+ pshufb m5, m1, [base+gen_shufA]
+ pmaddwd m5, m8
+ shufps xm1, xm3, q2121
+ vpblendd m1, m2, 0xf0
+ pshufb m1, [base+gen_shufE]
+ pmaddwd m1, m9
+ paddd m0, m5
+ pshufb xm3, xm2, [base+gen_shufC]
+ paddd m0, m1
+ pmaddwd xm3, xm10
+ palignr xm1, xm4, xm2, 2
+ punpckhwd xm1, xm2, xm1
+ pmaddwd xm1, xm11
+ palignr xm4, xm2, 12
+ paddd xm3, xm1
+%if %2
+ vpbroadcastd xm5, [base+pb_1]
+ movq xm1, [bufyq+xq*2]
+ pmaddubsw xm1, xm5, xm1
+%if %3
+ movq xm2, [bufyq+xq*2+82]
+ pmaddubsw xm5, xm2
+ paddw xm1, xm5
+%endif
+ pmulhrsw xm1, xm15
+%else
+ pmovsxbw xm1, [bufyq+xq]
+%endif
+ punpcklwd xm4, xm1
+ pmaddwd xm4, xm12
+ movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ vextracti128 xm2, m0, 1
+ paddd xm0, xm14
+ paddd xm3, xm4
+ paddd xm0, xm3
+ paddd xm0, xm2
+.x_loop_ar3_inner:
+ pmovsxbw xm1, xm1
+ pmaddwd xm2, xm13, xm1
+ pshuflw xm3, xm2, q1032
+ paddd xm2, xm0 ; add top
+ paddd xm2, xm3 ; left+cur
+ psrldq xm0, 4
+ psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
+ psrldq xm1, 2
+ ; don't packssdw, we only care about one value
+ punpckldq xm2, xm2
+ pblendw xm1, xm2, 0100b
+ packsswb xm1, xm1
+ pextrb [bufq+xq], xm1, 2
+ inc xq
+ jz .x_loop_ar3_end
+ test xb, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+.x_loop_ar3_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+INIT_YMM avx2
+cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, see, overlap
+%define base r9-pd_m65536
+ lea r9, [pd_m65536]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ vpbroadcastd m8, [base+pd_m65536]
+ vpbroadcastw m9, [base+mul_bits+r6*2-14]
+ vpbroadcastd m10, [base+fg_min+r7*4]
+ vpbroadcastd m11, [base+fg_max+r7*8]
+ vpbroadcastd m12, [base+pw_1024]
+ movq xm13, [base+pb_27_17_17_27]
+ test sbyd, sbyd
+ setnz r7b
+ pxor m7, m7
+ test r7b, overlapb
+ jnz .vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, overlap
+
+ mov hd, hm
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
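+ ; m8 = 0xffff0000: pandn/psrld split the two pixels held in each dword, the
+ ; dword gathers at scaling+idx-0/-2 land scaling[idx] in the low byte of the
+ ; low/high word, and pblendw 0xaa merges them; vpgatherdd clears its mask
+ ; register, hence the m6/m8 copies that keep the constant alive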
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m5, [grain_lutq+offxyq]
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
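+ ; pmaddubsw pairs each unsigned scaling byte with its signed grain byte (the
+ ; junk byte in each scaling word is multiplied by zero), and pmulhrsw by
+ ; mul_bits = 2^(15-shift) supplies the round2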
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y
+
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test overlapd, overlapd
+ jz .loop_x
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy
+
+ lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
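+ ; only the first two columns are blended with the previous block's grain,
+ ; using weights 27/17 and 17/27; the trailing 0/32 pairs of pb_27_17_17_27
+ ; pass the next two grain bytes through unchanged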
+ movu m5, [grain_lutq+offxyq]
+ movd xm4, [grain_lutq+left_offxyq]
+ punpcklbw xm4, xm5
+ pmaddubsw xm4, xm13, xm4
+ pmulhrsw xm4, xm12
+ packsswb xm4, xm4
+ vpblendd m4, m5, 0xfe
+ punpckhbw m5, m7
+ punpcklbw m4, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y_h_overlap
+
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne .loop_x_hv_overlap
+ jmp .loop_x_h_overlap
+
+.vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ unused, sby, see, overlap
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+.loop_x_v_overlap:
+ vpbroadcastd m14, [pb_27_17]
+
+ ; we assume from the block above that bits 8-15 of r7d are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_v_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+ punpcklbw m5, m4, m6
+ punpckhbw m4, m6
+ pmaddubsw m5, m14, m5
+ pmaddubsw m4, m14, m4
+ pmulhrsw m5, m12
+ pmulhrsw m4, m12
+ packsswb m5, m4
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hb
+ jz .end_y_v_overlap
+ vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
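+ ; (the first add of 0x80000000 merely sets the top bit of hd; the second
+ ; carries out and clears it again, so jnc re-runs the overlap loop for
+ ; exactly one extra line before falling through)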
+ add hd, 0x80000000
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+.end_y_v_overlap:
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+.loop_x_hv_overlap:
+ vpbroadcastd m14, [pb_27_17]
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+32]
+ lea left_offxyd, [offyq+32]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+.loop_y_hv_overlap:
+ ; src
+ mova m2, [srcq]
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+
+ ; scaling[src]
+ pandn m4, m8, m0
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, m0, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m1
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ pblendw m2, m4, 0xaa
+ psrld m4, m1, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m4-2], m6
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m6, [grain_lutq+offxyq]
+ movd xm7, [grain_lutq+left_offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+ movd xm5, [grain_lutq+topleft_offxyq]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw xm7, xm6
+ punpcklbw xm5, xm4
+ pmaddubsw xm7, xm13, xm7
+ pmaddubsw xm5, xm13, xm5
+ pmulhrsw xm7, xm12
+ pmulhrsw xm5, xm12
+ packsswb xm7, xm7
+ packsswb xm5, xm5
+ vpblendd m7, m6, 0xfe
+ vpblendd m5, m4, 0xfe
+ ; followed by v interpolation (top | cur -> cur)
+ punpckhbw m4, m6
+ punpcklbw m5, m7
+ pmaddubsw m4, m14, m4
+ pmaddubsw m5, m14, m5
+ pmulhrsw m4, m12
+ pmulhrsw m5, m12
+ pxor m7, m7
+ packsswb m5, m4
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+ mova [dstq+srcq], m0
+
+ add srcq, strideq
+ add grain_lutq, 82
+ dec hb
+ jz .end_y_hv_overlap
+ vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ add hd, 0x80000000
+ jnc .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+.end_y_hv_overlap:
+ add wq, 32
+ lea srcq, [src_bakq+wq]
+ jl .loop_x_hv_overlap
+.end:
+ RET
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, overlap, uv_pl, is_id
+%define base r11-pd_m65536
+ lea r11, [pd_m65536]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, is_idm
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ vpbroadcastd m8, [base+pd_m65536]
+ vpbroadcastw m9, [base+mul_bits+r6*2-14]
+ vpbroadcastd m10, [base+fg_min+r7*4]
+ shlx r7d, r7d, r9d
+ vpbroadcastd m11, [base+fg_max+r7*4]
+ vpbroadcastd m12, [base+pw_1024]
+ pxor m7, m7
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, sby, see, overlap, uv_pl
+%if %1
+ mov r6d, uv_plm
+ vpbroadcastd m0, [base+pw_8]
+ vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m0 ; uv_luma_mult, uv_mult
+%elif %2
+ vpbroadcastq m15, [base+pb_23_22]
+%else
+ vpbroadcastq xm15, [base+pb_27_17_17_27]
+%endif
+%if %3
+ vpbroadcastw m13, [base+pb_23_22]
+%elif %2
+ pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
+%endif
+ test r7b, overlapb
+ jnz %%vertical_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, unused5, lstride
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*(1+%2)]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
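+ ; i.e. in the non-csfl case the scaling index is
+ ; clip(((luma*uv_luma_mult + chroma*uv_mult) >> 6) + uv_offset), with
+ ; the packuswb above providing the clip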
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm5, [grain_lutq+offxyq+ 0]
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+%else
+ movu m5, [grain_lutq+offxyq]
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
+ jg %%loop_y
+
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+ test overlapd, overlapd
+ jz %%loop_x
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xEFF4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm5, [grain_lutq+offxyq+ 0]
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+ movd xm4, [grain_lutq+left_offxyq+ 0]
+ vinserti128 m4, [grain_lutq+left_offxyq+82], 1
+ punpcklbw m4, m5
+%if %1
+ vpbroadcastq m0, [pb_23_22]
+ pmaddubsw m4, m0, m4
+%else
+ pmaddubsw m4, m15, m4
+%endif
+ pmulhrsw m4, m12
+ packsswb m4, m4
+ vpblendd m4, m5, 0xee
+%else
+ movu m5, [grain_lutq+offxyq]
+ movd xm4, [grain_lutq+left_offxyq]
+ punpcklbw xm4, xm5
+%if %1
+ movq xm0, [pb_27_17_17_27]
+ pmaddubsw xm4, xm0, xm4
+%else
+ pmaddubsw xm4, xm15, xm4
+%endif
+ pmulhrsw xm4, xm12
+ packsswb xm4, xm4
+ vpblendd m4, m5, 0xfe
+%endif
+ punpckhbw m5, m7
+ punpcklbw m4, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82*(1+%2)
+ sub hb, 1+%2
+ jg %%loop_y_h_overlap
+
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+
+ ; r8m = sbym
+ cmp dword r8m, 0
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%vertical_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
+ sby, see, overlap, unused1, unused2, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused1, unused2, see, overlap, unused3, unused4, lstride
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq]
+ lea r13, [dstq+wq]
+ lea r14, [lumaq+wq*(1+%2)]
+ mov r11mp, r12
+ mov r12mp, r13
+ mov lstrideq, r10mp
+ neg wq
+
+%%loop_x_v_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ vpbroadcastd m13, [pb_27_17]
+%endif
+%%loop_y_v_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %3 == 0
+%if %2
+ movu xm0, [grain_lutq+offxyq]
+ vinserti128 m0, [grain_lutq+offxyq+82], 1
+ movu xm4, [grain_lutq+top_offxyq]
+ vinserti128 m4, [grain_lutq+top_offxyq+82], 1
+%else
+ movu m0, [grain_lutq+offxyq]
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpcklbw m5, m4, m0
+ punpckhbw m4, m0
+ pmaddubsw m5, m13, m5
+ pmaddubsw m4, m13, m4
+ pmulhrsw m5, m12
+ pmulhrsw m4, m12
+ packsswb m5, m4
+%else
+ movq xm4, [grain_lutq+offxyq]
+ vinserti128 m4, [grain_lutq+offxyq+8], 1
+ movq xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+8], 1
+ punpcklbw m5, m4
+ pmaddubsw m5, m13, m5
+ pmulhrsw m5, m12
+ vextracti128 xm4, m5, 1
+ packsswb xm5, xm4
+ ; only interpolate first line, insert second line unmodified
+ vinserti128 m5, [grain_lutq+offxyq+82], 1
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma_source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+ sub hb, 1+%2
+ jle %%end_y_v_overlap
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+%if %2 == 0
+ vpbroadcastd m13, [pb_17_27]
+ add hd, 0x80000000
+ jnc %%loop_y_v_overlap
+%endif
+ jmp %%loop_y
+
+%%end_y_v_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+ ; back to .loop_x_v_overlap, and instead always fall-through to
+ ; h+v overlap
+
+%%loop_x_hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ lea topleft_offxyd, [top_offxyq+(32>>%2)]
+ lea left_offxyd, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if %2 == 0
+ vpbroadcastd m13, [pb_27_17]
+%endif
+%%loop_y_hv_overlap:
+ ; src
+%if %2
+ mova xm3, [lumaq+lstrideq*0+ 0]
+ vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
+ vpbroadcastd m2, [pb_1]
+ mova xm0, [lumaq+lstrideq*0+16]
+ vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
+ mova xm1, [srcq]
+ vinserti128 m1, [srcq+strideq], 1
+ pmaddubsw m3, m2
+ pmaddubsw m0, m2
+ pavgw m3, m7
+ pavgw m0, m7
+%else
+ mova m2, [lumaq]
+ mova m1, [srcq]
+%endif
+%if %1
+%if %2
+ packuswb m2, m3, m0 ; luma
+%endif
+ punpckhbw m3, m2, m1
+ punpcklbw m2, m1 ; { luma, chroma }
+ pmaddubsw m3, m14
+ pmaddubsw m2, m14
+ psraw m3, 6
+ psraw m2, 6
+ paddw m3, m15
+ paddw m2, m15
+ packuswb m2, m3 ; pack+unpack = clip
+%endif
+%if %1 || %2 == 0
+ punpcklbw m3, m2, m7
+ punpckhbw m0, m2, m7
+%endif
+
+ ; scaling[luma_src]
+ pandn m4, m8, m3
+ mova m6, m8
+ vpgatherdd m2, [scalingq+m4-0], m8
+ psrld m3, 16
+ mova m8, m6
+ vpgatherdd m4, [scalingq+m3-2], m6
+ pandn m5, m8, m0
+ mova m6, m8
+ vpgatherdd m3, [scalingq+m5-0], m8
+ psrld m0, 16
+ mova m8, m6
+ vpgatherdd m5, [scalingq+m0-2], m6
+ pblendw m2, m4, 0xaa
+ pblendw m3, m5, 0xaa
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movu xm4, [grain_lutq+offxyq]
+ vinserti128 m4, [grain_lutq+offxyq+82], 1
+ movd xm0, [grain_lutq+left_offxyq]
+ vinserti128 m0, [grain_lutq+left_offxyq+82], 1
+ movd xm6, [grain_lutq+topleft_offxyq]
+%if %3
+ movq xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+8], 1
+%else
+ vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1
+ movu xm5, [grain_lutq+top_offxyq]
+ vinserti128 m5, [grain_lutq+top_offxyq+82], 1
+%endif
+
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m0, m4
+%if %3
+ punpcklbw xm6, xm5
+%else
+ punpcklbw m6, m5
+%endif
+ punpcklqdq m0, m6
+%if %1
+ vpbroadcastq m6, [pb_23_22]
+ pmaddubsw m0, m6, m0
+%else
+ pmaddubsw m0, m15, m0
+%endif
+ pmulhrsw m0, m12
+ packsswb m0, m0
+ vpblendd m4, m0, 0x11
+%if %3
+ pshuflw xm0, xm0, q1032
+ vpblendd m5, m0, 0x01
+%else
+ pshuflw m0, m0, q1032
+ vpblendd m5, m0, 0x11
+%endif
+%else
+ movu m4, [grain_lutq+offxyq]
+ movd xm0, [grain_lutq+left_offxyq]
+ movu m5, [grain_lutq+top_offxyq]
+ movd xm6, [grain_lutq+topleft_offxyq]
+ punpcklbw xm0, xm4
+ punpcklbw xm6, xm5
+ punpcklqdq xm0, xm6
+%if %1
+ vpbroadcastq xm6, [pb_27_17_17_27]
+ pmaddubsw xm0, xm6, xm0
+%else
+ pmaddubsw xm0, xm15, xm0
+%endif
+ pmulhrsw xm0, xm12
+ packsswb xm0, xm0
+ vpblendd m4, m0, 0x01
+ pshuflw xm0, xm0, q1032
+ vpblendd m5, m0, 0x01
+%endif
+
+ ; followed by v interpolation (top | cur -> cur)
+%if %3
+ vpermq m0, m4, q3120
+ punpcklbw m5, m0
+ pmaddubsw m5, m13, m5
+ pmulhrsw m5, m12
+ vextracti128 xm0, m5, 1
+ packsswb xm5, xm0
+ vpblendd m5, m4, 0xf0
+%else
+ punpckhbw m0, m5, m4
+ punpcklbw m5, m4
+ pmaddubsw m4, m13, m0
+ pmaddubsw m5, m13, m5
+ pmulhrsw m4, m12
+ pmulhrsw m5, m12
+ packsswb m5, m4
+%endif
+ punpcklbw m4, m5, m7
+ punpckhbw m5, m7
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmaddubsw m2, m4
+ pmaddubsw m3, m5
+ pmulhrsw m2, m9
+ pmulhrsw m3, m9
+
+ ; unpack chroma source
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m7
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ pmaxub m0, m10
+ pminub m0, m11
+%if %2
+ mova [dstq], xm0
+ vextracti128 [dstq+strideq], m0, 1
+%else
+ mova [dstq], m0
+%endif
+
+%if %2
+ lea srcq, [srcq+strideq*2]
+ lea dstq, [dstq+strideq*2]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+ add srcq, strideq
+ add dstq, strideq
+ add lumaq, lstrideq
+%endif
+ add grain_lutq, 82<<%2
+ sub hb, 1+%2
+%if %2
+ jg %%loop_y_h_overlap
+%else
+ je %%end_y_hv_overlap
+ vpbroadcastd m13, [pb_17_27]
+ add hd, 0x80000000
+ jnc %%loop_y_hv_overlap
+ jmp %%loop_y_h_overlap
+%endif
+
+%%end_y_hv_overlap:
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r14+wq*(1+%2)]
+ add srcq, wq
+ add dstq, wq
+ jmp %%loop_x_hv_overlap
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+GEN_GRAIN_UV_FN 420, 1, 1
+FGUV_FN 420, 1, 1
+GEN_GRAIN_UV_FN 422, 1, 0
+FGUV_FN 422, 1, 0
+GEN_GRAIN_UV_FN 444, 0, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain_avx512.asm b/third_party/dav1d/src/x86/filmgrain_avx512.asm
new file mode 100644
index 0000000000..317ec118b3
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_avx512.asm
@@ -0,0 +1,813 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+ db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+ db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7
+pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
+pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32
+pb_27_17: times 2 db 27, 17
+pb_23_22: times 2 db 23, 22
+pw_8: times 2 dw 8
+pw_1024: times 2 dw 1024
+pb_17_27: times 2 db 17, 27
+fg_max: times 4 db 255
+ times 4 db 240
+ times 4 db 235
+fg_min: times 4 db 0
+ times 4 db 16
+noise_rnd: times 2 dw 128
+ times 2 dw 64
+ times 2 dw 32
+ times 2 dw 16
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, see, overlap
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+ mov r12, 0x0000000f0000000f ; h_overlap mask
+ mova m0, [scalingq+64*0]
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m12, [base+pb_17_27]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32]
+ test sbyd, sbyd
+ setnz r6b
+ vpbroadcastd m7, [base+fg_min+r7*4]
+ vpbroadcastd m8, [base+fg_max+r7*8]
+ pxor m5, m5
+ vpbroadcastd m9, [base+pw_1024]
+ vpbroadcastq m10, [base+pb_27_17_17_27]
+ vmovdqa64 m12{k1}, m16
+ test r6b, overlapb
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ movu ym21, [grain_lutq+offxyq-82]
+ vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1
+ call .add_noise
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test overlapd, overlapd
+ jz .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy
+
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd ; previous column's offy*stride
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offxd, [offyq+offxq*2+829] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu ym20, [grain_lutq+offxyq-82]
+ vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1
+ movd xm19, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m19, [grain_lutq+left_offxyq+32], 2
+ punpcklbw m19, m20
+ pmaddubsw m19, m10, m19
+ pmulhrsw m19, m9
+ punpckhbw m21, m20, m5
+ packsswb m20{k1}, m19, m19
+ punpcklbw m20, m5, m20
+ call .add_noise_h
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
+ h, sby, see, overlap
+
+ movzx r6d, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, r6d, 173 * 0x00010001
+ imul r6d, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add r6d, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and r6d, 0xff00ff00
+ xor seed, r7d
+ xor seed, r6d ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, overlap
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstq, srcq
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, overlap, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym19, [grain_lutq+offxyq-82]
+ vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ punpckhbw m20, m21, m19
+ punpcklbw m21, m19
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to .v_overlap, and instead always fall-through to h+v overlap
+.hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov topleft_offxyd, top_offxyd
+ rorx offyd, seed, 8
+ mov left_offxyd, offxd
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offxd, [offyq+offxq*2+0x10001*829+32*82]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
+ h, sby, see, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+ movu ym19, [grain_lutq+offxyq-82]
+ vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
+ movd xm16, [grain_lutq+left_offxyq-50]
+ vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2
+ movu ym21, [grain_lutq+top_offxyq-82]
+ vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
+ movd xm17, [grain_lutq+topleft_offxyq-50]
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m19
+ pmaddubsw m16, m10, m16
+ punpcklbw m17, m21
+ pmaddubsw m17, m10, m17
+ punpckhbw m20, m21, m19
+ pmulhrsw m16, m9
+ pmulhrsw m17, m9
+ packsswb m19{k1}, m16, m16
+ packsswb m21{k1}, m17, m17
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m21, m19
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq]
+ jl .hv_overlap
+.end:
+ RET
+ALIGN function_align
+.add_noise_v:
+ pmaddubsw m20, m12, m20
+ pmaddubsw m21, m12, m21
+ pmulhrsw m20, m9
+ pmulhrsw m21, m9
+ packsswb m21, m20
+.add_noise:
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+.add_noise_h:
+ mova ym18, [srcq+strideq*0]
+ vinserti32x8 m18, [srcq+strideq*1], 1
+ mova m19, m0
+ punpcklbw m16, m18, m5
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpmovb2m k2, m18
+ punpckhbw m17, m18, m5
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+ mova [dstq+srcq], ym16
+ add srcq, strideq
+ vextracti32x8 [dstq+srcq], m16, 1
+ add srcq, strideq
+ ret
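+
+ ; Unlike the AVX2 code above, which gathers scaling[] per pixel with
+ ; vpgatherdd, this path keeps the full 256-byte scaling LUT resident in
+ ; m0-m3 and indexes it with vpermt2b/vpermi2b, using the sign bit of the
+ ; source byte (vpmovb2m) to choose between the low and high 128 entries.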
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
+ scaling, grain_lut, h, sby, luma, \
+ overlap, uv_pl, is_id, _, stride3
+ lea r11, [fg_min]
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r9d, is_idm
+ mov sbyd, sbym
+ mov overlapd, [fg_dataq+FGData.overlap_flag]
+%if %2
+ mov r12, 0x000f000f000f000f ; h_overlap mask
+ vpbroadcastq m10, [base+pb_23_22_0_32]
+ lea stride3q, [strideq*3]
+%else
+ mov r12, 0x0000000f0000000f
+ vpbroadcastq m10, [base+pb_27_17_17_27]
+%endif
+ mova m0, [scalingq+64*0]
+ mova m1, [scalingq+64*1]
+ mova m2, [scalingq+64*2]
+ mova m3, [scalingq+64*3]
+ kmovq k1, r12
+ vbroadcasti32x4 m4, [base+interleave_hl]
+ vpbroadcastd m6, [base+noise_rnd+r6*4-32]
+ vpbroadcastd m7, [base+fg_min+r7*4]
+ shlx r7d, r7d, r9d
+ vpbroadcastd m8, [base+fg_max+r7*4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m9, [base+pw_1024]
+ mova m11, [base+pb_even]
+ mova m12, [base+pb_odd]
+ pxor m5, m5
+ mov r5, r10mp ; lstride
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ h, sby, see, overlap, uv_pl, _, _, stride3
+%if %1
+ mov r6d, uv_plm
+ vpbroadcastd m16, [base+pw_8]
+ vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
+ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4]
+ pshufb m14, m16 ; uv_luma_mult, uv_mult
+%endif
+ test r7b, overlapb
+ jnz %%v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, _, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq]
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1+%2)]
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+%if %2
+ movu xm21, [grain_lutq+offxyq+82*0]
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+%endif
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ test overlapd, overlapd
+ jz %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, _, _, _, stride3
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, _, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+%if %2
+ movu xm20, [grain_lutq+offxyq +82*0]
+ movd xm19, [grain_lutq+left_offxyq+82*0]
+ vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1
+ vinserti32x4 ym19, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82*3], 3
+%else
+ movu ym20, [grain_lutq+offxyq + 0]
+ movd xm19, [grain_lutq+left_offxyq+ 0]
+ vinserti32x8 m20, [grain_lutq+offxyq +82], 1
+ vinserti32x4 m19, [grain_lutq+left_offxyq+82], 2
+%endif
+ punpcklbw m19, m20
+ pmaddubsw m19, m10, m19
+ punpckhbw m21, m20, m5
+ pmulhrsw m19, m9
+ vpacksswb m20{k1}, m19, m19
+ punpcklbw m20, m5, m20
+ call %%add_noise_h
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
+ _, sby, see, overlap, _, _, _, stride3
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+%if %3
+ vpbroadcastd m13, [base+pb_23_22]
+ kxnorw k3, k3, k3 ; v_overlap mask
+%elif %2
+ vbroadcasti32x8 m13, [base+pb_27_17]
+ kxnord k3, k3, k3
+ pshufd m13, m13, q0000 ; 8x27_17, 8x17_27
+%else
+ vpbroadcastd ym16, [base+pb_27_17]
+ vpbroadcastd m13, [base+pb_17_27]
+ vmovdqa64 m13{k1}, m16
+%endif
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, overlap, top_offxy, _, _, stride3
+
+ mov lumaq, r9mp
+ lea r11, [srcq+wq]
+ lea r12, [dstq+wq]
+ lea r13, [lumaq+wq*(1<<%2)]
+ mov r11mp, r11
+ mov r12mp, r12
+ neg wq
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, overlap, top_offxy, _, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %3
+ movu xm18, [grain_lutq+offxyq+82*0]
+ movu xm20, [grain_lutq+top_offxyq+82*0]
+ ; only interpolate the first line, insert the remaining lines unmodified
+ vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ punpcklbw xm19, xm20, xm18
+ punpckhbw xm20, xm18
+%elif %2
+ movu xm18, [grain_lutq+offxyq+82*0]
+ vinserti128 ym18, [grain_lutq+offxyq+82*1], 1
+ movu xm20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x4 ym20, [grain_lutq+top_offxyq+82*1], 1
+ vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ punpcklbw ym19, ym20, ym18
+ punpckhbw ym20, ym18
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+ movu ym20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+
+%%hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ lea topleft_offxyd, [top_offxyq+(32>>%2)]
+ lea left_offxyd, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0x000f000f
+ and offxd, 0x000f000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %2
+ movu xm21, [grain_lutq+offxyq+82*0]
+ movd xm16, [grain_lutq+left_offxyq+82*0]
+ vinserti128 ym21, [grain_lutq+offxyq+82*1], 1
+ vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1
+ vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2
+ vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3
+ movd xm18, [grain_lutq+topleft_offxyq+82*0]
+ movu xm20, [grain_lutq+top_offxyq]
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m16, m21
+%if %3
+ punpcklbw xm18, xm20
+%else
+ vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1
+ vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1
+ punpcklbw ym18, ym20
+%endif
+ punpcklqdq m16, m18
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16
+ vmovdqu8 m21{k1}, m16
+%if %3
+ vpalignr xm20{k1}, xm16, xm16, 4
+ punpcklbw xm19, xm20, xm21
+ punpckhbw xm20, xm21
+%else
+ vpalignr ym20{k1}, ym16, ym16, 4
+ punpcklbw ym19, ym20, ym21
+ punpckhbw ym20, ym21
+%endif
+%else
+ movu ym21, [grain_lutq+offxyq+82*0]
+ vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1
+ movd xm16, [grain_lutq+left_offxyq+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2
+ movu ym20, [grain_lutq+top_offxyq+82*0]
+ vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1
+ movd xm18, [grain_lutq+topleft_offxyq+82*0]
+ vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2
+ punpcklbw m16, m21
+ punpcklbw m18, m20
+ punpcklqdq m16, m18
+ pmaddubsw m16, m10, m16
+ pmulhrsw m16, m9
+ packsswb m16, m16
+ vpalignr m20{k1}, m16, m16, 4
+ vmovdqu8 m21{k1}, m16
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r11mp
+ mov dstq, r12mp
+ lea lumaq, [r13+wq*(1<<%2)]
+ add srcq, wq
+ add dstq, wq
+ jmp %%hv_overlap
+ALIGN function_align
+%%add_noise_v:
+%if %3
+ pmaddubsw xm19, xm13, xm19
+ pmaddubsw xm20, xm13, xm20
+ pmulhrsw xm19, xm9
+ pmulhrsw xm20, xm9
+ vpacksswb m21{k3}, m19, m20
+%elif %2
+ pmaddubsw ym19, ym13, ym19
+ pmaddubsw ym20, ym13, ym20
+ pmulhrsw ym19, ym9
+ pmulhrsw ym20, ym9
+ vpacksswb m21{k3}, m19, m20
+%else
+ punpcklbw m19, m20, m21
+ punpckhbw m20, m21
+ pmaddubsw m19, m13, m19
+ pmaddubsw m20, m13, m20
+ pmulhrsw m19, m9
+ pmulhrsw m20, m9
+ packsswb m21, m19, m20
+%endif
+%%add_noise:
+ punpcklbw m20, m5, m21
+ punpckhbw m21, m5
+%%add_noise_h:
+ mova ym18, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1
+%if %2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova ym16, [lumaq+lstrideq*(0<<%3)]
+ vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1
+ mova xm17, [srcq+strideq*0]
+ mova m19, m11
+ vpermi2b m19, m18, m16
+ vinserti128 ym17, [srcq+strideq*1], 1
+ vpermt2b m18, m12, m16
+ vinserti32x4 m17, [srcq+strideq*2], 2
+ pavgb m18, m19
+ vinserti32x4 m17, [srcq+stride3q ], 3
+%else
+ mova ym17, [srcq+strideq*0]
+ vinserti32x8 m17, [srcq+strideq*1], 1
+%endif
+%if %1
+ punpckhbw m19, m18, m17
+ punpcklbw m18, m17 ; { luma, chroma }
+ pmaddubsw m19, m14
+ pmaddubsw m18, m14
+ psraw m19, 6
+ psraw m18, 6
+ paddw m19, m15
+ paddw m18, m15
+ packuswb m18, m19
+.add_noise_main:
+ mova m19, m0
+ vpermt2b m19, m18, m1 ; scaling[ 0..127]
+ vpmovb2m k2, m18
+ vpermi2b m18, m2, m3 ; scaling[128..255]
+ vmovdqu8 m19{k2}, m18 ; scaling[src]
+ pshufb m19, m4
+ pmaddubsw m18, m19, m20
+ pmaddubsw m19, m21
+ add grain_lutq, 82*2<<%2
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ lea srcq, [srcq+strideq*(2<<%2)]
+ pmulhrsw m18, m6 ; noise
+ pmulhrsw m19, m6
+ punpcklbw m16, m17, m5 ; chroma
+ punpckhbw m17, m5
+ paddw m16, m18
+ paddw m17, m19
+ packuswb m16, m17
+ pmaxub m16, m7
+ pminub m16, m8
+%if %2
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+stride3q ], m16, 3
+%else
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+%endif
+ lea dstq, [dstq+strideq*(2<<%2)]
+ ret
+%else
+ jmp .add_noise_main
+%endif
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/filmgrain_common.asm b/third_party/dav1d/src/x86/filmgrain_common.asm
new file mode 100644
index 0000000000..74f7044e66
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_common.asm
@@ -0,0 +1,46 @@
+; Copyright © 2019-2022, VideoLAN and dav1d authors
+; Copyright © 2019-2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+struc FGData
+ .seed: resd 1
+ .num_y_points: resd 1
+ .y_points: resb 14 * 2
+ .chroma_scaling_from_luma: resd 1
+ .num_uv_points: resd 2
+ .uv_points: resb 2 * 10 * 2
+ .scaling_shift: resd 1
+ .ar_coeff_lag: resd 1
+ .ar_coeffs_y: resb 24
+ .ar_coeffs_uv: resb 2 * 28 ; includes padding
+ .ar_coeff_shift: resq 1
+ .grain_scale_shift: resd 1
+ .uv_mult: resd 2
+ .uv_luma_mult: resd 2
+ .uv_offset: resd 2
+ .overlap_flag: resd 1
+ .clip_to_restricted_range: resd 1
+endstruc
+
+cextern gaussian_sequence
diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm
new file mode 100644
index 0000000000..0172f98760
--- /dev/null
+++ b/third_party/dav1d/src/x86/filmgrain_sse.asm
@@ -0,0 +1,3233 @@
+; Copyright © 2019-2021, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+SECTION_RODATA
+
+pw_1024: times 8 dw 1024
+pb_27_17_17_27: db 27, 17, 17, 27
+ times 6 db 0, 32
+pb_23_22_h: db 23, 22
+ times 7 db 0, 32
+pb_27_17: times 8 db 27, 17
+pb_17_27: times 8 db 17, 27
+pb_23_22: times 8 db 23, 22
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+ times 2 dw 0x49d8
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+max: dw 255, 240, 235
+min: dw 0, 16
+pw_1: dw 1
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_8bpc_%2_table %%table
+ %xdefine %%base %1_8bpc_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .ar%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3
+
+SECTION .text
+
+%if ARCH_X86_32
+%define PIC_ptr(a) base+a
+%else
+%define PIC_ptr(a) a
+%endif
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r2d, [fg_dataq+FGData.grain_scale_shift]
+ movd m2, [base+round+r2*2]
+ movd m0, [fg_dataq+FGData.seed]
+ mova m5, [base+pb_mask]
+ pshuflw m2, m2, q0000
+ pshuflw m0, m0, q0000
+ mov r2, -73*82
+ sub bufq, r2
+ lea r3, [base+gaussian_sequence]
+.loop:
+ pand m6, m0, m1
+ psrlw m3, m6, 10
+ por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m6, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m6 ; set 15th bit for next 4 seeds
+ psllq m6, m3, 30
+ por m3, m6
+ psllq m6, m3, 15
+ por m3, m6 ; aggregate each bit into next seed's high bit
+ pmulhuw m6, m0, m7
+ por m3, m6 ; 4 next output seeds
+ pshuflw m0, m3, q3333
+ psrlw m3, 5
+%if ARCH_X86_64
+ movq r6, m3
+ mov r8, r6
+ movzx r5d, r6w
+ shr r6d, 16
+ shr r8, 32
+ movzx r7, r8w
+ shr r8, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+ pinsrw m6, [r3+r7*2], 2
+ pinsrw m6, [r3+r8*2], 3
+%else
+ movd r6, m3
+ pshuflw m3, m3, q3232
+ movzx r5, r6w
+ shr r6, 16
+
+ movd m6, [r3+r5*2]
+ pinsrw m6, [r3+r6*2], 1
+
+ movd r6, m3
+ movzx r5, r6w
+ shr r6, 16
+
+ pinsrw m6, [r3+r5*2], 2
+ pinsrw m6, [r3+r6*2], 3
+%endif
+ pmulhrsw m6, m2
+ packsswb m6, m6
+ movd [bufq+r2], m6
+ add r2, 4
+ jl .loop
+
+ ; auto-regression code
+ movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4]
+ lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table]
+ jmp r2
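+ ; (dispatch on FGData.ar_coeff_lag: lag 0 needs no filtering, so .ar0
+ ; just returns; .ar1-.ar3 filter the white noise generated above in place)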
+
+.ar1:
+%if ARCH_X86_32
+ DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
+%elif WIN64
+ DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+%endif
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_y]
+ mov ecx, [fg_dataq+FGData.ar_coeff_shift]
+%if ARCH_X86_32
+ mov r1m, cf3d
+ DEFINE_ARGS buf, shift, val3, min, max, x, val0
+%define hd r0mp
+%define cf3d r1mp
+%elif WIN64
+ DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+%endif
+ pxor m6, m6
+ pcmpgtb m7, m6, m4
+ punpcklbw m4, m7
+ pinsrw m4, [base+pw_1], 3
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+ pshuflw m3, m3, q0000
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+ mov mind, -128
+ mov maxd, 127
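+ ; the loops below apply, per grain sample (cf = FGData.ar_coeffs_y):
+ ;   g[y][x] += round2(cf[0]*g[y-1][x-1] + cf[1]*g[y-1][x] +
+ ;                     cf[2]*g[y-1][x+1] + cf[3]*g[y][x-1], shift)
+ ; clamped to [-128, 127]; cf[3] is applied in scalar code since each
+ ; result feeds the next sample's left neighbor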
+.y_loop_ar1:
+ mov xq, -76
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+ movq m0, [bufq+xq-82-1] ; top/left
+ pcmpgtb m7, m6, m0
+ punpcklbw m0, m7
+ psrldq m2, m0, 2 ; top
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+ imul val3d, cf3d
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar1
+.ar0:
+ RET
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -16*8
+%endif
+ DEFINE_ARGS buf, fg_data, shift
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2]
+ movd m7, [base+byte_blend+1]
+ SCRATCH 7, 15, 7
+ movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
+ movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
+ pxor m7, m7
+ pshuflw m6, m6, q0000
+ punpcklwd m6, m7
+ pcmpgtb m4, m7, m0
+ pcmpgtb m5, m7, m1
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ DEFINE_ARGS buf, fg_data, h, x
+ pshufd m4, m1, q0000
+ pshufd m5, m1, q1111
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar2:
+ mov xq, -76
+
+.x_loop_ar2:
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m7, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m6, m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m6, m1
+ psrldq m5, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m5, m1
+ pmaddwd m4, m9
+ pmaddwd m6, m10
+ pmaddwd m5, m12
+ paddd m4, m6
+ paddd m2, m5
+ paddd m2, m4
+ paddd m2, m14
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+.x_loop_ar2_inner:
+ pcmpgtb m4, m7, m0
+ punpcklbw m1, m0, m4
+ pmaddwd m3, m1, m13
+ paddd m3, m2
+ psrldq m1, 4 ; y=0,x=0
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ paddw m3, m1
+ packsswb m3, m3
+ pslldq m3, 2
+ pand m3, m15
+ pandn m1, m15, m0
+ por m0, m1, m3
+ psrldq m0, 1
+ ; overwrite 2 pixels, but that's ok
+ movd [bufq+xq-1], m0
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+ DEFINE_ARGS buf, fg_data, shift
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+ ALLOC_STACK -16*14
+%elif WIN64
+ SUB rsp, 16*6
+%assign stack_size_padded (stack_size_padded+16*6)
+%assign stack_size (stack_size+16*6)
+%else
+ ALLOC_STACK -16*6
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m6, [base+round_vals-12+shiftq*2]
+ movd m7, [base+byte_blend]
+ movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
+ movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
+ pxor m3, m3
+ pcmpgtb m4, m3, m0
+ pcmpgtb m3, m2
+ pshuflw m6, m6, q0000
+ SCRATCH 6, 14, 12
+ SCRATCH 7, 15, 13
+ punpckhbw m1, m0, m4
+ punpcklbw m0, m4
+ punpcklbw m2, m3
+ pshufd m3, m0, q1111
+ pshufd m4, m0, q2222
+ pshufd m5, m0, q3333
+ pshufd m0, m0, q0000
+ mova [rsp+ 0*16], m0
+ mova [rsp+ 1*16], m3
+ mova [rsp+ 2*16], m4
+ mova [rsp+ 3*16], m5
+ pshufd m6, m1, q1111
+ pshufd m7, m1, q2222
+ pshufd m5, m1, q3333
+ pshufd m1, m1, q0000
+ pshufd m3, m2, q1111
+ psrldq m0, m2, 10
+ pinsrw m2, [base+pw_1], 5
+ pshufd m4, m2, q2222
+ pshufd m2, m2, q0000
+ pinsrw m0, [base+round_vals+shiftq*2-10], 3
+ mova [rsp+ 4*16], m1
+ mova [rsp+ 5*16], m6
+ SCRATCH 7, 8, 6
+ SCRATCH 5, 9, 7
+ SCRATCH 2, 10, 8
+ SCRATCH 3, 11, 9
+ SCRATCH 4, 12, 10
+ SCRATCH 0, 13, 11
+ DEFINE_ARGS buf, fg_data, h, x
+ sub bufq, 82*73-(82*3+79)
+ mov hd, 70
+.y_loop_ar3:
+ mov xq, -76
+
+.x_loop_ar3:
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m2, m0, m3
+ punpcklbw m0, m3
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ palignr m6, m2, m0, 10
+ palignr m7, m2, m0, 12
+ psrldq m0, 8
+ punpcklwd m0, m6
+ punpcklwd m7, m1
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m7, [rsp+ 3*16]
+ paddd m0, m7
+ paddd m0, m4
+
+ psrldq m4, m1, 2
+ psrldq m5, m1, 4
+ psrldq m6, m1, 6
+ psrldq m7, m1, 8
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 4*16]
+ pmaddwd m6, [rsp+ 5*16]
+ paddd m4, m6
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m5, m2, m7
+ punpcklbw m2, m7
+ palignr m7, m3, m1, 10
+ palignr m3, m1, 12
+ psrldq m1, m2, 2
+ punpcklwd m7, m3
+ punpcklwd m3, m2, m1
+ pmaddwd m7, m8
+ pmaddwd m3, m9
+ paddd m7, m3
+ paddd m0, m7
+
+ psrldq m6, m2, 4
+ psrldq m1, m2, 6
+ psrldq m3, m2, 8
+ palignr m4, m5, m2, 10
+ palignr m5, m5, m2, 12
+
+ punpcklwd m6, m1
+ punpcklwd m3, m4
+ punpcklwd m5, m14
+ pmaddwd m6, m10
+ pmaddwd m3, m11
+ pmaddwd m5, m12
+ paddd m0, m6
+ paddd m3, m5
+ paddd m0, m3
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+.x_loop_ar3_inner:
+ pxor m5, m5
+ pcmpgtb m5, m1
+ punpcklbw m2, m1, m5
+ pmaddwd m2, m13
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw since we only care about one value
+ packsswb m2, m2
+ pslldq m2, 3
+ pand m2, m15
+ pandn m3, m15, m1
+ por m1, m2, m3
+ movd [bufq+xq-3], m1
+ psrldq m1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82
+ dec hd
+ jg .y_loop_ar3
+ RET
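
The .ar1/.ar2/.ar3 paths above vectorize one and the same auto-regressive filter over the 73x82 8bpc luma grain buffer; the register comments (y=-2,x=[-2,+5] and so on) name the causal neighbourhood each pmaddwd consumes, and the final paddw/packsswb adds the shifted sum back onto the white-noise sample with int8 saturation. A scalar C sketch of that filter follows; it illustrates the arithmetic only and is not code lifted from dav1d, and helper names such as round2() and clip() are assumptions.

    #include <stdint.h>

    static int round2(int x, int shift) {
        return shift ? (x + (1 << (shift - 1))) >> shift : x;
    }

    static int clip(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* In-place AR filter over the 73x82 grain buffer: for every sample,
     * accumulate coeff * neighbour over all causal positions inside the
     * lag window, shift by ar_coeff_shift with rounding, add the result
     * to the white-noise sample and saturate to int8 (what packsswb
     * does in the SIMD code above). */
    static void ar_filter_luma(int8_t buf[73][82], const int8_t *coeffs,
                               int lag, int ar_coeff_shift)
    {
        for (int y = 3; y < 73; y++)
            for (int x = 3; x < 82 - 3; x++) {
                const int8_t *c = coeffs;
                int sum = 0;
                for (int dy = -lag; dy <= 0; dy++)
                    for (int dx = -lag; dx <= lag; dx++) {
                        if (!dy && !dx)
                            break;              /* reached the current sample */
                        sum += *c++ * buf[y + dy][x + dx];
                    }
                const int g = buf[y][x] + round2(sum, ar_coeff_shift);
                buf[y][x] = (int8_t)clip(g, -128, 127);
            }
    }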
+
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
+INIT_XMM ssse3
+cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
+ movifnidn r2, r2mp
+ movifnidn r3, r3mp
+ LEA r4, $$
+%define base r4-$$
+ movq m1, [base+rnd_next_upperbit_mask]
+ movq m4, [base+mul_bits]
+ movq m7, [base+hmul_bits]
+ mov r5d, [fg_dataq+FGData.grain_scale_shift]
+ movd m6, [base+round+r5*2]
+ mova m5, [base+pb_mask]
+ movd m0, [fg_dataq+FGData.seed]
+ movd m2, [base+pw_seed_xor+uvq*4]
+ pxor m0, m2
+ pshuflw m6, m6, q0000
+ pshuflw m0, m0, q0000
+ lea r6, [base+gaussian_sequence]
+%if %2
+%if ARCH_X86_64
+ mov r7d, 73-35*%3
+%else
+ mov r3mp, 73-35*%3
+%endif
+ add bufq, 44
+.loop_y:
+ mov r5, -44
+.loop_x:
+%else
+ mov r5, -82*73
+ sub bufq, r5
+.loop:
+%endif
+ pand m2, m0, m1
+ psrlw m3, m2, 10
+ por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+ pmullw m2, m4 ; bits 0x0f00 are set
+ pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
+ psllq m2, m3, 30
+ por m3, m2
+ psllq m2, m3, 15
+ por m3, m2 ; aggregate each bit into next seed's high bit
+ pmulhuw m2, m0, m7
+ por m2, m3 ; 4 next output seeds
+ pshuflw m0, m2, q3333
+ psrlw m2, 5
+%if ARCH_X86_64
+ movd r9d, m2
+ pshuflw m2, m2, q3232
+ movzx r8, r9w
+ shr r9, 16
+
+ movd m3, [r6+r8*2]
+ pinsrw m3, [r6+r9*2], 1
+
+ movd r9d, m2
+ movzx r8, r9w
+ shr r9, 16
+
+ pinsrw m3, [r6+r8*2], 2
+ pinsrw m3, [r6+r9*2], 3
+%else
+ movd r2, m2
+ pshuflw m2, m2, q3232
+ movzx r1, r2w
+ shr r2, 16
+
+ movd m3, [r6+r1*2]
+ pinsrw m3, [r6+r2*2], 1
+
+ movd r2, m2
+ movzx r1, r2w
+ shr r2, 16
+
+ pinsrw m3, [r6+r1*2], 2
+ pinsrw m3, [r6+r2*2], 3
+%endif
+ pmulhrsw m3, m6
+ packsswb m3, m3
+ movd [bufq+r5], m3
+ add r5, 4
+%if %2
+ jl .loop_x
+ add bufq, 82
+%if ARCH_X86_64
+ dec r7d
+%else
+ dec r3mp
+%endif
+ jg .loop_y
+%else
+ jl .loop
+%endif
+
+%if ARCH_X86_32
+ mov r2, r2mp
+%endif
+
+ ; auto-regression code
+ movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
+ movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4]
+ lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table]
+ jmp r5
+
+.ar0:
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+%assign stack_offset_old stack_offset
+ ALLOC_STACK -2*16
+%endif
+ imul uvd, 28
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
+ movd m4, [base+hmul_bits+shiftq*2]
+ DEFINE_ARGS buf, bufy, h, x
+ pxor m0, m0
+ pcmpgtb m0, m5
+ punpcklbw m5, m0
+ movd m7, [base+pb_1]
+%if %2
+ movd m6, [base+hmul_bits+2+%3*2]
+%endif
+ pshuflw m5, m5, q0000
+ pshuflw m4, m4, q0000
+ pshufd m7, m7, q0000
+%if %2
+ pshuflw m6, m6, q0000
+%endif
+ punpcklqdq m5, m5
+ punpcklqdq m4, m4
+%if %2
+ punpcklqdq m6, m6
+%endif
+ pcmpeqw m1, m1
+ pslldq m1, 12>>%2
+ SCRATCH 1, 8, 0
+ SCRATCH 4, 9, 1
+%if %2
+ sub bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+ sub bufq, 82*70-3
+%endif
+ add bufyq, 3+82*3
+ mov hd, 70-35*%3
+.y_loop_ar0:
+ xor xd, xd
+.x_loop_ar0:
+ ; first 32 pixels
+%if %2
+ movu m1, [bufyq+xq*2]
+%if %3
+ movu m2, [bufyq+xq*2+82]
+%endif
+ movu m3, [bufyq+xq*2+16]
+%if %3
+ movu m4, [bufyq+xq*2+82+16]
+%endif
+ pmaddubsw m0, m7, m1
+%if %3
+ pmaddubsw m1, m7, m2
+%endif
+ pmaddubsw m2, m7, m3
+%if %3
+ pmaddubsw m3, m7, m4
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ pmulhrsw m0, m6
+ pmulhrsw m2, m6
+%else
+ movu m0, [bufyq+xq]
+ pxor m6, m6
+ pcmpgtb m6, m0
+ punpckhbw m2, m0, m6
+ punpcklbw m0, m6
+%endif
+ pmullw m0, m5
+ pmullw m2, m5
+ pmulhrsw m0, m9
+ pmulhrsw m2, m9
+ movu m1, [bufq+xq]
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpckhbw m3, m1, m4
+%if %2
+ punpcklbw m1, m4
+ paddw m2, m3
+ paddw m0, m1
+%else
+ punpcklbw m6, m1, m4
+ paddw m2, m3
+ paddw m0, m6
+%endif
+ packsswb m0, m2
+%if %2
+ movu [bufq+xq], m0
+ add xd, 16
+ cmp xd, 32
+ jl .x_loop_ar0
+
+ ; last 6/12 pixels
+ movu m1, [bufyq+xq*(1+%2)]
+%if %3
+ movu m2, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m0, m7, m1
+%if %3
+ pmaddubsw m1, m7, m2
+ paddw m0, m1
+%endif
+ pmulhrsw m0, m6
+ pmullw m0, m5
+ pmulhrsw m0, m9
+ movq m1, [bufq+xq]
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpcklbw m2, m1, m4
+ paddw m0, m2
+ packsswb m0, m0
+ pandn m2, m8, m0
+ pand m1, m8
+ por m2, m1
+ movq [bufq+xq], m2
+%else
+ add xd, 16
+ cmp xd, 80
+ je .y_loop_final_ar0
+ movu [bufq+xq-16], m0
+ jmp .x_loop_ar0
+.y_loop_final_ar0:
+ pandn m2, m8, m0
+ pand m1, m8
+ por m2, m1
+ movu [bufq+xq-16], m2
+%endif
+
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar0
+ RET
+
+.ar1:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
+ imul uvd, 28
+ movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
+ movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
+ pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
+%if ARCH_X86_32
+ mov r3mp, cf3d
+ DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
+ mov bufq, r0
+%else
+ DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ movd m3, [base+round_vals+shiftq*2-12] ; rnd
+%if %2
+ movd m7, [base+pb_1]
+ movd m6, [base+hmul_bits+2+%3*2]
+%endif
+ psrldq m4, 1
+%if ARCH_X86_32
+ DEFINE_ARGS buf, shift, val0, val3, min, max, x
+%elif WIN64
+ DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
+%else
+ DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
+%endif
+ pxor m5, m5
+ punpcklwd m3, m5
+%if %2
+ punpcklwd m6, m6
+%endif
+ pcmpgtb m5, m4
+ punpcklbw m4, m5
+ pshufd m5, m4, q1111
+ pshufd m4, m4, q0000
+ pshufd m3, m3, q0000
+%if %2
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+%if ARCH_X86_32
+ add r1mp, 79+82*3
+ mov r0mp, 70-35*%3
+%else
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+%endif
+ mov mind, -128
+ mov maxd, 127
+.y_loop_ar1:
+ mov xq, -(76>>%2)
+ movsx val3d, byte [bufq+xq-1]
+.x_loop_ar1:
+%if %2
+%if ARCH_X86_32
+ mov r2, r1mp
+ movq m0, [r2+xq*2]
+%if %3
+ movq m1, [r2+xq*2+82]
+%endif
+%else
+ movq m0, [bufyq+xq*2]
+%if %3
+ movq m1, [bufyq+xq*2+82]
+%endif
+%endif
+ pmaddubsw m2, m7, m0
+%if %3
+ pmaddubsw m0, m7, m1
+ paddw m2, m0
+%endif
+ pmulhrsw m2, m6
+%else
+%if ARCH_X86_32
+ mov r2, r1mp
+ movd m2, [r2+xq]
+%else
+ movd m2, [bufyq+xq]
+%endif
+ pxor m0, m0
+ pcmpgtb m0, m2
+ punpcklbw m2, m0
+%endif
+
+ movq m0, [bufq+xq-82-1] ; top/left
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+ psrldq m1, m0, 4 ; top/right
+ punpcklwd m1, m2
+ psrldq m2, m0, 2 ; top
+ punpcklwd m0, m2
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ paddd m0, m1
+ paddd m0, m3
+.x_loop_ar1_inner:
+ movd val0d, m0
+ psrldq m0, 4
+%if ARCH_X86_32
+ imul val3d, r3mp
+%else
+ imul val3d, cf3d
+%endif
+ add val3d, val0d
+ sar val3d, shiftb
+ movsx val0d, byte [bufq+xq]
+ add val3d, val0d
+ cmp val3d, maxd
+ cmovns val3d, maxd
+ cmp val3d, mind
+ cmovs val3d, mind
+ mov byte [bufq+xq], val3b
+ ; keep val3d in-place as left for next x iteration
+ inc xq
+ jz .x_loop_ar1_end
+ test xq, 3
+ jnz .x_loop_ar1_inner
+ jmp .x_loop_ar1
+
+.x_loop_ar1_end:
+ add bufq, 82
+%if ARCH_X86_32
+ add r1mp, 82<<%3
+ dec r0mp
+%else
+ add bufyq, 82<<%3
+ dec hd
+%endif
+ jg .y_loop_ar1
+ RET
+
+.ar2:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+ ALLOC_STACK -8*16
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+ movd m7, [base+round_vals-12+shiftq*2]
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12
+ pxor m2, m2
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ pinsrw m1, [base+pw_1], 5
+ punpcklwd m7, m7
+ pshufd m7, m7, q0000
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+ pshufd m4, m1, q0000
+ pshufd m5, m1, q1111
+ pshufd m6, m1, q2222
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ SCRATCH 0, 8, 0
+ SCRATCH 1, 9, 1
+ SCRATCH 2, 10, 2
+ SCRATCH 3, 11, 3
+ SCRATCH 4, 12, 4
+ SCRATCH 5, 13, 5
+ SCRATCH 6, 14, 6
+ SCRATCH 7, 15, 7
+%if %2
+ movd m7, [base+hmul_bits+2+%3*2]
+ movd m6, [base+pb_1]
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000
+ pshufd m7, m7, q0000
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar2:
+ mov xq, -(76>>%2)
+
+.x_loop_ar2:
+ pxor m2, m2
+ movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
+ movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
+ pcmpgtb m2, m0
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2
+ psrldq m5, m0, 2 ; y=-2,x=[-1,+5]
+ psrldq m3, m1, 2 ; y=-1,x=[-1,+5]
+ psrldq m4, m1, 4 ; y=-1,x=[+0,+5]
+ punpcklwd m2, m0, m5
+ punpcklwd m3, m4
+ pmaddwd m2, m8
+ pmaddwd m3, m11
+ paddd m2, m3
+
+ psrldq m4, m0, 4 ; y=-2,x=[+0,+5]
+ psrldq m5, m0, 6 ; y=-2,x=[+1,+5]
+ psrldq m0, 8 ; y=-2,x=[+2,+5]
+ punpcklwd m4, m5
+ punpcklwd m0, m1
+ psrldq m3, m1, 6 ; y=-1,x=[+1,+5]
+ psrldq m1, m1, 8 ; y=-1,x=[+2,+5]
+ punpcklwd m3, m1
+ pmaddwd m4, m9
+ pmaddwd m0, m10
+ pmaddwd m3, m12
+ paddd m4, m0
+ paddd m2, m3
+ paddd m2, m4
+
+%if %2
+ movq m1, [bufyq+xq*2]
+%if %3
+ movq m3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m0, m6, m1
+%if %3
+ pmaddubsw m1, m6, m3
+ paddw m0, m1
+%endif
+ pmulhrsw m0, m7
+%else
+ movd m0, [bufyq+xq]
+ pxor m1, m1
+ pcmpgtb m1, m0
+ punpcklbw m0, m1
+%endif
+ punpcklwd m0, m15
+ pmaddwd m0, m14
+ paddd m2, m0
+
+ movq m0, [bufq+xq-2] ; y=0,x=[-2,+5]
+ pxor m4, m4
+ movd m5, [base+byte_blend+1]
+ punpcklbw m5, m5
+.x_loop_ar2_inner:
+ pcmpgtb m1, m4, m0
+ punpcklbw m0, m1
+ pmaddwd m3, m0, m13
+ paddd m3, m2
+ psrldq m2, 4 ; shift top to next pixel
+ psrad m3, [fg_dataq+FGData.ar_coeff_shift]
+ pslldq m3, 4
+ pand m3, m5
+ paddw m0, m3
+ packsswb m0, m0
+ movd [bufq+xq-2], m0
+ psrldq m0, 1
+ inc xq
+ jz .x_loop_ar2_end
+ test xq, 3
+ jnz .x_loop_ar2_inner
+ jmp .x_loop_ar2
+
+.x_loop_ar2_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar2
+ RET
+
+.ar3:
+%if ARCH_X86_32
+%assign stack_offset stack_offset_old
+%assign stack_size_padded 0
+%xdefine rstk rsp
+%endif
+ DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
+ movifnidn bufyq, bufymp
+%if ARCH_X86_32
+ ALLOC_STACK -15*16
+%else
+ SUB rsp, 16*7
+%assign stack_size_padded (stack_size_padded+16*7)
+%assign stack_size (stack_size+16*7)
+%endif
+ mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+ imul uvd, 28
+
+ movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
+ pxor m3, m3
+ pcmpgtb m3, m0
+ punpckhbw m1, m0, m3
+ punpcklbw m0, m3
+ pshufd m2, m0, q1111
+ pshufd m3, m0, q2222
+ pshufd m4, m0, q3333
+ pshufd m0, m0, q0000
+ pshufd m5, m1, q1111
+ pshufd m6, m1, q2222
+ pshufd m7, m1, q3333
+ pshufd m1, m1, q0000
+ mova [rsp+ 0*16], m0
+ mova [rsp+ 1*16], m2
+ mova [rsp+ 2*16], m3
+ mova [rsp+ 3*16], m4
+ mova [rsp+ 4*16], m1
+ mova [rsp+ 5*16], m5
+ mova [rsp+ 6*16], m6
+ SCRATCH 7, 8, 7
+
+ movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
+ pxor m4, m4
+ pcmpgtb m4, m2
+ punpckhbw m5, m2, m4
+ punpcklbw m2, m4
+ pshufd m4, m2, q3232
+ punpcklwd m3, m4, m5
+ pshuflw m5, m4, q3321
+ pshufd m4, m3, q0000
+ pshufd m3, m2, q1111
+ pshufd m2, m2, q0000
+ pinsrw m5, [base+round_vals+shiftq*2-10], 3
+ SCRATCH 2, 9, 8
+ SCRATCH 3, 10, 9
+ SCRATCH 4, 11, 10
+ SCRATCH 5, 12, 11
+
+ movd m2, [base+round_vals-12+shiftq*2]
+%if %2
+ movd m1, [base+pb_1]
+ movd m3, [base+hmul_bits+2+%3*2]
+%endif
+ pxor m0, m0
+ punpcklwd m2, m0
+%if %2
+ punpcklwd m3, m3
+%endif
+ pshufd m2, m2, q0000
+%if %2
+ pshufd m1, m1, q0000
+ pshufd m3, m3, q0000
+ SCRATCH 1, 13, 12
+%endif
+ SCRATCH 2, 14, 13
+%if %2
+ SCRATCH 3, 15, 14
+%endif
+
+ DEFINE_ARGS buf, bufy, fg_data, h, unused, x
+%if %2
+ sub bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+ sub bufq, 82*69+3
+%endif
+ add bufyq, 79+82*3
+ mov hd, 70-35*%3
+.y_loop_ar3:
+ mov xq, -(76>>%2)
+
+.x_loop_ar3:
+ movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
+ pxor m4, m4
+ pcmpgtb m4, m0
+ punpckhbw m3, m0, m4
+ punpcklbw m0, m4
+
+ psrldq m5, m0, 2
+ psrldq m6, m0, 4
+ psrldq m7, m0, 6
+ punpcklwd m4, m0, m5
+ punpcklwd m6, m7
+ pmaddwd m4, [rsp+ 0*16]
+ pmaddwd m6, [rsp+ 1*16]
+ paddd m4, m6
+
+ palignr m2, m3, m0, 10
+ palignr m3, m0, 12
+ psrldq m0, 8
+
+ movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
+ pxor m6, m6
+ pcmpgtb m6, m1
+ punpckhbw m5, m1, m6
+ punpcklbw m1, m6
+
+ punpcklwd m0, m2
+ punpcklwd m3, m1
+ pmaddwd m0, [rsp+ 2*16]
+ pmaddwd m3, [rsp+ 3*16]
+ paddd m0, m3
+ paddd m0, m4
+
+ movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
+ pxor m7, m7
+ pcmpgtb m7, m2
+ punpckhbw m6, m2, m7
+ punpcklbw m2, m7
+
+ palignr m3, m5, m1, 10
+ palignr m5, m1, 12
+ psrldq m4, m2, 2
+
+ punpcklwd m3, m5
+ punpcklwd m5, m2, m4
+ pmaddwd m3, [rsp+ 6*16]
+ pmaddwd m5, m8
+ paddd m3, m5
+ paddd m0, m3
+
+ psrldq m3, m1, 2
+ psrldq m4, m1, 4
+ psrldq m5, m1, 6
+ psrldq m1, 8
+
+ punpcklwd m3, m4
+ punpcklwd m5, m1
+ pmaddwd m3, [rsp+ 4*16]
+ pmaddwd m5, [rsp+ 5*16]
+ paddd m3, m5
+ paddd m0, m3
+
+%if %2
+ movq m1, [bufyq+xq*2]
+%if %3
+ movq m3, [bufyq+xq*2+82]
+%endif
+ pmaddubsw m7, m13, m1
+%if %3
+ pmaddubsw m5, m13, m3
+ paddw m7, m5
+%endif
+ pmulhrsw m7, m15
+%else
+ movd m7, [bufyq+xq]
+ pxor m1, m1
+ pcmpgtb m1, m7
+ punpcklbw m7, m1
+%endif
+
+ psrldq m1, m2, 4
+ psrldq m3, m2, 6
+ palignr m4, m6, m2, 10
+ palignr m6, m2, 12
+ psrldq m2, 8
+
+ punpcklwd m1, m3
+ punpcklwd m2, m4
+ punpcklwd m6, m7
+ pmaddwd m1, m9
+ pmaddwd m2, m10
+ pmaddwd m6, m11
+ paddd m1, m2
+ paddd m0, m6
+ paddd m0, m1
+ paddd m0, m14
+
+ movq m1, [bufq+xq-3] ; y=0,x=[-3,+4]
+ pxor m4, m4
+ movd m5, [base+byte_blend]
+.x_loop_ar3_inner:
+ pcmpgtb m2, m4, m1
+ punpcklbw m3, m1, m2
+ pmaddwd m2, m3, m12
+ pshufd m3, m2, q1111
+ paddd m2, m3 ; left+cur
+ paddd m2, m0 ; add top
+ psrldq m0, 4
+ psrad m2, [fg_dataq+FGData.ar_coeff_shift]
+ ; don't packssdw, we only care about one value
+ packsswb m2, m2
+ pandn m3, m5, m1
+ pslld m2, 24
+ pand m2, m5
+ por m1, m2, m3
+ movd [bufq+xq-3], m1
+ psrldq m1, 1
+ inc xq
+ jz .x_loop_ar3_end
+ test xq, 3
+ jnz .x_loop_ar3_inner
+ jmp .x_loop_ar3
+
+.x_loop_ar3_end:
+ add bufq, 82
+ add bufyq, 82<<%3
+ dec hd
+ jg .y_loop_ar3
+ RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
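
The generate_grain_uv_* variants above first fill the (possibly subsampled) chroma buffer with white noise, XOR-ing the frame seed with a per-plane constant (pw_seed_xor) and advancing four 16-bit LFSR states per iteration (the comments around the pshufb/psllq sequence describe the aggregation; each noise sample is then a gaussian_sequence entry scaled down via the round table according to grain_scale_shift). They then run the same AR filters as the luma path, extended by one coefficient that takes the co-located, averaged luma grain as input. A hedged scalar sketch of those two pieces is below; function and variable names are illustrative, and the 3-sample border bookkeeping of the grain buffers is glossed over.

    #include <stdint.h>

    /* 16-bit LFSR used for the grain white noise; the SIMD code above
     * steps four of these states in parallel. */
    static int get_random_number(unsigned *state, int bits)
    {
        const unsigned r   = *state;
        const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        *state = (r >> 1) | (bit << 15);
        return (*state >> (16 - bits)) & ((1 << bits) - 1);
    }

    static int round2(int x, int shift) {
        return shift ? (x + (1 << (shift - 1))) >> shift : x;
    }

    /* Extra term of the chroma AR sum: the last ar_coeffs_uv entry times
     * the rounded average of the 1, 2 or 4 co-located luma grain samples
     * (what the pmaddubsw/pavgw pair computes for subsampled layouts).
     * luma_y/luma_x are already-scaled luma coordinates. */
    static int chroma_luma_term(const int8_t luma[73][82], int luma_y,
                                int luma_x, int sub_x, int sub_y, int coeff)
    {
        int sum = 0;
        for (int dy = 0; dy <= sub_y; dy++)
            for (int dx = 0; dx <= sub_x; dx++)
                sum += luma[luma_y + dy][luma_x + dx];
        return coeff * round2(sum, sub_x + sub_y);
    }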
+
+%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
+%assign %%idx 0
+%define %%tmp %2
+%if %0 == 6
+%define %%tmp %6
+%endif
+%rep 4
+%if %%idx == 0
+ movd %5 %+ d, %2
+ pshuflw %%tmp, %2, q3232
+%else
+ movd %5 %+ d, %%tmp
+%if %%idx == 2
+ punpckhqdq %%tmp, %%tmp
+%elif %%idx == 4
+ psrlq %%tmp, 32
+%endif
+%endif
+ movzx %4 %+ d, %5 %+ w
+ shr %5 %+ d, 16
+
+%if %%idx == 0
+ movd %1, [%3+%4]
+%else
+ pinsrw %1, [%3+%4], %%idx + 0
+%endif
+ pinsrw %1, [%3+%5], %%idx + 1
+%assign %%idx %%idx+2
+%endrep
+%endmacro
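
vpgatherdw emulates a word gather on SSSE3: it moves the eight 16-bit indices of the source register out through general-purpose registers and re-inserts one 16-bit load per lane. The film-grain loops below pass scalingq-1 as the base and shift each lane right by 8 afterwards, which turns the word load into a byte LUT lookup. A scalar sketch of that pattern follows (little-endian assumed; names are illustrative, and the real code relies on the byte just before the table being readable).

    #include <stdint.h>
    #include <string.h>

    /* dst[i] = 16-bit load at base + idx[i], i.e. a software gather */
    static void gather8_words(uint16_t dst[8], const uint16_t idx[8],
                              const uint8_t *base)
    {
        for (int i = 0; i < 8; i++) {
            uint16_t w;
            memcpy(&w, base + idx[i], sizeof(w));   /* unaligned load */
            dst[i] = w;
        }
    }

    /* scaling[] lookup as used by the film-grain loops: gather from
     * scaling - 1 so the wanted byte lands in the high half, then >> 8 */
    static void scaling_lookup(uint16_t out[8], const uint16_t px[8],
                               const uint8_t *scaling /* 256-entry LUT */)
    {
        gather8_words(out, px, scaling - 1);
        for (int i = 0; i < 8; i++)
            out[i] >>= 8;                           /* == scaling[px[i]] */
    }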
+
+INIT_XMM ssse3
+; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+ ; copy stack arguments to new position post-alignment, so that we
+ ; don't have to keep the old stack location in a separate register
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov r5, r8m
+
+ mov [rsp+5*mmsize+ 4*gprsize], r0
+ mov [rsp+5*mmsize+ 6*gprsize], r1
+ mov [rsp+5*mmsize+ 8*gprsize], r2
+ mov [rsp+5*mmsize+10*gprsize], r3
+ mov [rsp+5*mmsize+11*gprsize], r4
+ mov [rsp+5*mmsize+12*gprsize], r5
+%else
+cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
+ dst, src, scaling, unused1, fg_data, picptr, unused2
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m
+ mov scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+5*mmsize+ 4*gprsize]
+%define r1m [rsp+5*mmsize+ 5*gprsize]
+%define r2m [rsp+5*mmsize+ 6*gprsize]
+%define r3m [rsp+5*mmsize+ 7*gprsize]
+%define r4m [rsp+5*mmsize+ 8*gprsize]
+%define r5m [rsp+5*mmsize+ 9*gprsize]
+%define r6m [rsp+5*mmsize+10*gprsize]
+%define r7m [rsp+5*mmsize+11*gprsize]
+%define r8m [rsp+5*mmsize+12*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, picptrq
+%else
+cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
+ lea r7, [pb_mask]
+%define base r7-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ movd m4, [base+max+r6*4]
+ movd m5, [base+min+r6*2]
+ punpcklwd m3, m3
+ punpcklwd m4, m4
+ punpcklwd m5, m5
+ pshufd m3, m3, q0000
+ pshufd m4, m4, q0000
+ pshufd m5, m5, q0000
+ SCRATCH 3, 11, 0
+ SCRATCH 4, 12, 1
+ SCRATCH 5, 13, 2
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz .no_vertical_overlap
+ mova m6, [base+pw_1024]
+ mova m7, [base+pb_27_17_17_27]
+ SCRATCH 6, 14, 3
+ SCRATCH 7, 15, 4
+ test sbyd, sbyd
+ jnz .vertical_overlap
+ ; fall-through
+
+.no_vertical_overlap:
+ mov r8m, overlapd
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ unused1, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+%endif
+
+.loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused
+%endif
+
+.loop_x_odd:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+ pcmpgtb m7, m2, m3
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ btc dword r8m, 2
+ jc .next_blk
+
+ add offxyd, 16
+ test dword r8m, 2 ; r8m & 2 = have_top_overlap
+ jz .loop_x_odd
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jnz .loop_x_odd_v_overlap
+
+.next_blk:
+ test dword r8m, 1
+ jz .loop_x
+
+ test dword r8m, 2
+ jnz .loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+%if ARCH_X86_32
+ ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
+ ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
+ DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
+
+ add offxyd, 16 ; left_offxyd
+ mov [rsp+5*mmsize+0*gprsize], offxyd
+
+ DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy
+
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%endif
+
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyq, [offyq+offxq*2+747] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_h_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+0*gprsize]
+ movd m7, [grain_lutq+r5]
+%else
+ movd m7, [grain_lutq+left_offxyq]
+%endif
+ punpcklbw m7, m3
+ pmaddubsw m6, m15, m7
+ pmulhrsw m6, m14
+ packsswb m6, m6
+ shufps m6, m3, q3210
+ pcmpgtb m2, m6
+ punpcklbw m7, m6, m2
+ punpckhbw m6, m2
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m7, m4
+ pmullw m6, m5
+ pmulhrsw m7, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m7
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hd
+ jg .loop_y_h_overlap
+
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ xor dword r8m, 4
+ add offxyd, 16
+
+ ; since this half-block had left-overlap, the next does not
+ test dword r8m, 2 ; have_top_overlap
+ jz .loop_x_odd
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add r11d, 16 ; top_offxyd
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end:
+ RET
+
+.vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+%endif
+
+ lea src_bakq, [srcq+wq]
+ neg wq
+ sub dstmp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r4m, wq
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%endif
+
+.loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed,
+ ; because of the 'and tmpd, 0x00ff00ff' above
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, unused, top_offxy
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, unused, top_offxy
+%endif
+
+ movzx top_offxyd, offxyw
+%if ARCH_X86_32
+ mov [rsp+5*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+ shr offxyd, 16
+
+.loop_x_odd_v_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+12], r5
+%else
+ mova m8, [pb_27_17]
+%endif
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_v_overlap:
+ ; src
+ mova m0, [srcq]
+ pxor m2, m2
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m4, m0, scalingq-1, r0, r5, m3
+ vpgatherdw m5, m1, scalingq-1, r0, r5, m3
+%else
+ vpgatherdw m4, m0, scalingq-1, r12, r13, m3
+ vpgatherdw m5, m1, scalingq-1, r12, r13, m3
+%endif
+ REPX {psrlw x, 8}, m4, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+1*gprsize]
+ movu m7, [grain_lutq+r5]
+%else
+ movu m7, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m6, m7, m3
+ punpcklbw m7, m3
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+12]
+ pmaddubsw m3, [r5], m6
+ pmaddubsw m6, [r5], m7
+%else
+ pmaddubsw m3, m8, m6
+ pmaddubsw m6, m8, m7
+%endif
+ pmulhrsw m3, m14
+ pmulhrsw m6, m14
+ packsswb m6, m3
+ pcmpgtb m7, m2, m6
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m4
+ pmullw m6, m5
+ pmulhrsw m2, m11
+ pmulhrsw m6, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m6
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+12], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_v_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ btc hd, 16
+ jnc .loop_y_v_overlap
+ jmp .loop_y
+
+.end_y_v_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ add srcq, r4mp
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ btc dword r8m, 2
+ jc .loop_x_hv_overlap
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.loop_x_hv_overlap:
+%if ARCH_X86_32
+ mov r5, r5m
+ lea r5, [base+pb_27_17]
+ mov [rsp+5*mmsize+12], r5
+
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
+
+ mov r5, [rsp+5*mmsize+1*gprsize]
+ mov r4, offxyd
+ add r5, 16
+ add r4, 16
+ mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
+ mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
+
+ xor tmpd, tmpd
+ mov seed, r3m
+%else
+ mova m8, [pb_27_17]
+
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ tmp, unused2, see, unused3
+
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offyq+16]
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*2+0x10001*747+32*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+
+ movzx r5, offxyw ; top_offxy
+ mov [rsp+5*mmsize+1*gprsize], r5
+%else
+ DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy
+
+ movzx top_offxyd, offxyw
+%endif
+ shr offxyd, 16
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+.loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
+ mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
+ movu m6, [grain_lutq+r5]
+ mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
+ movd m4, [grain_lutq+r0]
+ movd m7, [grain_lutq+r5]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+ movd m7, [grain_lutq+topleft_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m4, m3
+ punpcklbw m7, m6
+ pmaddubsw m2, m15, m4
+ pmaddubsw m4, m15, m7
+ pmulhrsw m2, m14
+ pmulhrsw m4, m14
+ packsswb m2, m2
+ packsswb m4, m4
+ shufps m2, m3, q3210
+ shufps m4, m6, q3210
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m3, m4, m2
+ punpckhbw m4, m2
+%if ARCH_X86_32
+ mov r5, [rsp+5*mmsize+12]
+ pmaddubsw m7, [r5], m4
+ pmaddubsw m4, [r5], m3
+%else
+ pmaddubsw m7, m8, m4
+ pmaddubsw m4, m8, m3
+%endif
+ pmulhrsw m7, m14
+ pmulhrsw m4, m14
+ packsswb m4, m7
+ pxor m2, m2
+ pcmpgtb m7, m2, m4
+ punpcklbw m3, m4, m7
+ punpckhbw m4, m7
+
+ ; src
+ mova m0, [srcq]
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; scaling[src]
+%if ARCH_X86_32
+ vpgatherdw m5, m0, scalingq-1, r0, r5, m7
+ vpgatherdw m6, m1, scalingq-1, r0, r5, m7
+%else
+ vpgatherdw m5, m0, scalingq-1, r13, r14, m7
+ vpgatherdw m6, m1, scalingq-1, r13, r14, m7
+%endif
+ REPX {psrlw x, 8}, m5, m6
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m3, m5
+ pmullw m4, m6
+ pmulhrsw m3, m11
+ pmulhrsw m4, m11
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m3
+ paddw m1, m4
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+12], mmsize
+%else
+ mova m8, [pb_17_27]
+%endif
+ add srcq, r2mp
+ add grain_lutq, 82
+ dec hw
+ jz .end_y_hv_overlap
+ ; 2 lines get vertical overlap, then fall back to non-overlap code for
+ ; remaining (up to) 30 lines
+ btc hd, 16
+ jnc .loop_y_hv_overlap
+ jmp .loop_y_h_overlap
+
+.end_y_hv_overlap:
+%if ARCH_X86_32
+ add r4mp, 16
+%else
+ add wq, 16
+%endif
+ jge .end_hv
+%if ARCH_X86_32
+ mov srcq, r1m
+ add srcq, r4m
+%else
+ lea srcq, [src_bakq+wq]
+%endif
+ xor dword r8m, 4
+ add offxyd, 16
+%if ARCH_X86_32
+ add dword [rsp+5*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ jmp .loop_x_odd_v_overlap
+
+.end_hv:
+ RET
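
Per pixel, the loops above compute noise = round2(scaling[src] * grain, scaling_shift) and store clip(src + noise), clamping either to the full 0..255 range or to the restricted-range limits selected by clip_to_restricted_range; blocks with horizontal or vertical overlap first blend the overlapping grain columns or rows of the previous and current 32x32 grain windows with the (27,17)/(17,27) weight pairs (pb_27_17_17_27, pw_1024) before scaling. A scalar sketch with illustrative names, including the grain_lut offset that the lea with +747 computes:

    #include <stdint.h>

    static int round2(int x, int shift) {
        return (x + (1 << (shift - 1))) >> shift;
    }

    static int clip(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* one row of a luma 32x32 block: scale the grain by the source-driven
     * LUT value, add it to the source, clamp to the pixel range */
    static void fgy_row(uint8_t *dst, const uint8_t *src, int w,
                        const uint8_t scaling[256], const int8_t *grain,
                        int scaling_shift, int clip_to_restricted)
    {
        const int mn = clip_to_restricted ? 16 : 0;
        const int mx = clip_to_restricted ? 235 : 255;
        for (int x = 0; x < w; x++) {
            const int noise = round2(scaling[src[x]] * grain[x], scaling_shift);
            dst[x] = (uint8_t)clip(src[x] + noise, mn, mx);
        }
    }

    /* overlap blend of an old and a new grain sample before scaling;
     * the weight pairs are (27,17)/(17,27) for the 2-sample luma overlap */
    static int8_t blend_grain(int old_g, int new_g, int w_old, int w_new)
    {
        return (int8_t)clip(round2(old_g * w_old + new_g * w_new, 5), -128, 127);
    }

    /* grain_lut offset for one luma block: two 4-bit fields of the
     * per-block random value give offx/offy; the lea ...+offxq*2+747
     * above is exactly (9 + 2*offy) * 82 + (9 + 2*offx) */
    static int grain_lut_offset(int offx4, int offy4)
    {
        return (9 + 2 * offy4) * 82 + (9 + 2 * offx4);
    }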
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+INIT_XMM ssse3
+%if ARCH_X86_32
+; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
+; sby, luma, lstride, uv_pl, is_id)
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
+cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
+ tmp, src, scaling, h, fg_data, picptr, unused
+ mov r0, r0m
+ mov r1, r2m
+ mov r2, r4m
+ mov r3, r6m
+ mov r4, r7m
+ mov [rsp+7*mmsize+3*gprsize], r0
+ mov [rsp+7*mmsize+5*gprsize], r1
+ mov [rsp+7*mmsize+7*gprsize], r2
+ mov [rsp+7*mmsize+9*gprsize], r3
+ mov [rsp+7*mmsize+10*gprsize], r4
+
+ mov r0, r8m
+ mov r1, r9m
+ mov r2, r10m
+ mov r4, r11m
+ mov r3, r12m
+ mov [rsp+7*mmsize+11*gprsize], r0
+ mov [rsp+7*mmsize+12*gprsize], r1
+ mov [rsp+7*mmsize+13*gprsize], r2
+ mov [rsp+7*mmsize+14*gprsize], r4
+%else
+cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
+ tmp, src, scaling, h, fg_data, picptr, unused
+%endif
+ mov srcq, srcm
+ mov fg_dataq, r3m
+ mov scalingq, r5m
+%if STACK_ALIGNMENT < mmsize
+%define r0m [rsp+7*mmsize+ 3*gprsize]
+%define r1m [rsp+7*mmsize+ 4*gprsize]
+%define r2m [rsp+7*mmsize+ 5*gprsize]
+%define r3m [rsp+7*mmsize+ 6*gprsize]
+%define r4m [rsp+7*mmsize+ 7*gprsize]
+%define r5m [rsp+7*mmsize+ 8*gprsize]
+%define r6m [rsp+7*mmsize+ 9*gprsize]
+%define r7m [rsp+7*mmsize+10*gprsize]
+%define r8m [rsp+7*mmsize+11*gprsize]
+%define r9m [rsp+7*mmsize+12*gprsize]
+%define r10m [rsp+7*mmsize+13*gprsize]
+%define r11m [rsp+7*mmsize+14*gprsize]
+%define r12m [rsp+7*mmsize+15*gprsize]
+%endif
+ LEA r5, pb_mask
+%define base r5-pb_mask
+ mov r5m, r5
+%else
+cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+ lea r8, [pb_mask]
+%define base r8-pb_mask
+%endif
+ mov r6d, [fg_dataq+FGData.scaling_shift]
+ movd m3, [base+mul_bits+r6*2-14]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ lea tmpd, [r6d*2]
+%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
+ test r3, r3
+%else
+ cmp dword r12m, 0 ; is_idm
+%endif
+ movd m5, [base+min+r6*2]
+ cmovne r6d, tmpd
+ movd m4, [base+max+r6*2]
+ punpcklwd m3, m3
+ punpcklwd m5, m5
+ punpcklwd m4, m4
+ pshufd m3, m3, q0000
+ pshufd m5, m5, q0000
+ pshufd m4, m4, q0000
+ SCRATCH 3, 11, 0
+ SCRATCH 4, 12, 1
+ SCRATCH 5, 13, 2
+
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
+%endif
+
+%if %1
+ mov r6d, dword r11m
+ movd m0, [fg_dataq+FGData.uv_mult+r6*4]
+ movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
+ punpcklbw m6, m1, m0
+ movd m7, [fg_dataq+FGData.uv_offset+r6*4]
+ punpcklwd m6, m6
+ punpcklwd m7, m7
+ pshufd m6, m6, q0000
+ pshufd m7, m7, q0000
+ SCRATCH 6, 14, 3
+ SCRATCH 7, 15, 4
+%endif
+
+ mov sbyd, r8m
+ mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
+ test overlapd, overlapd
+ jz %%no_vertical_overlap
+%if ARCH_X86_32
+%if %2
+ mova m1, [base+pb_23_22_h]
+%else
+ mova m1, [base+pb_27_17_17_27]
+%endif
+ mova m0, [base+pw_1024]
+%else
+%if %2
+ mova m1, [pb_23_22_h]
+%else
+ mova m1, [pb_27_17_17_27]
+%endif
+ mova m0, [pw_1024]
+%endif
+ SCRATCH 0, 8, 5
+ SCRATCH 1, 9, 6
+ test sbyd, sbyd
+ jnz %%vertical_overlap
+ ; fall-through
+
+%%no_vertical_overlap:
+ mov r8m, overlapd
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
+ imul seed, (173 << 24) | 37
+%else
+ imul seed, sbyd, (173 << 24) | 37
+%endif
+ add seed, (105 << 24) | 178
+ rol seed, 8
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+%define luma_bakq lumaq
+
+ mov wq, r4m
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
+ neg wq
+ sub r0mp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq
+ mov r12mp, strideq
+%endif
+
+%%loop_x:
+%if ARCH_X86_32
+ mov seed, r3m
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
+%endif
+
+%%loop_x_odd:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq+ 0]
+ pcmpgtb m6, m2, m3
+ punpcklbw m2, m3, m6
+ punpckhbw m3, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; we already incremented lumaq above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2 == 0
+ ; adjust top_offxy
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
+ btc dword r8m, 2
+ jc %%loop_x_even
+ test dword r8m, 2
+ jz %%loop_x_odd
+ jmp %%loop_x_odd_v_overlap
+%%loop_x_even:
+%endif
+ test dword r8m, 1
+ jz %%loop_x
+
+ ; r8m = sbym
+ test dword r8m, 2
+ jne %%loop_x_hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+%if ARCH_X86_32
+%if %2
+ lea r6, [offxyd+16]
+ mov [rsp+7*mmsize+0*gprsize], r6
+%else
+ mov [rsp+7*mmsize+0*gprsize], offxyd
+%endif
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
+
+ mov seed, r3m
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+%if %2
+ lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%else
+ mov left_offxyd, offyd
+%endif
+%endif
+ mov r6d, seed
+ or seed, 0xEFF4
+ shr r6d, 1
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, unused1, unused2, lstride
+
+ mov offyd, seed
+ mov offxd, seed
+%endif
+ ror offyd, 8
+ shr offxd, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%%loop_y_h_overlap:
+ ; src
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack chroma_source
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m2 ; m0-1: src as word
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m4, [grain_lutq+offxyq+ 0]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+0*gprsize]
+ movd m2, [grain_lutq+r0+ 0]
+%else
+ movd m2, [grain_lutq+left_offxyq+ 0]
+%endif
+ punpcklbw m2, m4
+ pmaddubsw m3, m9, m2
+ pmulhrsw m3, m8
+ packsswb m3, m3
+ shufps m3, m4, q3210
+ pxor m4, m4
+ pcmpgtb m4, m3
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m13
+ pmaxsw m1, m13
+ pminsw m0, m12
+ pminsw m1, m12
+ packuswb m0, m1
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+ jg %%loop_y_h_overlap
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2 == 0
+ xor dword r8m, 4
+ ; adjust top_offxyd
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
+%endif
+
+ ; r8m = sbym
+ test dword r8m, 2
+%if %2
+ jne %%loop_x_hv_overlap
+ jmp %%loop_x_h_overlap
+%else
+ jne %%loop_x_odd_v_overlap
+ jmp %%loop_x_odd
+%endif
+
+%%end:
+ RET
+
+%%vertical_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
+%else
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
+%endif
+
+ or overlapd, 2 ; top_overlap: overlap & 2
+ mov r8m, overlapd
+ movzx sbyd, sbyb
+%if ARCH_X86_32
+ imul r4, [fg_dataq+FGData.seed], 0x00010001
+ DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
+%else
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+%endif
+ imul tmpd, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add tmpd, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and tmpd, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, tmpd
+%if ARCH_X86_32
+ xor sbyd, seed ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
+
+ mov r3m, seed
+ mov wq, r4m
+%if %3
+ shl r10mp, 1
+%endif
+%else
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak
+
+ mov lstrideq, r10mp
+%endif
+
+ mov lumaq, r9mp
+ lea src_bakq, [srcq+wq]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
+ neg wq
+ sub r0mp, srcq
+%if ARCH_X86_32
+ mov r1m, src_bakq
+ mov r11m, luma_bakq
+ mov r4m, wq
+
+ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
+%else
+ mov r11mp, src_bakq
+ mov r12mp, strideq
+%endif
+
+%%loop_x_v_overlap:
+%if ARCH_X86_32
+ mov seed, r3m
+ xor tmpd, tmpd
+%endif
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, overlap, top_offxy, unused, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if ARCH_X86_32
+ mov [rsp+7*mmsize+1*gprsize], top_offxyd
+
+ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+%%loop_x_odd_v_overlap:
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %3
+ mova m1, [PIC_ptr(pb_23_22)]
+%else
+ mova m1, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_v_overlap:
+%if ARCH_X86_32
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+ ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+ vpgatherdw m7, m4, scalingq-1, r12, r2
+ vpgatherdw m5, m6, scalingq-1, r12, r2
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; grain = grain_lut[offy+y][offx+x]
+ movu m3, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+1*gprsize]
+ movu m4, [grain_lutq+r0]
+%else
+ movu m4, [grain_lutq+top_offxyq]
+%endif
+ punpckhbw m6, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, m1, m6
+ pmaddubsw m3, m1, m4
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ packsswb m3, m2
+ pxor m6, m6
+ pcmpgtb m6, m3
+ punpcklbw m2, m3, m6
+ punpckhbw m3, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m3, m5
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+
+ ; unpack chroma_source
+ pxor m4, m4
+ punpckhbw m6, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m6, m3
+ pmaxsw m0, m13
+ pmaxsw m6, m13
+ pminsw m0, m12
+ pminsw m6, m12
+ packuswb m0, m6
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+ dec hw
+ je %%end_y_v_overlap
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has already been incremented above
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
+%endif
+%endif
+ add grain_lutq, 82
+%if %3 == 0
+ btc hd, 16
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ mova m1, [PIC_ptr(pb_17_27)]
+ jnc %%loop_y_v_overlap
+%endif
+ jmp %%loop_y
+
+%%end_y_v_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+
+%if %2
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump
+    ; back to .loop_x_v_overlap, and instead always fall through to
+ ; h+v overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ btc dword r8m, 2
+ jnc %%loop_x_odd_v_overlap
+%endif
+
+%%loop_x_hv_overlap:
+%if ARCH_X86_32
+ DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
+
+ mov r6, [rsp+7*mmsize+1*gprsize]
+%if %2
+ lea r0, [r3d+16]
+ add r6, 16
+ mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
+%else
+ mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
+%endif
+ mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy
+
+ DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
+
+ mov seed, r3m
+ xor tmpd, tmpd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+%if %2
+ lea topleft_offxyq, [top_offxyq+16]
+ lea left_offxyq, [offxyq+16]
+%else
+ mov topleft_offxyq, top_offxyq
+ mov left_offxyq, offxyq
+%endif
+
+    ; we assume from the block above that bits 8-15 of tmpd are zeroed
+%endif
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp tmpb ; parity of top_seed
+ shr seed, 16
+ shl tmpd, 16
+ test seeb, seeh
+ setp tmpb ; parity of cur_seed
+ or r6d, 0x00010001
+ xor tmpd, r6d
+ mov seed, tmpd
+ ror seed, 1 ; updated (cur_seed << 16) | top_seed
+
+%if ARCH_X86_32
+ mov r3m, seed
+
+ DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx
+
+ mov offxd, offyd
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
+
+ mov offxd, seed
+ mov offyd, seed
+%endif
+ ror offyd, 8
+ ror offxd, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+%if ARCH_X86_32
+ DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
+%else
+ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
+ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
+%endif
+
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+%if ARCH_X86_32
+ mov [rsp+7*mmsize+1*gprsize], top_offxyd
+%endif
+
+ mov hd, r7m
+ mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+%if %3
+ mova m3, [PIC_ptr(pb_23_22)]
+%else
+ mova m3, [PIC_ptr(pb_27_17)]
+%endif
+%%loop_y_hv_overlap:
+ ; grain = grain_lut[offy+y][offx+x]
+%if ARCH_X86_32
+ mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
+ mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
+ movd m1, [grain_lutq+r0]
+ mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
+%else
+ movd m1, [grain_lutq+topleft_offxyq]
+%endif
+ movu m2, [grain_lutq+offxyq]
+%if ARCH_X86_32
+ movu m6, [grain_lutq+r5]
+ movd m4, [grain_lutq+r0]
+%else
+ movu m6, [grain_lutq+top_offxyq]
+ movd m4, [grain_lutq+left_offxyq]
+%endif
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+ punpcklbw m1, m6
+ punpcklbw m4, m2
+ pmaddubsw m0, m9, m1
+ pmaddubsw m1, m9, m4
+ REPX {pmulhrsw x, m8}, m0, m1
+ packsswb m0, m1
+ shufps m4, m0, m2, q3232
+ shufps m0, m6, q3210
+ ; followed by v interpolation (top | cur -> cur)
+ punpcklbw m2, m0, m4
+ punpckhbw m0, m4
+ pmaddubsw m4, m3, m0
+ pmaddubsw m1, m3, m2
+ pmulhrsw m4, m8
+ pmulhrsw m1, m8
+ packsswb m1, m4
+
+ ; src
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov lumaq, r9mp
+%endif
+%if %2
+ mova m4, [lumaq+ 0]
+ mova m6, [lumaq+16]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+ mov r5, r5m
+ movd m7, [base+pb_1]
+%else
+ movd m7, [pb_1]
+%endif
+ pshufd m7, m7, q0000
+ pxor m2, m2
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pavgw m4, m2
+ pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
+
+%if %1
+%if %2
+ packuswb m4, m6 ; luma
+%endif
+ punpckhbw m6, m4, m0
+ punpcklbw m4, m0 ; { luma, chroma }
+ pmaddubsw m6, m14
+ pmaddubsw m4, m14
+ psraw m6, 6
+ psraw m4, 6
+ paddw m6, m15
+ paddw m4, m15
+ packuswb m4, m6 ; pack+unpack = clip
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
+%endif
+
+    ; scaling[luma_src]
+%if ARCH_X86_32
+ vpgatherdw m7, m4, scalingq-1, r0, r5
+ vpgatherdw m5, m6, scalingq-1, r0, r5
+%else
+%if %3
+ vpgatherdw m7, m4, scalingq-1, r2, r12
+ vpgatherdw m5, m6, scalingq-1, r2, r12
+%else
+ vpgatherdw m7, m4, scalingq-1, r2, r13
+ vpgatherdw m5, m6, scalingq-1, r2, r13
+%endif
+%endif
+ REPX {psrlw x, 8}, m7, m5
+
+ ; unpack grain
+ pxor m4, m4
+ pcmpgtb m4, m1
+ punpcklbw m2, m1, m4
+ punpckhbw m1, m4
+
+ ; noise = round2(scaling[src] * grain, scaling_shift)
+ pmullw m2, m7
+ pmullw m1, m5
+ pmulhrsw m2, m11
+ pmulhrsw m1, m11
+
+%if ARCH_X86_32
+ DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
+%endif
+
+ ; unpack chroma source
+ pxor m4, m4
+ punpckhbw m5, m0, m4
+ punpcklbw m0, m4 ; m0-1: src as word
+
+ ; dst = clip_pixel(src, noise)
+ paddw m0, m2
+ paddw m5, m1
+ pmaxsw m0, m13
+ pmaxsw m5, m13
+ pminsw m0, m12
+ pminsw m5, m12
+ packuswb m0, m5
+ movifnidn dstq, dstmp
+ mova [dstq+srcq], m0
+
+%if ARCH_X86_32
+ add srcq, r2mp
+ ; lumaq has been adjusted above already
+%else
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+ add lumaq, r10mp
+%endif
+%endif
+ add grain_lutq, 82
+ dec hw
+%if %3
+ jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+%endif
+ mova m3, [PIC_ptr(pb_17_27)]
+ btc hd, 16
+ jnc %%loop_y_hv_overlap
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+ jmp %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+%endif
+
+%if ARCH_X86_32
+ DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
+
+ mov wq, r4m
+%endif
+ add wq, 16
+ jge %%end_hv
+%if ARCH_X86_32
+ mov srcq, r1mp
+ mov lumaq, r11mp
+%else
+ mov srcq, r11mp
+%endif
+ lea lumaq, [luma_bakq+wq*(1+%2)]
+ add srcq, wq
+%if ARCH_X86_32
+ mov r4m, wq
+ mov r9m, lumaq
+%endif
+%if %2
+ jmp %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+7*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ xor dword r8m, 4
+ jmp %%loop_x_odd_v_overlap
+%endif
+
+%%end_hv:
+ RET
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 422, 1, 0
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 444, 0, 0
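
The inline comments above already spell out the per-pixel math that all of these FGUV loops vectorize. As a point of reference, a scalar sketch of one chroma pixel in the 4:2:0, chroma-scaling-from-luma (.csfl) case, ignoring the edge-overlap blending, could look like the following; round2(), iclip() and the parameter names are hypothetical helpers, not dav1d API:

#include <stdint.h>

static inline int round2(int x, int shift) { return (x + (1 << shift >> 1)) >> shift; }
static inline int iclip(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

/* One chroma sample, 8bpc 4:2:0: average the two horizontally adjacent luma
 * samples (the pmaddubsw pb_1 + pavgw pair above), look up the scaling factor,
 * scale the grain, and add it to the source with clipping. */
static uint8_t fguv_px_sketch(uint8_t chroma, const uint8_t *luma_row, int x,
                              int8_t grain, const uint8_t scaling[256],
                              int scaling_shift, int min_value, int max_value)
{
    const int avg_luma = (luma_row[2 * x] + luma_row[2 * x + 1] + 1) >> 1;
    /* noise = round2(scaling[src] * grain, scaling_shift) */
    const int noise = round2(scaling[avg_luma] * grain, scaling_shift);
    /* dst = clip_pixel(src, noise) */
    return (uint8_t)iclip(chroma + noise, min_value, max_value);
}
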
diff --git a/third_party/dav1d/src/x86/ipred.h b/third_party/dav1d/src/x86/ipred.h
new file mode 100644
index 0000000000..415a4d8d62
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(dav1d_##name, ssse3)); \
+ decl_##type##_fn(BF(dav1d_##name, avx2)); \
+ decl_##type##_fn(BF(dav1d_##name, avx512icl))
+#define init_fn(type0, type1, name, suffix) \
+ c->type0[type1] = BF(dav1d_##name, suffix)
+
+#define init_angular_ipred_fn(type, name, suffix) \
+ init_fn(intra_pred, type, name, suffix)
+#define init_cfl_pred_fn(type, name, suffix) \
+ init_fn(cfl_pred, type, name, suffix)
+#define init_cfl_ac_fn(type, name, suffix) \
+ init_fn(cfl_ac, type, name, suffix)
+
+decl_fn(angular_ipred, ipred_dc);
+decl_fn(angular_ipred, ipred_dc_128);
+decl_fn(angular_ipred, ipred_dc_top);
+decl_fn(angular_ipred, ipred_dc_left);
+decl_fn(angular_ipred, ipred_h);
+decl_fn(angular_ipred, ipred_v);
+decl_fn(angular_ipred, ipred_paeth);
+decl_fn(angular_ipred, ipred_smooth);
+decl_fn(angular_ipred, ipred_smooth_h);
+decl_fn(angular_ipred, ipred_smooth_v);
+decl_fn(angular_ipred, ipred_z1);
+decl_fn(angular_ipred, ipred_z2);
+decl_fn(angular_ipred, ipred_z3);
+decl_fn(angular_ipred, ipred_filter);
+
+decl_fn(cfl_pred, ipred_cfl);
+decl_fn(cfl_pred, ipred_cfl_128);
+decl_fn(cfl_pred, ipred_cfl_top);
+decl_fn(cfl_pred, ipred_cfl_left);
+
+decl_fn(cfl_ac, ipred_cfl_ac_420);
+decl_fn(cfl_ac, ipred_cfl_ac_422);
+decl_fn(cfl_ac, ipred_cfl_ac_444);
+
+decl_fn(pal_pred, pal_pred);
+
+static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+#if BITDEPTH == 8
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3);
+#endif
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
+
+ c->pal_pred = BF(dav1d_pal_pred, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+#endif
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx512icl);
+#endif
+}
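
The wrapper macros above expand mechanically, so a sketch of two expansions may help when reading the init function; BF() comes from dav1d's bitdepth template headers and appends the bitdepth and ISA suffix (e.g. dav1d_ipred_dc_16bpc_ssse3 when BITDEPTH == 16):

/* decl_fn(angular_ipred, ipred_dc); expands, per the #defines above, to: */
decl_angular_ipred_fn(BF(dav1d_ipred_dc, ssse3));
decl_angular_ipred_fn(BF(dav1d_ipred_dc, avx2));
decl_angular_ipred_fn(BF(dav1d_ipred_dc, avx512icl));

/* init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3); expands to: */
c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, ssse3);
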
diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm
new file mode 100644
index 0000000000..7ddb189916
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx2.asm
@@ -0,0 +1,4992 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHTS 1-*
+const smooth_weights_1d_16bpc ; sm_weights[] << 7
+ %rep %0
+ dw %1*128
+ %rotate 1
+ %endrep
+const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
+ %rep %0
+ dw %1, 256-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+%if ARCH_X86_64
+
+ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11
+ db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15
+filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1
+filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+ dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+pw_m1024: times 2 dw -1024
+pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4
+z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8
+pb_90: times 4 db 90
+z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4
+z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11
+z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9
+z_filter_k: dw 4, 4, 5, 5, 4, 4
+ dw 8, 8, 6, 6, 4, 4
+ dw 0, 0, 0, 0, 2, 2
+
+%define pw_2 (z_filter_k+32)
+%define pw_4 (z_filter_k+ 0)
+%define pw_16 (z2_ymul8 +20)
+
+pw_1: times 2 dw 1
+pw_3: times 2 dw 3
+pw_62: times 2 dw 62
+pw_512: times 2 dw 512
+pw_2048: times 2 dw 2048
+pd_8: dd 8
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
+%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)
+
+JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ movifnidn hd, hm
+ add tlq, 2
+ movd xm4, wd
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt wd, wd
+ movd xm5, wd
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt r6d, hd
+ movd xm5, r6d
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ paddw m0, [tlq+96]
+ paddw m0, [tlq+64]
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1
+.h4:
+ punpcklwd xm0, xm3
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ paddd xm0, xm4
+ psrld xm0, xm5
+ lea stride3q, [strideq*3]
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d
+ tzcnt r5d, r5d
+ movd xm5, r5d
+ lea r5, [ipred_dc_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw xm4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq xm0, [tlq-8]
+ jmp wq
+.w4:
+ movq xm1, [tlq+2]
+ paddw m0, m4
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w4_end:
+ vpbroadcastw xm0, xm0
+.s4:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16]
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2]
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm3, 0xAA
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw xm0, xm0
+.s8:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32]
+ jmp wq
+.w16:
+ paddw m0, [tlq+2]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+.s32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq- 96]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 32]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ paddw m0, [tlq+34]
+ paddw m1, [tlq+66]
+ paddw m0, [tlq+98]
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm1, xm4
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w64_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
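The DC paths above only differ in how the edge sums are gathered and in how the final division is performed; when w != h the divisor is not a power of two, so the code uses fixed-point reciprocals (0xAAAB ~ 2^17/3, 0x6667 ~ 2^17/5) plus shifts. A scalar sketch of the predictor itself, with a stride measured in pixels and dav1d's top-left edge layout (top row at topleft[1..], left column at topleft[-1..]):

#include <stddef.h>
#include <stdint.h>

/* Illustrative 16bpc DC predictor: average the w top and h left samples with
 * rounding, then splat the result over the block. */
static void ipred_dc_sketch(uint16_t *dst, ptrdiff_t stride,
                            const uint16_t *topleft, int w, int h)
{
    int sum = (w + h) >> 1;                                /* rounding bias */
    for (int x = 0; x < w; x++) sum += topleft[1 + x];     /* top row */
    for (int y = 0; y < h; y++) sum += topleft[-(1 + y)];  /* left column */
    const uint16_t dc = (uint16_t)(sum / (w + h));
    for (int y = 0; y < h; y++, dst += stride)
        for (int x = 0; x < w; x++)
            dst[x] = dc;
}
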
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ shr r6d, 11
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movu m0, [tlq+ 2]
+ movu m1, [tlq+34]
+ movu m2, [tlq+66]
+ movu m3, [tlq+98]
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8
+ mov%2 [dstq+strideq*0], m0
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+ALIGN function_align
+%endmacro
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ lea r5, [ipred_h_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+INIT_XMM avx2
+.w4:
+ IPRED_H 4, q
+.w8:
+ IPRED_H 8, a
+INIT_YMM avx2
+.w16:
+ IPRED_H 16, a
+.w32:
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ vpbroadcastw m2, [tlq-6]
+ vpbroadcastw m3, [tlq-8]
+ sub tlq, 8
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+.w64:
+ vpbroadcastw m0, [tlq-2]
+ vpbroadcastw m1, [tlq-4]
+ sub tlq, 4
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*0+32*2], m0
+ mova [dstq+strideq*0+32*3], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m1
+ mova [dstq+strideq*1+32*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+ paddw m0, m%2, m1
+ psubw m7, m3, m0 ; tldiff
+ psubw m0, m%1 ; tdiff
+ pabsw m7, m7
+ pabsw m0, m0
+ pminsw m7, m0
+ pcmpeqw m0, m7
+ pcmpgtw m7, m%3, m7
+ vpblendvb m0, m3, m%1, m0
+ vpblendvb m0, m1, m0, m7
+%endmacro
+
+cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
+%define base r5-ipred_paeth_16bpc_avx2_table
+ movifnidn hd, hm
+ lea r5, [ipred_paeth_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r5
+ jmp wq
+.w4:
+ vpbroadcastq m2, [tlq+2] ; top
+ movsldup m6, [base+ipred_hv_shuf]
+ lea r3, [strideq*3]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w4_loop:
+ sub tlq, 8
+ vpbroadcastq m1, [tlq]
+ pshufb m1, m6 ; left
+ PAETH 2, 4, 5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m2, [tlq+2]
+ movsldup m6, [base+ipred_hv_shuf]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w8_loop:
+ sub tlq, 4
+ vpbroadcastd m1, [tlq]
+ pshufb m1, m6
+ PAETH 2, 4, 5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m2, [tlq+2]
+ psubw m4, m2, m3
+ pabsw m5, m4
+.w16_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m2, [tlq+2]
+ movu m6, [tlq+34]
+%if WIN64
+ movaps r4m, xmm8
+ movaps r6m, xmm9
+%endif
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ pabsw m5, m4
+ pabsw m9, m8
+.w32_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps xmm8, r4m
+ movaps xmm9, r6m
+%endif
+ RET
+ALIGN function_align
+.w64:
+ WIN64_SPILL_XMM 16
+ movu m2, [tlq+ 2]
+ movu m6, [tlq+34]
+ movu m10, [tlq+66]
+ movu m13, [tlq+98]
+ psubw m4, m2, m3
+ psubw m8, m6, m3
+ psubw m11, m10, m3
+ psubw m14, m13, m3
+ pabsw m5, m4
+ pabsw m9, m8
+ pabsw m12, m11
+ pabsw m15, m14
+.w64_loop:
+ sub tlq, 2
+ vpbroadcastw m1, [tlq]
+ PAETH 2, 4, 5
+ mova [dstq+32*0], m0
+ PAETH 6, 8, 9
+ mova [dstq+32*1], m0
+ PAETH 10, 11, 12
+ mova [dstq+32*2], m0
+ PAETH 13, 14, 15
+ mova [dstq+32*3], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
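The PAETH macro above is the usual Paeth selector: predict from whichever of left, top and top-left lies closest to left + top - topleft, preferring left, then top. A scalar sketch of one sample:

#include <stdint.h>
#include <stdlib.h>

static uint16_t paeth_px_sketch(int left, int top, int topleft)
{
    const int base   = left + top - topleft;
    const int ldiff  = abs(base - left);    /* == |top - topleft|,  "ldiff"  */
    const int tdiff  = abs(base - top);     /* == |left - topleft|, "tdiff"  */
    const int tldiff = abs(base - topleft); /*                      "tldiff" */
    if (ldiff <= tdiff && ldiff <= tldiff)
        return (uint16_t)left;
    return (uint16_t)(tdiff <= tldiff ? top : topleft);
}
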
+cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_16bpc_avx2_table
+ lea r6, [ipred_smooth_v_16bpc_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+ neg hq
+ vpbroadcastw m5, [tlq+hq*2] ; bottom
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m3, [base+ipred_hv_shuf]
+ lea r6, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m3
+ pmulhrsw m0, m4
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movq [dstq+r6 ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+.ret:
+ RET
+.w8:
+ vbroadcasti128 m4, [tlq+2]
+ movsldup m3, [base+ipred_hv_shuf]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w8_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0]
+ vpbroadcastd m1, [weightsq+hq*2+4]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ vextracti128 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], xm0
+ vextracti128 [dstq+strideq*2], m1, 1
+ mova [dstq+r6 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ movu m4, [tlq+2]
+ lea r6, [strideq*3]
+ psubw m4, m5
+.w16_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0]
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r6 ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ WIN64_SPILL_XMM 7
+ movu m4, [tlq+ 2]
+ movu m6, [tlq+34]
+ psubw m4, m5
+ psubw m6, m5
+.w32_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0]
+ vpbroadcastw m3, [weightsq+hq*2+2]
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m3, [tlq+ 2]
+ movu m4, [tlq+34]
+ movu m6, [tlq+66]
+ movu m7, [tlq+98]
+ REPX {psubw x, m5}, m3, m4, m6, m7
+.w64_loop:
+ vpbroadcastw m2, [weightsq+hq*2]
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+%define base r6-ipred_smooth_h_16bpc_avx2_table
+ lea r6, [ipred_smooth_h_16bpc_avx2_table]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m5, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [r6+wq*4]
+ sub tlq, hq
+ lea stride3q, [strideq*3]
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2]
+ movsldup m3, [base+ipred_hv_shuf]
+.w4_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m3
+ psubw m0, m5 ; left - right
+ pmulhrsw m0, m4
+ paddw m0, m5
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2]
+ movsldup m3, [base+ipred_hv_shuf]
+.w8_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastd m1, [tlq+hq-8]
+ pshufb m0, m3
+ pshufb m1, m3
+ psubw m0, m5
+ psubw m1, m5
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ movu m4, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+ vpbroadcastq m3, [tlq+hq-8]
+ punpcklwd m3, m3
+ psubw m3, m5
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32:
+ WIN64_SPILL_XMM 7
+ movu m4, [base+smooth_weights_1d_16bpc+32*2]
+ movu m6, [base+smooth_weights_1d_16bpc+32*3]
+.w32_loop:
+ vpbroadcastw m1, [tlq+hq-2]
+ vpbroadcastw m3, [tlq+hq-4]
+ psubw m1, m5
+ psubw m3, m5
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m6
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m6
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m3, [base+smooth_weights_1d_16bpc+32*4]
+ movu m4, [base+smooth_weights_1d_16bpc+32*5]
+ movu m6, [base+smooth_weights_1d_16bpc+32*6]
+ movu m7, [base+smooth_weights_1d_16bpc+32*7]
+.w64_loop:
+ vpbroadcastw m2, [tlq+hq-2]
+ psubw m2, m5
+ pmulhrsw m0, m3, m2
+ pmulhrsw m1, m4, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*0], m0
+ pmulhrsw m0, m6, m2
+ mova [dstq+32*1], m1
+ pmulhrsw m1, m7, m2
+ paddw m0, m5
+ paddw m1, m5
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ add dstq, strideq
+ sub hq, 1*2
+ jg .w64_loop
+ RET
+
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+ pmaddwd m0, m%1, m%3
+ pmaddwd m1, m%2, m%4
+ paddd m0, m%5
+ paddd m1, m%6
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m5
+%endmacro
+
+cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_16bpc_avx2_table
+ lea r6, [ipred_smooth_16bpc_avx2_table]
+ mov wd, wm
+ vpbroadcastw m4, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ sub tlq, hq
+ sub tlq, hq
+ movsxd wq, [r6+wq*4]
+ pxor m5, m5
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4]
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ vpbroadcastq m6, [tlq+hq*2+2]
+ movsldup m7, [base+ipred_hv_shuf]
+ movshdup m9, [base+ipred_hv_shuf]
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4]
+ punpcklwd m6, m0 ; top, bottom
+ punpcklqdq m8, m9, m9
+ punpckhqdq m9, m9
+ lea r3, [strideq*3]
+.w4_loop:
+ vpbroadcastq m3, [tlq+hq*2-8]
+ vbroadcasti128 m1, [v_weightsq]
+ pshufb m3, m7
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10
+ pmaddwd m3, m10
+ pshufb m0, m1, m8
+ pshufb m1, m9
+ SMOOTH_2D_END 0, 1, 6, 6, 2, 3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 16
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vpbroadcastw m0, [tlq] ; bottom
+ vbroadcasti128 m7, [tlq+hq*2+2]
+ movsldup m8, [base+ipred_hv_shuf]
+ movshdup m9, [base+ipred_hv_shuf]
+ vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
+ vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w8_loop:
+ vpbroadcastd m3, [tlq+hq*2-4]
+ vpbroadcastq m1, [v_weightsq]
+ pshufb m3, m8
+ punpcklwd m2, m3, m4 ; left, right
+ punpckhwd m3, m4
+ pmaddwd m2, m10
+ pmaddwd m3, m11
+ pshufb m1, m9
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 11
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+2]
+ mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
+ mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
+ vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
+ vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
+ punpcklwd m6, m7, m0 ; top, bottom
+ punpckhwd m7, m0
+.w16_loop:
+ vpbroadcastd m3, [tlq+hq*2-4]
+ vpbroadcastd m1, [v_weightsq+0]
+ punpcklwd m3, m4 ; left, right
+ pshufd m2, m3, q1111
+ pmaddwd m10, m8, m2
+ pmaddwd m2, m9
+ pshufd m3, m3, q0000
+ SMOOTH_2D_END 1, 1, 6, 7, 10, 2
+ vpbroadcastd m1, [v_weightsq+4]
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ mova [dstq+strideq*0], m0
+ SMOOTH_2D_END 1, 1, 6, 7, 2, 3
+ mova [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add v_weightsq, 8
+ sub hq, 2
+ jg .w16_loop
+ RET
+.w32:
+%assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastw m0, [tlq] ; bottom
+ movu m7, [tlq+hq*2+ 2]
+ movu m9, [tlq+hq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
+ mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
+ punpcklwd m6, m7, m0
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0
+ punpckhwd m9, m0
+.w32_loop:
+ vpbroadcastw m3, [tlq+hq*2-2]
+ vpbroadcastd m14, [v_weightsq]
+ punpcklwd m3, m4
+ pmaddwd m1, m10, m3
+ pmaddwd m2, m11, m3
+ pmaddwd m0, m6, m14
+ paddd m0, m1
+ pmaddwd m1, m7, m14
+ paddd m1, m2
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m5
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 14, 14, 8, 9, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec hd
+ jg .w32_loop
+ RET
+.w64:
+%assign stack_offset stack_offset - stack_size_padded
+ PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
+ mov dst_baseq, dstq
+ mov tl_baseq, tlq
+ mov v_weights_baseq, v_weightsq
+ xor xq, xq
+.w64_loop_x:
+ mov yq, hq
+ lea tlq, [tl_baseq+hq*2]
+ vpbroadcastw m0, [tl_baseq] ; bottom
+ movu m7, [tlq+xq*2+ 2]
+ movu m9, [tlq+xq*2+34]
+ mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
+ mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
+ vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
+ vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
+ mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
+ mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
+ vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
+ vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
+ punpcklwd m6, m7, m0
+ punpckhwd m7, m0
+ punpcklwd m8, m9, m0
+ punpckhwd m9, m0
+ lea tlq, [tl_baseq-2]
+.w64_loop_y:
+ vpbroadcastw m3, [tlq+yq*2]
+ vpbroadcastd m1, [v_weightsq]
+ punpcklwd m3, m4
+ pmaddwd m14, m10, m3
+ pmaddwd m15, m11, m3
+ pmaddwd m2, m12, m3
+ pmaddwd m3, m13
+ pmaddwd m0, m6, m1
+ paddd m0, m14
+ pmaddwd m14, m7, m1
+ paddd m14, m15
+ psrld m0, 8
+ psrld m14, 8
+ packssdw m0, m14
+ pavgw m0, m5
+ mova [dstq+32*0], m0
+ SMOOTH_2D_END 8, 9, 1, 1, 2, 3
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ add v_weightsq, 4
+ dec yq
+ jg .w64_loop_y
+ lea dstq, [dst_baseq+32*2]
+ add r6, 16*8
+ mov v_weightsq, v_weights_baseq
+ add xq, 32
+ test xb, 64
+ jz .w64_loop_x
+ RET
+
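All three smooth variants interpolate towards fixed edge samples using the 256-based weights declared at the top of the file: the 1-D tables are pre-shifted by 7 so that pmulhrsw gives round2(w * diff, 8), and the 2-D table interleaves w with 256 - w for pmaddwd, with SMOOTH_2D_END's psrld 8 + pavgw pair acting as round2(sum, 9). A scalar sketch of the 2-D case (stride in pixels; w_hor/w_ver point into the weight table at the offsets for the block width and height):

#include <stddef.h>
#include <stdint.h>

static inline int round2(int x, int shift) { return (x + (1 << shift >> 1)) >> shift; }

/* Illustrative SMOOTH_PRED: blend top[x] against the bottom-left sample and
 * left[y] against the top-right sample, then round2(sum, 9). */
static void ipred_smooth_sketch(uint16_t *dst, ptrdiff_t stride,
                                const uint16_t *topleft, int w, int h,
                                const uint8_t *w_hor, const uint8_t *w_ver)
{
    const int right  = topleft[w];   /* top[w - 1]  */
    const int bottom = topleft[-h];  /* left[h - 1] */
    for (int y = 0; y < h; y++, dst += stride)
        for (int x = 0; x < w; x++) {
            const int top  = topleft[1 + x];
            const int left = topleft[-(1 + y)];
            dst[x] = (uint16_t)round2(w_ver[y] * top  + (256 - w_ver[y]) * bottom +
                                      w_hor[x] * left + (256 - w_hor[x]) * right, 9);
        }
}
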
+cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ movsxd wq, [r6+wq*4]
+ add tlq, 2
+ add wq, r6
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [r7+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m5, [pw_62]
+ jmp wq
+.w4:
+ ALLOC_STACK -64, 7
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ vpbroadcastw xm3, [tlq+14]
+ movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
+ paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ add dxd, dxd
+ palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
+ paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d
+ psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
+ psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
+ pxor xm4, xm4
+ paddw xm2, xm0
+ vpbroadcastw xm0, r8m ; pixel_max
+ mova [rsp+32], xm3
+ movd xm3, dxd
+ pmaxsw xm2, xm4
+ mov r3d, dxd
+ pavgw xm2, xm4
+ vpbroadcastw m3, xm3
+ pminsw xm2, xm0
+ punpcklwd xm0, xm1, xm2
+ punpckhwd xm1, xm2
+ lea r5, [strideq*3]
+ pslldq m2, m3, 8
+ mova [rsp+ 0], xm0
+ mova [rsp+16], xm1
+ paddw m6, m3, m3
+ paddw m3, m2
+ vpblendd m4, m6, 0xf0
+ paddw m6, m6
+ paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3
+ vbroadcasti128 m4, [z_upsample]
+.w4_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu xm2, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [rsp+r3*2], 1 ; 0 2
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base3
+ vinserti128 m2, [rsp+r2*2], 1 ; 1 3
+ pshufb m1, m4
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3 ; frac
+ psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6
+ psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6)
+ pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
+ paddw m3, m6 ; xpos += dx
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r5 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
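
The upsample path above doubles the top edge before stepping through it, keeping every input sample and inserting a (-1, 9, 9, -1)/16 filtered sample between each pair, as the inline comments describe. A scalar sketch, with the boundary handling simplified (the code above uses the top-left sample below index 0 and replicates the last sample past the end):

#include <stdint.h>

static void upsample_edge_sketch(uint16_t *out, const uint16_t *in,
                                 int n /* input samples */, int px_max)
{
    for (int i = 0; i < n; i++) {
        const int a = in[i > 0 ? i - 1 : 0];
        const int b = in[i];
        const int c = in[i + 1 < n ? i + 1 : n - 1];
        const int d = in[i + 2 < n ? i + 2 : n - 1];
        int v = (-a + 9 * b + 9 * c - d + 8) >> 4;
        if (v < 0) v = 0;
        if (v > px_max) v = px_max;
        out[2 * i + 0] = (uint16_t)b;
        out[2 * i + 1] = (uint16_t)v;
    }
}
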
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+%define base r3-z_filter_t0
+ movd xm0, maxbased
+ lea r3, [z_filter_t0]
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m1, xm1
+ pcmpeqb m0, [base+z_filter_wh]
+ mova xm2, [r3+angleq*8]
+ pand m0, m1
+ pcmpgtb m0, m2
+ pmovmskb r5d, m0
+ ret
+.w4_no_upsample:
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastw xm3, [tlq+14]
+ mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
+ vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8
+ pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
+ paddw xm2, xm0
+ pmullw xm2, xm4
+ movd [rsp+16], xm3
+ cmp r5d, 3
+ jne .w4_3tap
+ paddw xm1, xm2
+ palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8
+ pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
+ movzx r3d, word [tlq+14]
+ movzx r2d, word [tlq+12]
+ inc maxbased
+ paddw xm2, xm0
+ sub r2d, r3d
+ paddw xm2, xm2
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
+ mov [rsp+16], r2w
+.w4_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ mov tlq, rsp
+ psrlw xm1, 3
+ cmp hd, 8
+ sbb maxbased, -1
+ pavgw xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm3, dxd
+ vpbroadcastq m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd ; xpos
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3
+ psubw m1, m0 ; -max_base_x
+ vpblendd m3, m4, 0xcc
+ paddw m0, m4, m3
+ vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ paddw m4, m4
+ paddw m3, m1
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [tlq+r3*2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ vinserti128 m1, [tlq+r3*2], 1 ; 0 2
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1 ; 1 3
+ punpcklqdq m0, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ vpblendd m1, m2, 0xcc
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; xpos < max_base_x
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w4_loop
+ lea r6, [strideq*3]
+.w4_end_loop:
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r6 ], xm6
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 7
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _
+ movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _
+ movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 4
+ jne .w8_upsample_h8 ; awkward single-pixel edge case
+ vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _
+.w8_upsample_h8:
+ paddw m2, m1
+ paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ add dxd, dxd
+ psubw m0, m2, m0
+ psraw m0, 3
+ pxor m4, m4
+ paddw m2, m0
+ vpbroadcastw m0, r8m
+ movd xm3, dxd
+ pmaxsw m2, m4
+ mov r3d, dxd
+ pavgw m2, m4
+ vpbroadcastw m3, xm3
+ pminsw m2, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ vbroadcasti128 m4, [z_upsample]
+ mova [rsp+ 0], xm0
+ mova [rsp+16], xm1
+ paddw m6, m3, m3
+ vextracti128 [rsp+32], m0, 1
+ vextracti128 [rsp+48], m1, 1
+ vpblendd m3, m6, 0xf0 ; xpos0 xpos1
+.w8_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm1, [rsp+r3*2]
+ movu xm2, [rsp+r3*2+16]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ vinserti128 m1, [rsp+r2*2], 1
+ vinserti128 m2, [rsp+r2*2+16], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m6
+ paddw m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(h+7, 15)
+ jmp .w8_main
+.w8_no_upsample:
+ lea maxbased, [hq+7]
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main
+ popcnt r5d, r5d
+ vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m2
+ cmp hd, 8
+ jl .w8_filter_h4
+ punpckhwd m2, m2
+ vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
+ movzx r3d, word [tlq+30]
+ mov maxbased, 16
+ mov [rsp+32], r3d
+ cmp r5d, 3
+ jne .w8_filter_end
+ punpcklwd xm6, xm0, xm0
+ vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq+28]
+ mov [rsp+34], r3w
+ paddw m2, m6
+ sub r5d, r3d
+ inc maxbased
+ paddw m2, m2
+ lea r3d, [r5+r3*8+4]
+ paddw m1, m2
+ shr r3d, 3
+ mov [rsp+32], r3w
+ jmp .w8_filter_end
+.w8_filter_h4:
+ pshuflw m3, m2, q3321
+ vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _
+.w8_filter_end:
+ paddw m0, m3
+ pmullw m0, m4
+ mov tlq, rsp
+ pxor m2, m2
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2
+ mova [tlq], m0
+.w8_main:
+ movd xm3, dxd
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ movd xm0, maxbased
+ mov r3d, dxd
+ vpbroadcastw m0, xm0
+ paddw m4, m3, m3
+ psubw m1, m0
+ vpblendd m3, m4, 0xf0 ; xpos0 xpos1
+ paddw m3, m1
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ movu xm0, [tlq+r3*2]
+ movu xm1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6
+ vinserti128 m0, [tlq+r5*2], 1
+ vinserti128 m1, [tlq+r5*2+2], 1
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w8_loop
+.w8_end_loop:
+ mova [dstq+strideq*0], xm6
+ mova [dstq+strideq*1], xm6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(h+15, 31)
+ jmp .w16_main
+.w16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 7
+ lea maxbased, [hq+15]
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w16_main
+ popcnt r5d, r5d
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ cmp r5d, 3
+ jne .w16_filter_3tap
+ vpbroadcastd m2, [base+pw_3]
+ punpcklwd xm0, xm0
+ vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m0, m2
+ pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m0, m1
+ psrlw m0, 2
+ movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ cmp hd, 8
+ jl .w16_filter_5tap_h4
+ punpckhwd m3, m3
+ je .w16_filter_5tap_h8
+ vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r3d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m4
+ sub r2d, r3d
+ paddw m1, m3
+ lea r2d, [r2+r3*8+4]
+ paddw m1, m2
+ shr r2d, 3
+ psrlw m1, 2
+ mov [rsp+66], r3w
+ mov [rsp+64], r2w
+ mov tlq, rsp
+ mov r3d, 33
+ cmp hd, 16
+ cmovg maxbased, r3d
+ jmp .w16_filter_end2
+.w16_filter_5tap_h8:
+ vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_5tap_h4:
+ pshuflw xm4, xm3, q3332 ; 4 5 5 5
+ pshuflw xm3, xm3, q3321 ; 3 4 5 5
+ pavgw xm2, xm4
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2
+ jmp .w16_filter_end2
+.w16_filter_3tap:
+ vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m4
+ pmullw m3, m2
+ paddw m0, m1
+ cmp hd, 8
+ je .w16_filter_3tap_h8
+ jl .w16_filter_3tap_h4
+ punpckhwd m2, m2
+ vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ jmp .w16_filter_end
+.w16_filter_3tap_h4:
+ pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _
+ jmp .w16_filter_end
+.w16_filter_3tap_h8:
+ psrldq xm2, 2
+ pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8
+.w16_filter_end:
+ paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m2, m4
+ psrlw m0, 3
+ pxor m1, m1
+ paddw m2, m3
+ psrlw m2, 3
+ pavgw m0, m1
+ pavgw m1, m2
+.w16_filter_end2:
+ mov tlq, rsp
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w16_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r3d, dxd
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ lea r3d, [r5+dxq]
+ shr r5d, 6
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m1, m0
+ movu m0, [tlq+r5*2]
+ vpblendvb m2, m6, m1, m2
+ movu m1, [tlq+r5*2+2]
+ mova [dstq+strideq*0], m2
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w16_loop
+.w16_end_loop:
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 8
+ lea maxbased, [hq+31]
+ mov r3d, 63
+ cmp hd, 32
+ cmova maxbased, r3d
+ test angled, 0x400
+ jnz .w32_main
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp
+ paddw m0, m1
+ lea r5d, [maxbaseq-31]
+ psrlw m0, 2
+ mova [r3], m0
+.w32_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w32_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ jl .w32_filter_h8
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movzx r5d, word [tlq+62]
+ movzx r2d, word [tlq+60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3
+ psrlw m0, 2
+ mova [r3+32], m0
+ mov [r3+66], r5w
+ mov [r3+64], r2w
+ mov tlq, rsp
+ mov r3d, 65
+ cmp hd, 64
+ cmove maxbased, r3d
+ jmp .w32_main
+.w32_filter_h8:
+ vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
+ vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ mov tlq, rsp
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r3+32], xm0
+.w32_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r5d, dxd
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ psubw m3, m0
+.w32_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ add r5d, dxd
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m7, m3
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*1], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w32_loop
+.w32_end_loop:
+ mova [dstq+32*0], m6
+ mova [dstq+32*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [hq+63]
+ test angled, 0x400
+ jnz .w64_main
+ vpbroadcastd m2, [pw_3]
+ mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ paddw m1, m2
+ paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ mov r3, rsp
+ paddw m0, m1
+ lea r5d, [hq+32]
+ psrlw m0, 2
+ mova [r3], m0
+.w64_filter_loop:
+ mova m0, [tlq+30]
+ paddw m1, m2, [tlq+28]
+ add tlq, 32
+ paddw m0, [tlq+0]
+ pavgw m1, [tlq+4]
+ paddw m0, [tlq+2]
+ add r3, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r3], m0
+ sub r5d, 16
+ jg .w64_filter_loop
+ movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ punpckhwd m1, m0, m0
+ paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ mov tlq, rsp
+ psrlw m0, 2
+ mova [r3+32], m0
+.w64_main:
+ movd xm4, dxd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ movd xm0, maxbased
+ mov r5d, dxd
+ vpbroadcastd m7, [pw_m1024] ; -16 * 64
+ vpbroadcastw m0, xm0
+ paddw m3, m4, [z_base_inc]
+ paddw m8, m7, m7 ; -32 * 64
+ psubw m3, m0
+ paddw m9, m8, m7 ; -48 * 64
+.w64_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3*2]
+ movu m1, [tlq+r3*2+2]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ psraw m1, m3, 15
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*0], m0
+ movu m0, [tlq+r3*2+32]
+ movu m1, [tlq+r3*2+34]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*1], m0
+ movu m0, [tlq+r3*2+64]
+ movu m1, [tlq+r3*2+66]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3
+ vpblendvb m0, m6, m0, m1
+ mova [dstq+32*2], m0
+ movu m0, [tlq+r3*2+96]
+ movu m1, [tlq+r3*2+98]
+ add r5d, dxd
+ psubw m1, m0
+ pmulhrsw m1, m2
+ pcmpgtw m2, m9, m3
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [dstq+32*3], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w64_loop
+.w64_end_loop:
+ mova [dstq+32*0], m6
+ mova [dstq+32*1], m6
+ mova [dstq+32*2], m6
+ mova [dstq+32*3], m6
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
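Away from the upsample and edge-filter special cases, every z1 width above follows the same per-row recipe that the register comments describe: xpos advances by dx per row, its 6.6 fixed-point split gives base and frac, and anything at or past max_base_x saturates to top[max_base_x]. A scalar sketch of one row:

#include <stdint.h>

static void z1_row_sketch(uint16_t *dst, const uint16_t *top, int w,
                          int pos /* (y + 1) * dx */, int max_base_x)
{
    const int frac = pos & 0x3e;  /* the pand with pw_62 */
    for (int x = 0, base = pos >> 6; x < w; x++, base++) {
        if (base >= max_base_x) {
            dst[x] = top[max_base_x];
            continue;
        }
        const int a = top[base], b = top[base + 1];
        /* (a * (64 - frac) + b * frac + 32) >> 6 */
        dst[x] = (uint16_t)(a + (((b - a) * frac + 32) >> 6));
    }
}
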
+cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+ lea r9, [ipred_z2_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea dxq, [dr_intra_derivative-90]
+ movsxd wq, [r9+wq*4]
+ mova m1, [tlq- 0]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mova m2, [tlq- 32]
+ mov r8, dxq
+ sub dxq, dyq
+ mova m3, [tlq- 64]
+ add wq, r9
+ add r9, z_filter_t0-ipred_z2_16bpc_avx2_table
+ mova m4, [tlq- 96]
+ and dyd, ~1
+ mova m5, [tlq-128]
+ and dxq, ~1
+ movzx dyd, word [r8+dyq] ; angle - 90
+ movzx dxd, word [dxq+270] ; 180 - angle
+ vpbroadcastd m11, [base+pw_62]
+ mova [rsp+128], m1
+ mova [rsp+ 96], m2
+ mova [rsp+ 64], m3
+ neg dxd
+ mova [rsp+ 32], m4
+ neg dyq
+ mova [rsp+ 0], m5
+ jmp wq
+.w4:
+ vbroadcasti128 m10, [base+z2_x_shuf]
+ vpbroadcastq m6, [base+z_base_inc+2]
+ lea r8d, [dxq+(65<<6)] ; xpos
+ mov r10d, (63-4)<<6
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ movq xm0, [tlq+2] ; 1 2 3 4
+ movq xm1, [tlq+0] ; 0 1 2 3
+ pshuflw xm2, xm0, q3321 ; 2 3 4 4
+ pshuflw xm3, xm1, q2100 ; 0 0 1 2
+ vpbroadcastw xm4, r8m ; pixel_max
+ vbroadcasti128 m10, [base+z_upsample]
+ paddw xm1, xm0
+ paddw xm2, xm3
+ lea r8d, [r8+dxq+(1<<6)]
+ psubw xm2, xm1, xm2
+ add dxd, dxd
+ psraw xm2, 3
+ pxor xm3, xm3
+ sub r10d, 3<<6
+ paddw xm1, xm2
+ paddw m6, m6
+ pmaxsw xm1, xm3
+ sub angled, 1075 ; angle - 53
+ pavgw xm1, xm3
+ lea r3d, [hq+3]
+ pminsw xm1, xm4
+ xor angled, 0x7f ; 180 - angle
+ punpcklwd xm1, xm0
+ movu [rsp+130], xm1
+ call .filter_strength
+ jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
+ movd xm8, r3d
+ mov r3d, angled
+ movd xm7, angled
+ vpbroadcastb m8, xm8
+ shr r3d, 8 ; is_sm << 1
+ vpbroadcastb m7, xm7
+ pcmpeqb m8, [base+z_filter_wh]
+ mova xm9, [r9+r3*8]
+ pand m0, m8, m7
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ ret
+ALIGN function_align
+.upsample_left: ; h4/h8
+ mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1
+ movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0
+ vpbroadcastw xm4, r8m ; pixel_max
+ cmp hd, 8
+ je .upsample_left_h8
+ pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2
+ pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0
+ jmp .upsample_left_end
+.upsample_left_h8:
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2
+ pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0
+.upsample_left_end:
+ paddw xm1, xm0
+ paddw xm2, xm3
+ psubw xm2, xm1, xm2
+ add dyq, dyq
+ psraw xm2, 3
+ pxor xm3, xm3
+ paddw xm1, xm2
+ pmaxsw xm1, xm3
+ pavgw xm1, xm3
+ pminsw xm1, xm4
+ punpcklwd xm2, xm0, xm1
+ punpckhwd xm0, xm1
+ mova [rsp+ 96+gprsize], xm2
+ mova [rsp+112+gprsize], xm0
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ psrldq xm0, xm1, 2 ; 1 2 3 4
+ pshuflw xm2, xm1, q2100 ; 0 0 1 2
+ pmullw xm4, xm0
+ pshuflw xm3, xm0, q3321 ; 2 3 4 4
+ paddw xm1, xm3
+ pshuflw xm3, xm0, q3332 ; 3 4 4 4
+ pmullw xm1, xm5
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2]
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm2, xm5
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16]
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11 ; clip to byte range since there's no variable word blend
+ pavgw xm1, xm4
+ vpblendvb xm1, xm0, xm3
+ movq [rsp+130], xm1
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm0, [base+pb_90]
+ psubb xm0, xm7 ; 180 - angle
+ pand xm0, xm8 ; reuse from previous filter_strength call
+ pcmpgtb xm0, xm9
+ pmovmskb r3d, xm0
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ cmp r3d, 3
+ je .w4_filter_left_s3
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8
+ jl .w4_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w4_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w4_filter_left_end
+.w4_upsample_left:
+ call .upsample_left
+ mov r11, -16
+ vbroadcasti128 m9, [base+z_upsample]
+ jmp .w4_main_upsample_left
+.w4_filter_left_s3: ; can only be h16
+ movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m4, [base+pw_3]
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ punpcklwd xm3, xm0, xm0
+ paddw m2, m4
+ vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ jmp .w4_filter_left_end2
+.w4_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w4_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2
+.w4_filter_left_end2:
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1]
+ pminsw m5, m11
+ vpblendvb m1, m0, m5
+ mova [rsp+96], m1
+.w4_main:
+ vbroadcasti128 m9, [base+z2_x_shuf]
+ mov r11, -8
+.w4_main_upsample_left:
+ movd xm5, dyd
+ mova m4, [base+z2_y_shuf_h4]
+ mov r2d, r8d
+ movd xm0, dxd
+ vpbroadcastw m5, xm5
+ rorx r5, dyq, 5
+ lea r8d, [dyq*3]
+ pmullw m5, [base+z2_ymul]
+ rorx r9, dyq, 4
+ sar dyd, 6
+ vpbroadcastw m0, xm0
+ sar r8d, 6
+ pand m5, m11 ; frac_y
+ neg dyd
+ psllw m5, 9
+ add r5d, dyd
+ add r8d, dyd
+ add r9d, dyd
+ paddw m7, m0, m0
+ lea dyq, [rsp+dyq*2+126]
+ vpblendd m0, m7, 0xcc
+ add dyq, r11
+ neg r5d
+ paddw m1, m0, m7
+ neg r8d
+ vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3
+ neg r9d
+ paddw m7, m7
+ paddw m6, m0
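+; each iteration interpolates four output rows from the top edge at their
+; respective xpos; lanes whose xpos has run past the top-left corner (sign
+; bit of m6) are replaced with pixels interpolated from the left edge.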
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu xm3, [rsp+r3*2]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ vinserti128 m3, [rsp+r3*2], 1
+ pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3
+ pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3
+ pand m2, m11, m6
+ punpcklqdq m0, m1, m3
+ punpckhqdq m1, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w4_toponly
+ movu xm2, [dyq]
+ vinserti128 m2, [dyq+r8*2], 1
+ movu xm3, [dyq+r5*2]
+ vinserti128 m3, [dyq+r9*2], 1
+ pshufb m2, m9
+ pshufb m3, m9
+ punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m5
+ psraw m3, m6, 15 ; base_x < topleft
+ paddw m1, m2
+ vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3
+ vpblendvb m0, m1, m3
+.w4_toponly:
+ paddw m6, m7 ; xpos += dx
+ lea r3, [strideq*3]
+ add dyq, r11
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r10d
+ jge .w4_loop
+.w4_leftonly_loop:
+ movu xm1, [dyq]
+ vinserti128 m1, [dyq+r8*2], 1
+ movu xm2, [dyq+r5*2]
+ vinserti128 m2, [dyq+r9*2], 1
+ add dyq, r11
+ pshufb m1, m9
+ pshufb m2, m9
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m5
+ paddw m0, m1
+ vpermd m0, m4, m0
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ RET
+.w8:
+ mov r10d, hd
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ xor r8d, r8d
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8
+ mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7
+ pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8
+ pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastw xm4, r8m ; pixel_max
+ paddw xm1, xm0
+ paddw xm2, xm3
+ not r8d
+ psubw xm2, xm1, xm2
+ add dxd, dxd
+ psraw xm2, 3
+ sub angled, 53 ; angle - 53
+ pxor xm3, xm3
+ paddw xm2, xm1
+ lea r3d, [hq+7]
+ pmaxsw xm2, xm3
+ xor angled, 0x7f ; 180 - angle
+ pavgw xm2, xm3
+ pminsw xm2, xm4
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+ movu [rsp+130], xm1
+ movu [rsp+146], xm2
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2]
+ movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x
+ pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x
+ pmullw xm4, xm0
+ pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x
+ paddw xm1, xm3
+ vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x
+ paddw xm2, xm3
+ vpbroadcastd xm3, r6m ; max_width
+ pmullw xm1, xm5
+ pmullw xm2, xm6
+ packssdw xm3, xm3
+ paddw xm1, xm4
+ paddw xm1, xm2
+ psubw xm3, [base+pw_1to16]
+ pxor xm4, xm4
+ psrlw xm1, 3
+ pminsw xm3, xm11
+ pavgw xm1, xm4
+ vpblendvb xm1, xm0, xm3
+ movu [rsp+130], xm1
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ cmp r3d, 3
+ jne .w8_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 16 ; flags needed for later
+ jmp .filter_left_s3b
+.w8_upsample_left:
+ call .upsample_left
+ vbroadcasti128 m7, [base+z2_y_shuf_us]
+ lea r11, [rsp+118]
+ mov r8, -8
+ jmp .w8_main_upsample_left
+.w16_filter_left_s12:
+ xor r8d, r8d
+.w8_filter_left_s12:
+ mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ vpbroadcastd m5, r7m ; max_height
+ vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pmullw m2, m0
+ cmp hd, 8
+ jl .w8_filter_left_h4
+ movu m4, [tlq-34]
+ punpcklwd m1, m0, m0
+ vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e
+ je .w8_filter_left_end
+ vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ jmp .w8_filter_left_end
+.w8_filter_left_h4:
+ pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e
+.w8_filter_left_end:
+ paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pmullw m1, m3
+ paddw m1, m2
+ pxor m2, m2
+ psrlw m1, 3
+ pavgw m1, m2
+ packssdw m5, m5
+ psubw m5, [base+pw_16to1]
+ pminsw m5, m11
+ vpblendvb m1, m0, m5
+ mova [rsp+96], m1
+ test r8d, r8d
+ jz .w8_main
+; upsample_main
+ vbroadcasti128 m10, [base+z_upsample]
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r5, [rsp+120]
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq
+ paddw m4, m4
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2
+ psllw xm1, 3
+ vpblendd m2, m5, 0xf0
+ lea r2d, [dxq+(66<<6)] ; xpos
+ paddw m4, m2
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0
+ pand m6, m11
+ punpckhwd xm9, xm8, xm1
+ psllw m6, 9
+ punpcklwd xm8, xm1
+.w8_upsample_above_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6
+ movu xm1, [rsp+r2*2]
+ movu xm2, [rsp+r2*2+16]
+ lea r2d, [r3+dxq]
+ shr r3d, 6
+ vinserti128 m1, [rsp+r3*2], 1
+ vinserti128 m2, [rsp+r3*2+16], 1
+ pshufb m1, m10
+ pshufb m2, m10
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2
+ pand m2, m11, m4
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w8_upsample_above_toponly
+ mova m1, m5
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15
+ vpblendvb m0, m1, m2
+.w8_upsample_above_toponly:
+ paddw m4, m5
+ sub r5, 4
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_ret
+ lea dstq, [dstq+strideq*2]
+ jmp .w8_upsample_above_loop
+.w8_main:
+ vbroadcasti128 m7, [base+z2_y_shuf]
+ lea r11, [rsp+120]
+ mov r8, -4
+.w8_main_upsample_left:
+ movd xm1, dyd
+ vbroadcasti128 m4, [base+z_base_inc+2]
+ movd xm2, dxd
+ vpbroadcastw m1, xm1
+ vpbroadcastw m2, xm2
+ mov r7, dstq
+ pmullw m0, m1, [base+z2_ymul8]
+ paddw m5, m2, m2
+ psllw xm1, 3
+ vpblendd m2, m5, 0xf0 ; xpos0 xpos1
+ lea r9d, [dxq+(65<<6)] ; xpos
+ paddw m4, m2
+ movd [rsp+284], xm1
+.w8_loop0:
+ mov r2d, r9d
+ mova [rsp+288], m0
+ mov r5, r11
+ mova [rsp+320], m4
+ pshufd m6, m0, q2020
+ psraw xm0, 6
+ pxor xm1, xm1
+ psubw xm8, xm1, xm0 ; base_y
+ pand m6, m11 ; frac_y
+ punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7
+ psllw m6, 9
+ punpcklwd xm8, xm1 ; base_y 0 1 4 5
+.w8_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2*2]
+ movu xm1, [rsp+r2*2+2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vinserti128 m0, [rsp+r3*2], 1
+ vinserti128 m1, [rsp+r3*2+2], 1
+ pand m2, m11, m4
+ psubw m1, m0
+ psllw m2, 9
+ pmulhrsw m1, m2
+ paddw m0, m1
+ cmp r3d, 64
+ jge .w8_toponly
+ mova m1, m5
+ vpgatherdq m3, [r5+xm9*2], m5
+ mova m5, m1
+ vpgatherdq m2, [r5+xm8*2], m1
+ pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1
+ pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1
+ punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m2, m3
+ psubw m2, m1
+ pmulhrsw m2, m6
+ paddw m1, m2
+ vpermq m1, m1, q3120
+ psraw m2, m4, 15 ; base_x < topleft
+ vpblendvb m0, m1, m2
+.w8_toponly:
+ paddw m4, m5 ; xpos += dx
+ add r5, r8
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, (63-8)<<6
+ jge .w8_loop
+.w8_leftonly_loop:
+ mova m0, m5
+ vpgatherdq m4, [r5+xm9*2], m5
+ mova m5, m0
+ vpgatherdq m3, [r5+xm8*2], m0
+ add r5, r8
+ pshufb m2, m4, m7
+ pshufb m1, m3, m7
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ psubw m1, m0
+ pmulhrsw m1, m6
+ paddw m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_leftonly_loop
+.w8_end:
+ sub r10d, 1<<8
+ jl .w8_ret
+ vpbroadcastd m0, [rsp+284]
+ add r7, 16
+ paddw m0, [rsp+288] ; base_y += 8*dy
+ add r9d, 8<<6
+ vpbroadcastd m4, [pw_512]
+ movzx hd, r10b
+ paddw m4, [rsp+320] ; base_x += 8*64
+ mov dstq, r7
+ jmp .w8_loop0
+.w8_ret:
+ RET
+.w16:
+ movd xm0, [tlq+32]
+ lea r10d, [hq+(1<<8)]
+ movd [rsp+160], xm0
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2]
+ movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ punpcklwd xm2, xm1, xm1
+ vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ punpckhwd m3, m0, m0
+ pmullw m4, m0
+ vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ paddw m1, m3
+ vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
+ paddw m2, m3
+ vpbroadcastd m3, r6m ; max_width
+ pmullw m1, m5
+ pmullw m2, m6
+ packssdw m3, m3
+ paddw m1, m4
+ paddw m1, m2
+ psubw m3, [base+pw_1to16]
+ pxor m4, m4
+ psrlw m1, 3
+ pminsw m3, m11
+ pavgw m1, m4
+ vpblendvb m1, m0, m3
+ movu [rsp+130], m1
+.w16_no_filter_above:
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ cmp r3d, 3
+ jne .w16_filter_left_s12
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m7, [base+pw_16]
+ cmp hd, 4
+ jne .filter_left_s3
+ movq xm0, [tlq-8] ; 0 1 2 3
+ movq xm1, [tlq-6] ; 1 2 3 4
+ vpbroadcastd xm5, r7m ; max_height
+ movq xm4, [base+pw_16to1+24] ; 4to1
+ pshuflw xm2, xm0, q2100 ; 0 0 1 2
+ pshuflw xm3, xm1, q3321 ; 2 3 4 4
+ paddw xm1, xm0
+ paddw xm1, xm2
+ pshuflw xm2, xm0, q1000 ; 0 0 0 1
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, xm4
+ paddw xm1, xm2
+ pminsw xm5, xm11
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5
+ movq [rsp+120], xm1
+ jmp .w8_main
+.w32:
+ mova m2, [tlq+32]
+ movd xm0, [tlq+64]
+ lea r10d, [hq+(3<<8)]
+ mova [rsp+160], m2
+ movd [rsp+192], xm0
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastd m6, [base+pw_3]
+ vpbroadcastd m0, r6m ; max_width
+ vpbroadcastd m7, [base+pw_16]
+ mov r3d, 32
+ packssdw m0, m0
+ psubw m0, [base+pw_1to16]
+ pminsw m8, m0, m11
+ psubw m9, m8, m7
+.w32_filter_above:
+ movu m0, [tlq+2]
+ punpcklwd xm4, xm1, xm1
+ paddw m2, m6, [tlq+6]
+ paddw m1, m0
+ vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m1, [tlq+4]
+ movu m3, [tlq+r3+2]
+ paddw m5, m6, [tlq+r3-2]
+ pavgw m2, m4
+ punpckhwd m4, m3, m3
+ paddw m1, m2
+ vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
+ vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
+ pavgw m2, m5
+ paddw m5, m3, [tlq+r3]
+ paddw m4, m5
+ psrlw m1, 2
+ paddw m2, m4
+ vpblendvb m1, m0, m8
+ psrlw m2, 2
+ vpblendvb m2, m3, m9
+ movu [rsp+130], m1
+ movu [rsp+r3+130], m2
+.filter_left_s3:
+ cmp hd, 16
+ jl .filter_left_s3_h8 ; h8
+.filter_left_s3b:
+ mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ vpbroadcastd m5, r7m ; max_height
+ paddw m1, m0, m2
+ punpckhwd m2, m2
+ mov r3d, hd
+ vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ packssdw m5, m5
+ not r3
+ psubw m5, [base+pw_16to1]
+ paddw m2, m6
+ pminsw m8, m11, m5
+ je .filter_left_s3_end ; h16
+ paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j
+ psubw m8, m7
+ mova [rsp+96], m3
+ jnp .filter_left_s3_end ; h32
+ mova m5, [tlq-96]
+ paddw m1, [tlq-66]
+ pavgw m2, [tlq-68]
+ paddw m1, m2
+ paddw m4, m5, [tlq-94]
+ paddw m2, m6, [tlq-92]
+ psrlw m1, 2
+ paddw m4, [tlq- 98]
+ pavgw m2, [tlq-100]
+ vpblendvb m3, m1, m0, m8
+ mova m0, [tlq-128]
+ psubw m8, m7
+ paddw m4, m2
+ paddw m1, m0, [tlq-126]
+ paddw m2, m6, [tlq-124]
+ psrlw m4, 2
+ mova [rsp+64], m3
+ vpblendvb m4, m5, m8
+ psubw m8, m7
+ mova [rsp+32], m4
+.filter_left_s3_end:
+ punpcklwd xm3, xm0, xm0
+ vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m4
+ pavgw m2, m3
+ paddw m1, m2
+ psrlw m1, 2
+ vpblendvb m1, m0, m8
+ mova [rsp+r3*2+130], m1
+ jmp .w8_main
+.filter_left_s3_h8:
+ mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8
+ pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6
+ vpbroadcastd xm5, r7m ; max_height
+ paddw xm1, xm0, xm3
+ pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8
+ paddw xm1, xm2
+ vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5
+ paddw xm3, xm6
+ packssdw xm5, xm5
+ pavgw xm2, xm3
+ psubw xm5, [base+pw_16to1+16] ; 8to1
+ paddw xm1, xm2
+ pminsw xm5, xm11
+ psrlw xm1, 2
+ vpblendvb xm1, xm0, xm5
+ mova [rsp+112], xm1
+ jmp .w8_main
+.w64:
+ mova m2, [tlq+ 32]
+ mova m3, [tlq+ 64]
+ mova m4, [tlq+ 96]
+ movd xm0, [tlq+128]
+ lea r10d, [hq+(7<<8)]
+ mova [rsp+160], m2
+ mova [rsp+192], m3
+ mova [rsp+224], m4
+ movd [rsp+256], xm0
+ test angled, 0x400
+ jnz .w8_main
+ vpbroadcastd m6, [base+pw_3]
+ movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h
+ paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h
+ movu m4, [tlq+66]
+ paddw m3, m6, [tlq+62]
+ paddw m7, m4, [tlq+64]
+ pavgw m3, [tlq+70]
+ paddw m7, [tlq+68]
+ paddw m2, m5
+ vpbroadcastd m5, r6m ; max_width
+ mov r3d, 96
+ packssdw m5, m5
+ paddw m3, m7
+ psubw m5, [base+pw_1to16]
+ psrlw m2, 2
+ vpbroadcastd m7, [base+pw_16]
+ psrlw m3, 2
+ pminsw m8, m11, m5
+ psubw m9, m8, m7
+ vpblendvb m2, m0, m9
+ psubw m9, m7
+ vpblendvb m3, m4, m9
+ psubw m9, m7
+ movu [rsp+162], m2
+ movu [rsp+194], m3
+ jmp .w32_filter_above
+
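+; z3 predicts from the left edge only (angles between 180 and 270 degrees);
+; dy is looked up in dr_intra_derivative and ypos advances by dy per output
+; column.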
+cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z3_16bpc_avx2_table]
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ lea r7, [dr_intra_derivative+45*2-1]
+ sub tlq, 2
+ movsxd hq, [r6+hq*4]
+ sub angled, 180
+ add hq, r6
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ movzx dyd, word [r7+dyq]
+ vpbroadcastd m5, [pw_62]
+ mov org_wd, wd
+ jmp hq
+.h4:
+ ALLOC_STACK -64, 7
+ lea r7, [strideq*3]
+ cmp angleb, 40
+ jae .h4_no_upsample
+ lea r4d, [angleq-1024]
+ sar r4d, 7
+ add r4d, wd
+ jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+ mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ pshufd xm3, xm1, q0000
+ paddw xm1, xm2
+ paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastw xm4, r8m ; pixel_max
+ add dyd, dyd
+ psubw xm0, xm1, xm0
+ mova [rsp+ 0], xm3
+ movd xm3, dyd
+ psraw xm0, 3
+ neg dyd
+ paddw xm1, xm0
+ pxor xm0, xm0
+ lea r2d, [dyq+(16<<6)+63] ; ypos
+ pmaxsw xm1, xm0
+ pavgw xm1, xm0
+ vpbroadcastw m3, xm3
+ pminsw xm1, xm4
+ punpckhwd xm0, xm1, xm2
+ punpcklwd xm1, xm2
+ paddw m2, m3, m3
+ mova [rsp+32], xm0
+ punpcklwd m3, m2
+ mova [rsp+16], xm1
+ paddw m4, m2, m2
+ paddw m2, m3
+ vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+.h4_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ movu xm1, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ movu xm2, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ vinserti128 m1, [rsp+r2*2], 1
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ vinserti128 m2, [rsp+r4*2], 1
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m3, m4
+ paddw m1, m0
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ add dstq, 8
+ sub wd, 4
+ jg .h4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+ lea r4, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm1, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m1, xm1
+ pcmpeqb m0, [base+z_filter_wh]
+ pand m0, m1
+ mova xm1, [r4+angleq*8]
+ pcmpgtb m0, m1
+ pmovmskb r5d, m0
+ ret
+.h4_no_upsample:
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea maxbased, [wq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7
+ movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8
+ vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw xm2, xm0
+ pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm1, xm0, xm3
+ movd [rsp+12], xm0
+ pmullw xm1, xm4
+ cmp r5d, 3
+ jne .h4_filter_3tap
+ pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
+ vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
+ movzx r4d, word [tlq-14]
+ movzx r2d, word [tlq-12]
+ inc maxbased
+ paddw xm1, xm2
+ paddw xm0, xm3
+ sub r2d, r4d
+ paddw xm2, xm0, xm0
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+14], r2w
+.h4_filter_3tap:
+ pxor xm0, xm0
+ paddw xm1, xm2
+ lea tlq, [rsp+30]
+ psrlw xm1, 3
+ cmp wd, 8
+ sbb maxbased, -1
+ pavgw xm0, xm1
+ mova [rsp+16], xm0
+.h4_main:
+ movd xm3, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m3, xm3
+ lea r4d, [maxbaseq+3*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 8
+ lea r4, [dyq+63] ; ypos
+ punpcklwd m1, m1
+ paddw m0, m3, m3
+ vpbroadcastw m2, xm2
+ punpcklwd m3, m0
+ paddw m4, m0, m0
+ paddw m0, m3
+ psubw m2, m1
+ vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3
+ or maxbased, 63
+ paddw m3, m2
+.h4_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu xm2, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ vinserti128 m1, [tlq+r4*2], 1
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vinserti128 m2, [tlq+r5*2], 1
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ pand m2, m5, m3
+ palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ psraw m2, m3, 15 ; ypos < max_base_y
+ paddw m3, m4
+ paddw m1, m0
+ vpblendvb m1, m6, m1, m2
+ vextracti128 xm2, m1, 1
+ punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
+ movhps [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r7 ], xm1
+ sub wd, 4
+ jz .h4_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h4_loop
+.h4_end_loop:
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ movq [dstq+strideq*2], xm6
+ movq [dstq+r7 ], xm6
+ add dstq, 8
+ sub wd, 4
+ jg .h4_end_loop
+.h4_end:
+ RET
+.h8:
+ lea r4d, [angleq+216]
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 8
+ mov r4b, wb
+ lea r7, [strideq*3]
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d
+ cmp wd, 8
+ je .h8_upsample_w8
+ pshufhw xm3, xm2, q1000
+ vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d
+.h8_upsample_w8:
+ paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastw m4, r8m ; pixel_max
+ add dyd, dyd
+ psubw m0, m1, m0
+ movd xm6, dyd
+ psraw m0, 3
+ neg dyd
+ paddw m1, m0
+ pxor m0, m0
+ pmaxsw m1, m0
+ lea r4d, [dyq+(16<<6)+63] ; ypos
+ pavgw m1, m0
+ vpbroadcastw m6, xm6
+ pminsw m1, m4
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+ vextracti128 [rsp+48], m0, 1
+ vextracti128 [rsp+32], m1, 1
+ paddw m7, m6, m6
+ mova [rsp+16], xm0
+ mova [rsp+ 0], xm1
+ punpcklwd m6, m7 ; ypos0 ypos1
+.h8_upsample_loop:
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base0
+ movu m1, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base1
+ movu m2, [rsp+r2*2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base2
+ movu m3, [rsp+r4*2]
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base3
+ movu m4, [rsp+r2*2]
+ psrld m0, m1, 16
+ pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
+ pslld m2, 16
+ pblendw m1, m2, 0xaa
+ psrld m2, m3, 16
+ pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ pand m4, m5, m6
+ paddw m6, m7
+ psllw m4, 9
+ psubw m1, m0
+ pmulhrsw m1, m4
+ pand m4, m5, m6
+ psllw m4, 9
+ psubw m3, m2
+ pmulhrsw m3, m4
+ paddw m6, m7
+ lea r2, [dstq+strideq*4]
+ paddw m1, m0
+ paddw m3, m2
+ punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ movhps [r2 +strideq*0], xm0
+ movq [r2 +strideq*1], xm0
+ movhps [r2 +strideq*2], xm1
+ movq [r2 +r7 ], xm1
+ movhps [dstq+strideq*0], xm2
+ movq [dstq+strideq*1], xm2
+ movhps [dstq+strideq*2], xm3
+ movq [dstq+r7 ], xm3
+ add dstq, 8
+ sub wd, 4
+ jg .h8_upsample_loop
+ RET
+.h8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(w+7, 15)
+ jmp .h8_main
+.h8_no_upsample:
+ lea maxbased, [wq+7]
+ test angled, 0x400
+ jnz .h8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h8_main
+ popcnt r5d, r5d
+ mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m0
+ cmp wd, 8
+ jl .h8_filter_w4
+ punpcklwd xm0, xm0
+ vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movd [rsp+28], xm0
+ paddw m1, m3
+ mov r4d, 16
+ pmullw m1, m4
+ cmovg maxbased, r4d
+ cmp r5d, 3
+ jne .h8_filter_3tap
+ punpckhwd m3, m3
+ vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
+ movzx r4d, word [tlq-30]
+ movzx r2d, word [tlq-28]
+ inc maxbased
+ paddw m1, m2
+ paddw m0, m3
+ sub r2d, r4d
+ paddw m2, m0, m0
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+30], r2w
+ jmp .h8_filter_3tap
+.h8_filter_w4:
+ pshufhw xm1, xm0, q2100
+ vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e
+ paddw m1, m3
+ pmullw m1, m4
+.h8_filter_3tap:
+ pxor m0, m0
+ paddw m1, m2
+ lea tlq, [rsp+62]
+ psrlw m1, 3
+ pavgw m0, m1
+ mova [rsp+32], m0
+.h8_main:
+ movd xm4, dyd
+ neg maxbaseq
+ vbroadcasti128 m1, [z_base_inc]
+ vpbroadcastw m7, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ lea r4d, [maxbaseq+7*64]
+ neg dyq
+ movd xm2, r4d
+ sub tlq, 16
+ lea r4, [dyq+63]
+ paddw m6, m4, m4
+ vpbroadcastw m2, xm2
+ vpblendd m4, m6, 0xf0 ; ypos0 ypos1
+ psubw m2, m1
+ or maxbased, 63
+ paddw m4, m2
+.h8_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu xm0, [tlq+r4*2+2]
+ movu xm1, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ vinserti128 m0, [tlq+r5*2+2], 1
+ vinserti128 m1, [tlq+r5*2], 1
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m1, m0
+ pmulhrsw m1, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ paddw m0, m1
+ movu xm1, [tlq+r4*2+2]
+ movu xm2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m7, m0, m3
+ vinserti128 m1, [tlq+r5*2+2], 1
+ vinserti128 m2, [tlq+r5*2], 1
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ lea r5, [dstq+strideq*4]
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0
+ vextracti128 xm3, m2, 1
+ punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4
+ punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2
+ vextracti128 xm3, m0, 1
+ movhps [dstq+strideq*0], xm1
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
+ punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h8_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h8_loop
+ lea r6, [strideq*5]
+ lea r2, [strideq+r7*2] ; stride*7
+ test wd, 4
+ jz .h8_end_loop
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ movq [dstq+strideq*2], xm7
+ movq [dstq+r7 ], xm7
+ movq [dstq+strideq*4], xm7
+ movq [dstq+r6 ], xm7
+ movq [dstq+r7*2 ], xm7
+ movq [dstq+r2 ], xm7
+ add dstq, 8
+ sub wd, 4
+ jz .h8_end
+.h8_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ mova [dstq+strideq*2], xm7
+ mova [dstq+r7 ], xm7
+ mova [dstq+strideq*4], xm7
+ mova [dstq+r6 ], xm7
+ mova [dstq+r7*2 ], xm7
+ mova [dstq+r2 ], xm7
+ add dstq, 16
+ sub wd, 8
+ jg .h8_end_loop
+.h8_end:
+ RET
+.h16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(w+15, 31)
+ jmp .h16_main
+ALIGN function_align
+.h16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 10
+ lea maxbased, [wq+15]
+ lea r7, [strideq*3]
+ test angled, 0x400
+ jnz .h16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ pmullw m1, m7
+ paddw m1, m2
+ cmp wd, 8
+ jg .h16_filter_w16
+ mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pmullw xm6, xm3
+ jl .h16_filter_w4
+ pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+.h16_filter_w8_5tap:
+ punpckhwd m0, m0
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ paddw xm4, xm4
+ paddw m0, m0
+ paddw xm6, xm4
+ paddw m1, m0
+.h16_filter_w8_3tap:
+ paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8
+ pmullw xm3, xm7
+ pxor m0, m0
+ paddw xm3, xm6
+ psrlw xm3, 3
+ pavgw xm3, xm0
+ mova [rsp+48], xm3
+ jmp .h16_filter_end
+.h16_filter_w4:
+ pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6
+ cmp r5d, 3
+ jne .h16_filter_w8_3tap
+ pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5
+ jmp .h16_filter_w8_5tap
+.h16_filter_w16:
+ mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ pmullw m6, m3
+ punpcklwd xm3, xm3
+ vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ mov r4d, 32
+ cmp wd, 16
+ cmovg maxbased, r4d
+ movd [rsp+28], xm3
+ pmullw m4, m7
+ cmp r5d, 3
+ jne .h16_filter_w16_3tap
+ punpckhwd m0, m0
+ vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ movzx r4d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ or maxbased, 1
+ paddw m3, m3
+ sub r2d, r4d
+ paddw m0, m0
+ lea r2d, [r2+r4*8+4]
+ paddw m4, m3
+ shr r2d, 3
+ paddw m1, m0
+ mov [rsp+30], r2w
+.h16_filter_w16_3tap:
+ pxor m0, m0
+ paddw m4, m6
+ psrlw m4, 3
+ pavgw m4, m0
+ mova [rsp+32], m4
+.h16_filter_end:
+ psrlw m1, 3
+ lea tlq, [rsp+94]
+ pavgw m1, m0
+ mova [rsp+64], m1
+.h16_main:
+ movd xm8, dyd
+ neg maxbaseq
+ vpbroadcastw m9, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m8, xm8
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm7, r4d
+ sub tlq, 32
+ lea r4, [dyq+63]
+ vpbroadcastw m7, xm7
+ or maxbased, 63
+ psubw m7, [z_base_inc]
+.h16_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ movu m0, [tlq+r4*2+2]
+ movu m2, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ movu m1, [tlq+r5*2+2]
+ movu m3, [tlq+r5*2]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m2, m0
+ pmulhrsw m2, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m0, m2
+ movu m2, [tlq+r4*2+2]
+ movu m4, [tlq+r4*2]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ vpblendvb m0, m9, m0, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m3, m1
+ pmulhrsw m3, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m1, m3
+ vpblendvb m1, m9, m1, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m2
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ paddw m2, m4
+ movu m3, [tlq+r5*2+2]
+ movu m4, [tlq+r5*2]
+ vpblendvb m2, m9, m2, m6
+ pand m6, m5, m7
+ psllw m6, 9
+ psubw m4, m3
+ pmulhrsw m4, m6
+ psraw m6, m7, 15
+ paddw m7, m8
+ lea r5, [dstq+strideq*4]
+ paddw m3, m4
+ vpblendvb m3, m9, m3, m6
+ punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0
+ vextracti128 xm6, m3, 1
+ punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4
+ punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6
+ vextracti128 xm2, m4, 1
+ movhps [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm6
+ vextracti128 xm6, m1, 1
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r7 ], xm2
+ vextracti128 xm2, m0, 1
+ movhps [r5 +strideq*0], xm6
+ movq [r5 +strideq*1], xm6
+ movhps [r5 +strideq*2], xm2
+ movq [r5 +r7 ], xm2
+ lea r5, [dstq+strideq*8]
+ movhps [r5 +strideq*0], xm3
+ movq [r5 +strideq*1], xm3
+ movhps [r5 +strideq*2], xm4
+ movq [r5 +r7 ], xm4
+ lea r5, [r5+strideq*4]
+ movhps [r5 +strideq*0], xm1
+ movq [r5 +strideq*1], xm1
+ movhps [r5 +strideq*2], xm0
+ movq [r5 +r7 ], xm0
+ sub wd, 4
+ jz .h16_end
+ add dstq, 8
+ cmp r4d, maxbased
+ jg .h16_loop
+ mov hd, 4
+.h16_end_loop0:
+ mov r6d, wd
+ mov r2, dstq
+ test wb, 4
+ jz .h16_end_loop
+ movq [dstq+strideq*0], xm9
+ movq [dstq+strideq*1], xm9
+ movq [dstq+strideq*2], xm9
+ movq [dstq+r7 ], xm9
+ and r6d, 120
+ jz .h16_end_w4
+ add dstq, 8
+.h16_end_loop:
+ mova [dstq+strideq*0], xm9
+ mova [dstq+strideq*1], xm9
+ mova [dstq+strideq*2], xm9
+ mova [dstq+r7 ], xm9
+ add dstq, 16
+ sub r6d, 8
+ jg .h16_end_loop
+.h16_end_w4:
+ lea dstq, [r2+strideq*4]
+ dec hd
+ jg .h16_end_loop0
+.h16_end:
+ RET
+.h32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -160, 9
+ lea maxbased, [wq+31]
+ and maxbased, 31
+ or maxbased, 32 ; imin(w+31, 63)
+ test angled, 0x400
+ jnz .h32_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+128]
+ paddw m0, m1
+ lea r5d, [maxbaseq-31]
+ psrlw m0, 2
+ mova [r4], m0
+.h32_filter_loop:
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58]
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h32_filter_loop
+ jl .h32_filter_h8
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ movzx r5d, word [tlq-62]
+ movzx r2d, word [tlq-60]
+ pavgw m2, m3
+ sub r2d, r5d
+ paddw m0, m1
+ lea r2d, [r2+r5*8+4]
+ paddw m0, m2
+ shr r2d, 3
+ psrlw m0, 2
+ mova [r4-32], m0
+ mov [r4-36], r5w
+ mov [r4-34], r2w
+ lea tlq, [rsp+158]
+ mov r4d, 65
+ cmp wd, 64
+ cmove maxbased, r4d
+ jmp .h32_main
+.h32_filter_h8:
+ mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7
+ pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
+ paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9
+ paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8
+ vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
+ lea tlq, [rsp+158]
+ pavgw xm2, xm3
+ paddw xm0, xm1
+ paddw xm0, xm2
+ psrlw xm0, 2
+ mova [r4-16], xm0
+.h32_main:
+ movd xm6, dyd
+ neg maxbaseq
+ vpbroadcastw m7, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ movd xm4, r4d
+ vpbroadcastd m8, [pw_m1024]
+ lea r4, [dyq+63]
+ vpbroadcastw m4, xm4
+ or maxbased, 63
+ psubw m4, [z_base_inc]
+.h32_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ pand m3, m5, m4
+ psllw m3, 9
+ psubw m1, m0
+ pmulhrsw m1, m3
+ pcmpgtw m2, m8, m4
+ paddw m0, m1
+ vpblendvb m0, m7, m0, m2
+ movu m2, [tlq+r5*2-32]
+ movu m1, [tlq+r5*2-30]
+ add r4, dyq
+ sub rsp, 64
+ psubw m2, m1
+ pmulhrsw m2, m3
+ psraw m3, m4, 15
+ paddw m4, m6
+ mova [rsp+32*0], m0
+ paddw m1, m2
+ vpblendvb m1, m7, m1, m3
+ mova [rsp+32*1], m1
+ dec wd
+ jz .h32_transpose
+ cmp r4d, maxbased
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 64
+ mova [rsp+32*0], m7
+ mova [rsp+32*1], m7
+ dec wd
+ jg .h32_end_loop
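+; the prediction loop above writes one fully predicted column per iteration
+; into a stack buffer; the transpose below turns those columns into output
+; rows (h64 does the same with 64-pixel columns).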
+.h32_transpose:
+ lea r3, [strideq*3]
+ lea r4, [strideq*5]
+ mov r8, dstq
+ lea r5, [strideq+r3*2]
+.h32_transpose_loop0:
+ lea r6, [rsp+32]
+ lea r2, [r8+org_wq*2-16]
+.h32_transpose_loop:
+ mova m0, [r6+64*7]
+ mova m1, [r6+64*6]
+ mova m2, [r6+64*5]
+ mova m3, [r6+64*4]
+ mova m4, [r6+64*3]
+ mova m5, [r6+64*2]
+ mova m6, [r6+64*1]
+ mova m7, [r6+64*0]
+ punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
+ punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4
+ punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
+ punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4
+ punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
+ punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4
+ punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
+ punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4
+ lea dstq, [r2+strideq*8]
+ sub r6, 32
+ punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
+ punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2
+ punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
+ punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2
+ punpckhqdq m5, m7, m1 ; 8 0
+ vextracti128 [r2 +strideq*0], m5, 1
+ punpcklqdq m7, m1 ; 9 1
+ mova [dstq+strideq*0], xm5
+ punpckhqdq m1, m8, m3 ; 10 2
+ vextracti128 [r2 +strideq*1], m7, 1
+ punpcklqdq m8, m3 ; 11 3
+ mova [dstq+strideq*1], xm7
+ punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
+ vextracti128 [r2 +strideq*2], m1, 1
+ punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6
+ mova [dstq+strideq*2], xm1
+ punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
+ vextracti128 [r2 +r3 ], m8, 1
+ punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6
+ mova [dstq+r3 ], xm8
+ punpckhqdq m6, m3, m2 ; 12 4
+ vextracti128 [r2 +strideq*4], m6, 1
+ punpcklqdq m3, m2 ; 13 5
+ mova [dstq+strideq*4], xm6
+ punpckhqdq m2, m0, m4 ; 14 6
+ vextracti128 [r2 +r4 ], m3, 1
+ punpcklqdq m0, m4 ; 15 7
+ mova [dstq+r4 ], xm3
+ vextracti128 [r2 +r3*2 ], m2, 1
+ mova [dstq+r3*2 ], xm2
+ vextracti128 [r2 +r5 ], m0, 1
+ mova [dstq+r5 ], xm0
+ lea r2, [dstq+strideq*8]
+ cmp r6, rsp
+ jae .h32_transpose_loop
+ add rsp, 64*8
+ sub org_wd, 8
+ jg .h32_transpose_loop0
+.h32_end:
+ RET
+.h64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -256, 10
+ lea maxbased, [wq+63]
+ test angled, 0x400
+ jnz .h64_main
+ vpbroadcastd m2, [pw_3]
+ movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
+ punpckhwd m1, m0, m0
+ vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
+ paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m1, m2
+ paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ lea r4, [rsp+224]
+ paddw m0, m1
+ lea r5d, [wq+32]
+ psrlw m0, 2
+ mova [r4], m0
+.h64_filter_loop:
+ mova m0, [tlq-62]
+ paddw m1, m2, [tlq-66]
+ paddw m0, [tlq-64]
+ pavgw m1, [tlq-58]
+ paddw m0, [tlq-60]
+ sub tlq, 32
+ sub r4, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mova [r4], m0
+ sub r5d, 16
+ jg .h64_filter_loop
+ mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ punpcklwd xm1, xm0, xm0
+ paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
+ paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
+ vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
+ vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
+ lea tlq, [rsp+254]
+ pavgw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ psrlw m0, 2
+ mova [r4-32], m0
+.h64_main:
+ neg maxbaseq
+ movd xm4, dyd
+ vpbroadcastw m6, [tlq+maxbaseq*2]
+ shl maxbased, 6
+ vpbroadcastw m4, xm4
+ lea r4d, [maxbaseq+dyq+15*64]
+ neg dyq
+ vpbroadcastd m7, [pw_m1024]
+ movd xm3, r4d
+ lea r4, [dyq+63]
+ paddw m8, m7, m7
+ vpbroadcastw m3, xm3
+ or maxbased, 63
+ paddw m9, m8, m7
+ psubw m3, [z_base_inc]
+.h64_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m1, [tlq+r5*2-128]
+ movu m0, [tlq+r5*2-126]
+ pand m2, m5, m3
+ psllw m2, 9
+ psubw m1, m0
+ pmulhrsw m1, m2
+ sub rsp, 128
+ paddw m0, m1
+ pcmpgtw m1, m9, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*0], m0
+ movu m1, [tlq+r5*2-96]
+ movu m0, [tlq+r5*2-94]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m8, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*1], m0
+ movu m1, [tlq+r5*2-64]
+ movu m0, [tlq+r5*2-62]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ pcmpgtw m1, m7, m3
+ vpblendvb m0, m6, m0, m1
+ mova [rsp+32*2], m0
+ movu m1, [tlq+r5*2-32]
+ movu m0, [tlq+r5*2-30]
+ psubw m1, m0
+ pmulhrsw m1, m2
+ add r4, dyq
+ psraw m2, m3, 15
+ paddw m3, m4
+ paddw m0, m1
+ vpblendvb m0, m6, m0, m2
+ mova [rsp+32*3], m0
+ dec wd
+ jz .h64_transpose
+ cmp r4d, maxbased
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 128
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m6
+ mova [rsp+32*2], m6
+ mova [rsp+32*3], m6
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ mov r5, dstq
+ lea r4, [strideq+r2*2]
+.h64_transpose_loop0:
+ lea r6, [rsp+112]
+ lea dstq, [r5+org_wq*2-32]
+.h64_transpose_loop:
+ mova xm0, [r6+128*15]
+ vinserti128 m0, [r6+128* 7], 1
+ mova xm1, [r6+128*14]
+ vinserti128 m1, [r6+128* 6], 1
+ mova xm2, [r6+128*13]
+ vinserti128 m2, [r6+128* 5], 1
+ mova xm3, [r6+128*12]
+ vinserti128 m3, [r6+128* 4], 1
+ mova xm4, [r6+128*11]
+ vinserti128 m4, [r6+128* 3], 1
+ mova xm5, [r6+128*10]
+ vinserti128 m5, [r6+128* 2], 1
+ mova xm6, [r6+128* 9]
+ vinserti128 m6, [r6+128* 1], 1
+ mova xm7, [r6+128* 8]
+ vinserti128 m7, [r6+128* 0], 1
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ sub r6, 16
+ punpckhdq m7, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m3, m5
+ punpckldq m3, m5
+ punpckhqdq m5, m7, m1
+ punpcklqdq m7, m1
+ punpckhqdq m1, m8, m3
+ punpcklqdq m8, m3
+ punpckhdq m3, m0, m2
+ mova [dstq+strideq*0], m5
+ punpckldq m0, m2
+ mova [dstq+strideq*1], m7
+ punpckhdq m2, m4, m6
+ mova [dstq+strideq*2], m1
+ punpckldq m4, m6
+ mova [dstq+r2 ], m8
+ punpckhqdq m6, m3, m2
+ mova [dstq+strideq*4], m6
+ punpcklqdq m3, m2
+ mova [dstq+r3 ], m3
+ punpckhqdq m2, m0, m4
+ mova [dstq+r2*2 ], m2
+ punpcklqdq m0, m4
+ mova [dstq+r4 ], m0
+ lea dstq, [dstq+strideq*8]
+ cmp r6, rsp
+ jae .h64_transpose_loop
+ add rsp, 128*16
+ sub org_wd, 16
+ jg .h64_transpose_loop0
+.h64_end:
+ RET
+
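+; FILTER_1BLK computes one 4x2 filter-intra block: the shuffled neighbour
+; pixels are multiplied by the four tap pairs in m2-m5 (pmaddwd), biased by
+; m1 (pd_8), shifted right by 4 and clamped to [0, bdmax].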
+%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
+%ifnum %4
+ pshufb xm%2, xm%4
+%else
+ pshufb xm%2, %4
+%endif
+ vinserti128 m%2, xm%2, 1
+ pshufd m%1, m%2, q0000
+ pmaddwd m%1, m2
+ pshufd m%3, m%2, q1111
+ pmaddwd m%3, m3
+ paddd m%1, m1
+ paddd m%1, m%3
+ pshufd m%3, m%2, q2222
+ pmaddwd m%3, m4
+ paddd m%1, m%3
+ pshufd m%3, m%2, q3333
+ pmaddwd m%3, m5
+ paddd m%1, m%3
+ psrad m%1, 4
+ packusdw m%1, m%1
+ pminsw m%1, m%5
+%endmacro
+
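+; FILTER_2BLK applies the same arithmetic to two 4x2 blocks per call, running
+; the low and high halves of the shuffled source through the taps and packing
+; both results together.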
+%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
+ pshufb m%2, m%6
+ vpermq m%4, m%2, q3232
+ vinserti128 m%2, xm%2, 1
+ pshufd m%1, m%2, q0000
+ pshufd m%3, m%4, q0000
+ pmaddwd m%1, m2
+ pmaddwd m%3, m2
+ paddd m%1, m1
+ paddd m%3, m1
+ pshufd m%5, m%2, q1111
+ pmaddwd m%5, m3
+ paddd m%1, m%5
+ pshufd m%5, m%4, q1111
+ pmaddwd m%5, m3
+ paddd m%3, m%5
+ pshufd m%5, m%2, q2222
+ pmaddwd m%5, m4
+ paddd m%1, m%5
+ pshufd m%5, m%4, q2222
+ pmaddwd m%5, m4
+ paddd m%3, m%5
+ pshufd m%5, m%2, q3333
+ pmaddwd m%5, m5
+ paddd m%1, m%5
+ pshufd m%5, m%4, q3333
+ pmaddwd m%5, m5
+ paddd m%3, m%5
+ psrad m%1, 4
+ psrad m%3, 4
+ packusdw m%1, m%3
+ pminsw m%1, m%7
+%endmacro
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row. One redundant
+; block is calculated for w8 and w16, two for w32.
+; w4 w8 w16 w32
+; 1 1 2 1 2 3 5 1 2 3 5 b c d f
+; 2 2 3 2 4 5 7 2 4 5 7 c e f h
+; 3 3 4 4 6 7 9 4 6 7 9 e g h j
+; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+; 5 8 8 i
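+; (each number or letter marks the step in which that 4x2 block is produced)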
+
+cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
+%assign org_stack_offset stack_offset
+%define base r6-ipred_filter_16bpc_avx2_table
+ lea r6, [filter_intra_taps]
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ add filterq, r6
+ lea r6, [ipred_filter_16bpc_avx2_table]
+ vbroadcasti128 m0, [tlq-6]
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m1, [base+pd_8]
+ pmovsxbw m2, [filterq+16*0]
+ pmovsxbw m3, [filterq+16*1]
+ pmovsxbw m4, [filterq+16*2]
+ pmovsxbw m5, [filterq+16*3]
+ add wq, r6
+ mov hd, hm
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 10
+ mova xm8, [base+filter_shuf2]
+ vpbroadcastw m9, r8m ; bitdepth_max
+ lea r7, [6+hq*2]
+ sub tlq, r7
+ jmp .w4_loop_start
+.w4_loop:
+ pinsrq xm0, [tlq+hq*2], 0
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER_1BLK 6, 0, 7, 8, 9
+ vextracti128 xm0, m6, 1
+ movq [dstq+strideq*0], xm6
+ movq [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vbroadcasti128 m14, [base+filter_shuf3]
+ vpbroadcastw m15, r8m ; bitdepth_max
+ FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15
+ vpermq m6, m10, q1302 ; ____ ____ | ____ 4321
+ pslldq m8, m0, 4
+ psrldq m7, m6, 2
+ psrldq m0, m6, 10
+ punpcklwd m7, m0
+ vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321
+ vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321
+ lea r7, [16+hq*2]
+ sub tlq, r7
+ jmp .w8_loop_start
+.w8_loop:
+ vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321
+ vpermq m6, m9, q2031
+ psrldq m0, m6, 2
+ psrldq m6, 10
+ punpcklwd m6, m0
+ vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321
+ vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321
+ mova m10, m9
+.w8_loop_start:
+ vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321
+ call .main
+ vpblendd m10, m9, 0xCC
+ mova [dstq+strideq*0], xm10
+ vextracti128 [dstq+strideq*1], m10, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ ALLOC_STACK 32, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ TAIL_CALL .w16_main, 0
+.w16_main:
+ mova xm10, [base+filter_shuf2]
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ vpbroadcastq m0, [tlq+10]
+ vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____
+ psrldq m6, m12, 8
+ vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321
+ punpcklwd m6, m12
+ vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm8, xm12, 12
+ vpblendd xm6, xm8, 0x01
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 8, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ movu m8, [tlq+6] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ lea r7, [20+hq*2]
+ sub tlq, r7
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w16_loop_start
+.w16_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40
+ mova m0, [rsp+8]
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w16_loop_start:
+ mova m13, m12
+ vpblendd m0, [tlq+hq*2], 0x0C
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+8], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ ret
+ALIGN function_align
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK 64, 16
+ vpbroadcastw m15, r8m ; bitdepth_max
+ sub hd, 2
+ lea r3, [dstq+32]
+ lea r5d, [hd*2+20]
+ call .w16_main
+ mov dstq, r3
+ lea tlq, [tlq+r5+32]
+ sub r5d, 20
+ shr r5d, 1
+ sub r5d, 2
+ lea r4, [dstq+strideq*2-2]
+DEFINE_ARGS dst, stride, tl, stride3, left, h
+ lea stride3q, [strideq*3]
+ movu m8, [tlq-6] ; 4321 0___
+ mova xm10, [base+filter_shuf2]
+ pinsrw xm0, xm8, [dstq+strideq*0-2], 2
+ pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_
+ pinsrw xm9, [leftq+strideq*0], 5
+ pinsrw xm9, [leftq+strideq*1], 4
+ FILTER_1BLK 13, 0, 6, 10, 15
+ vpermq m12, m13, q3120
+ mova xm14, [base+filter_shuf3]
+ vinserti128 m14, [base+filter_shuf1], 1
+ psrldq m6, m12, 8
+ punpcklwd m7, m6, m12
+ vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321
+ vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321
+ vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321
+ vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 12, 0, 6, 7, 8, 14, 15
+ vpblendd m13, m12, 0xCC
+ pinsrw xm9, [leftq+strideq*2], 3
+ pinsrw xm9, [leftq+stride3q ], 2
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm9, [leftq+strideq*0], 1
+ pinsrw xm9, [leftq+strideq*1], 0
+ movq [rsp+32], xm9
+ mov r7d, 1
+ pslldq m8, m9, 4
+ vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____
+ vpermq m12, m12, q2031 ; 6___ 5___
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 12
+ vpblendd xm6, xm7, 0x01 ; ____ _56_
+ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_
+ FILTER_1BLK 11, 6, 7, 10, 15
+ vpermq m11, m11, q3120
+ pshufd m9, m11, q1032
+ vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____
+ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____
+ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____
+ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+ vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321
+ jmp .w32_loop_start
+.w32_loop_last:
+ mova m0, [rsp+0]
+ jmp .w32_loop
+.w32_loop_left:
+ mova m0, [rsp+0]
+ vpblendd m0, [rsp+32+r7*4-12], 0x0C
+ dec r7d
+ jg .w32_loop
+ cmp hd, 2
+ je .w32_loop
+ pinsrw xm6, [rsp+32], 6
+ pinsrw xm6, [leftq+strideq*2], 5
+ pinsrw xm6, [leftq+stride3q ], 4
+ lea leftq, [leftq+strideq*4]
+ pinsrw xm6, [leftq+strideq*0], 3
+ pinsrw xm6, [leftq+strideq*1], 2
+ pinsrw xm6, [leftq+strideq*2], 1
+ pinsrw xm6, [leftq+stride3q ], 0
+ lea leftq, [leftq+strideq*4]
+ movu [rsp+36], xm6
+ pinsrw xm6, [leftq+strideq*0], 1
+ pinsrw xm6, [leftq+strideq*1], 0
+ movd [rsp+32], xm6
+ mov r7d, 4
+.w32_loop:
+ vpermq m13, m13, q3322
+ vpermq m11, m9, q2020
+ vpermq m9, m9, q1302
+ vpermq m6, m12, q0123
+ psrldq m7, 4
+ vpblendd m13, m10, 0xCC
+ vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321
+ mova [dstq+strideq*0], xm13
+ vextracti128 [dstq+strideq*1], m13, 1
+.w32_loop_start:
+ mova m13, m12
+ psrldq m7, m12, 8
+ punpcklwd m7, m12
+ vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321
+ vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321
+ FILTER_2BLK 10, 0, 6, 7, 8, 14, 15
+ vpermq m12, m10, q2031
+ mova [rsp+0], m0
+ psrldq m8, m11, 8
+ psrldq xm6, xm12, 2
+ psrldq xm7, xm12, 10
+ psrldq xm0, xm13, 2
+ punpcklwd m8, m11
+ punpcklwd xm7, xm6
+ vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321
+ vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321
+ vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321
+ call .main
+ vpermq m8, m11, q3120
+ vpblendd m6, m8, m9, 0xCC
+ mova [dstq+strideq*0+16], xm6
+ vextracti128 [dstq+strideq*1+16], m6, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop_left
+ jz .w32_loop_last
+ vpermq m8, m9, q3120
+ vextracti128 xm0, m8, 1 ; 4321 ____
+ pshufd xm11, xm11, q1032
+ vpblendd xm0, xm11, 0x02 ; 4321 0___
+ psrldq xm6, xm8, 2
+ psrldq xm7, xm8, 12
+ pblendw xm0, xm6, 0x4 ; 4321 05__
+ pblendw xm0, xm7, 0x2 ; 4321 056_
+ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15
+ vpermq m12, m13, q1302
+ vpblendd m12, m10, 0xCC
+ vpblendd m9, m6, 0xCC
+ mova [dstq+strideq*0+ 0], xm12
+ mova [dstq+strideq*0+16], xm9
+ vextracti128 [dstq+strideq*1+ 0], m12, 1
+ vextracti128 [dstq+strideq*1+16], m9, 1
+ RET
+.main:
+ FILTER_2BLK 9, 8, 6, 7, 0, 14, 15
+ ret
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
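+; IPRED_CFL: with m0 = dc, m1 = alpha and m2 = abs(alpha) << 9, computes
+; dc + sign(ac*alpha) * ((abs(ac*alpha) + 32) >> 6) per pixel; pmulhrsw is
+; applied to absolute values so the rounding is symmetric around zero.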
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
+
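+; cfl_top / cfl_left derive the DC term from the top row / left column only:
+; xm4 holds the rounding bias (w/2 resp. h/2), xm5 the shift (log2), and the
+; splatted result is handed on to the shared .sN store loops.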
+cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ add tlq, 2
+ movd xm4, wd
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ pavgw xm4, xm6
+ tzcnt wd, wd
+ movd xm5, wd
+ movu m0, [tlq]
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+wq*4]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ pavgw xm4, xm6
+ tzcnt r6d, hd
+ movd xm5, r6d
+ movu m0, [tlq]
+ lea t0, [ipred_cfl_left_16bpc_avx2_table]
+ movsxd r6, [t0+r6*4]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1
+.h4:
+ punpcklwd xm0, xm6
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ paddd xm0, xm4
+ psrld xm0, xm5
+ vpbroadcastw m0, xm0
+ jmp wq
+
+cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd xm4, t0d
+ tzcnt t0d, t0d
+ movd xm5, t0d
+ lea t0, [ipred_cfl_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ psrlw xm4, 1
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movq xm0, [tlq-8]
+ jmp wq
+.w4:
+ movq xm1, [tlq+2]
+ paddw m0, m4
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6
+ paddd xm0, xm1
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w4_end:
+ vpbroadcastw m0, xm0
+.s4:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ IPRED_CFL 4
+ pmaxsw m4, m6
+ pminsw m4, m7
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16]
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2]
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm6, 0xAA
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw m0, xm0
+.s8:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], xm4
+ mova [dstq+strideq*2], xm5
+ vextracti128 [dstq+strideq*1], m4, 1
+ vextracti128 [dstq+r6 ], m5, 1
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32]
+ jmp wq
+.w16:
+ paddw m0, [tlq+2]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm6
+ punpcklwd xm0, xm6
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+strideq*0], m4
+ mova [dstq+strideq*1], m5
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm6
+ punpckhwd xm0, xm6
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+.s32:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ pmaxsw m4, m6
+ pmaxsw m5, m6
+ pminsw m4, m7
+ pminsw m5, m7
+ mova [dstq+32*0], m4
+ mova [dstq+32*1], m5
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ mov r6d, r7m
+ shr r6d, 11
+ lea t0, [ipred_cfl_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [t0+wq*4]
+ vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4]
+ pxor m6, m6
+ vpbroadcastw m7, r7m
+ add wq, t0
+ movifnidn acq, acmp
+ jmp wq
+
+cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_2]
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ jg .w16
+ je .w8
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ mova xm0, [ypxq+strideq*2]
+ mova xm1, [ypxq+r3 ]
+ vinserti128 m0, [ypxq+strideq*0], 1
+ vinserti128 m1, [ypxq+strideq*1], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0
+ mova [acq], xm1
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ vpermq m1, m1, q1111
+ pslld xm0, 2
+.w4_hpad_loop:
+ mova [acq], m1
+ paddd m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .dc
+.w8:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ paddd m4, m0
+ packssdw xm1, xm0, xm1
+ mova [acq], xm1
+ add acq, 16
+ dec hd
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ vinserti128 m1, xm1, 1
+ pslld m0, 2
+ jmp .hpad
+.w8_wpad1:
+ pmaddwd xm0, xm5, [ypxq+strideq*0]
+ pmaddwd xm3, xm5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd xm0, xm3
+ pshufd xm3, xm0, q3333
+ packssdw xm1, xm0, xm3
+ paddd xm0, xm3
+ paddd xm4, xm0
+ mova [acq], xm1
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad:
+ mova m0, [ypxq+strideq*0+ 0]
+ mova m1, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m2, [ypxq+strideq*0+12]
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m0, m2, 0xf0
+ vpblendd m1, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2:
+ vpbroadcastd m2, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1:
+ vpbroadcastd m2, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m2, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m0, m1, m2, m3
+ paddd m0, m1
+ paddd m2, m3
+ packssdw m1, m0, m2
+ paddd m0, m2
+ vpermq m1, m1, q3120
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd
+ jg .w16_wpad
+ jmp .w16_hpad
+.w16:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+ 0]
+ pmaddwd m2, m5, [ypxq+strideq*0+32]
+ pmaddwd m1, m5, [ypxq+strideq*1+ 0]
+ pmaddwd m3, m5, [ypxq+strideq*1+32]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ paddd m2, m3
+ packssdw m1, m0, m2
+ paddd m0, m2
+ vpermq m1, m1, q3120
+ paddd m4, m0
+ mova [acq], m1
+ add acq, 32
+ dec hd
+ jg .w16_loop
+.w16_hpad:
+ add hpadd, hpadd
+ jz .dc
+ paddd m0, m0
+.hpad:
+ mova [acq+32*0], m1
+ paddd m4, m0
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hpadd, 4
+ jg .hpad
+.dc:
+ vextracti128 xm1, m4, 1
+ sub r5, acq ; -w*h*2
+ tzcnt r1d, r5d
+ paddd xm4, xm1
+ sub r1d, 2
+ punpckhqdq xm1, xm4, xm4
+ movd xm0, r1d
+ paddd xm1, xm4
+ pshuflw xm4, xm1, q1032
+ paddd xm1, xm4
+ psrld xm1, xm0
+ pxor xm0, xm0
+ pavgw xm1, xm0
+ vpbroadcastw m1, xm1
+.dc_loop:
+ mova m0, [acq+r5]
+ psubw m0, m1
+ mova [acq+r5], m0
+ add r5, 32
+ jl .dc_loop
+ RET
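The .dc tail above averages the whole ac buffer (the "; -w*h*2" offset walks it backwards) and subtracts the rounded mean from every entry, so ipred_cfl later receives zero-mean values. A scalar sketch of the 4:2:0 path, ignoring the wpad/hpad edge replication the asm performs, and using illustrative names rather than dav1d's reference C:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only. Each ac entry is 8x the average of a 2x2 luma block:
 * pmaddwd by pw_2 gives (p0 + p1) * 2 per row, and adding two rows yields
 * the 2x2 sum * 2 = average * 8. The rounded mean is then subtracted. */
static void cfl_ac_420_sketch(int16_t *ac, const uint16_t *ypx,
                              ptrdiff_t stride, int w, int h)
{
    int64_t sum = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            const uint16_t *p = ypx + 2 * y * stride + 2 * x;
            const int v = 2 * (p[0] + p[1] + p[stride] + p[stride + 1]);
            ac[y * w + x] = (int16_t)v;
            sum += v;
        }
    const int mean = (int)((sum + w * h / 2) / (w * h)); /* w*h is a power of 2 */
    for (int i = 0; i < w * h; i++)
        ac[i] -= (int16_t)mean;
}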
+
+cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_4]
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ jg .w16
+ je .w8
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ mova xm0, [ypxq+strideq*0]
+ mova xm1, [ypxq+strideq*1]
+ vinserti128 m0, [ypxq+strideq*2], 1
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ paddd m4, m0
+ packssdw m0, m1
+ paddd m4, m1
+ mova [acq], m0
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vextracti128 xm1, m1, 1
+ vpermq m0, m0, q3333
+ pslld xm1, 2
+.w4_hpad_loop:
+ mova [acq], m0
+ paddd m4, m1
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m1, m5, [ypxq+strideq*0]
+ pmaddwd m0, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m1
+ packssdw m1, m0
+ paddd m4, m0
+ vpermq m2, m1, q3120
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m1, m1, q3131
+ pslld m0, 2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w8_wpad1:
+ vpbroadcastd m1, [ypxq+strideq*0+12]
+ vpbroadcastd m0, [ypxq+strideq*1+12]
+ vinserti128 m1, [ypxq+strideq*0+ 0], 0
+ vinserti128 m0, [ypxq+strideq*1+ 0], 0
+ lea ypxq, [ypxq+strideq*2]
+ pmaddwd m1, m5
+ pmaddwd m0, m5
+ paddd m4, m1
+ packssdw m1, m0
+ paddd m4, m0
+ vpermq m2, m1, q3120
+ mova [acq], m2
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmaddwd m2, m5, [ypxq+strideq*0+ 0]
+ pmaddwd m1, m5, [ypxq+strideq*0+32]
+ pmaddwd m0, m5, [ypxq+strideq*1+ 0]
+ pmaddwd m3, m5, [ypxq+strideq*1+32]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m2
+ packssdw m2, m1
+ paddd m4, m1
+ packssdw m1, m0, m3
+ paddd m0, m3
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2
+ jg .w16_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+.w16_wpad:
+ mova m2, [ypxq+strideq*0+ 0]
+ mova m0, [ypxq+strideq*1+ 0]
+ cmp wpadd, 2
+ jl .w16_wpad1
+ je .w16_wpad2
+ vpbroadcastd m1, [ypxq+strideq*0+12]
+ vpbroadcastd m3, [ypxq+strideq*1+12]
+ vpblendd m2, m1, 0xf0
+ vpblendd m0, m3, 0xf0
+ jmp .w16_wpad_end
+.w16_wpad2:
+ vpbroadcastd m1, [ypxq+strideq*0+28]
+ vpbroadcastd m3, [ypxq+strideq*1+28]
+ jmp .w16_wpad_end
+.w16_wpad1:
+ vpbroadcastd m1, [ypxq+strideq*0+44]
+ vpbroadcastd m3, [ypxq+strideq*1+44]
+ vinserti128 m1, [ypxq+strideq*0+32], 0
+ vinserti128 m3, [ypxq+strideq*1+32], 0
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ REPX {pmaddwd x, m5}, m2, m0, m1, m3
+ paddd m4, m2
+ packssdw m2, m1
+ paddd m4, m1
+ packssdw m1, m0, m3
+ paddd m0, m3
+ vpermq m2, m2, q3120
+ paddd m4, m0
+ vpermq m1, m1, q3120
+ mova [acq+32*0], m2
+ mova [acq+32*1], m1
+ add acq, 32*2
+ sub hd, 2
+ jg .w16_wpad
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
+
+cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ lea r6, [ipred_cfl_ac_444_16bpc_avx2_table]
+ tzcnt wd, wm
+ movifnidn hpadd, hpadm
+ vpbroadcastd m5, [pw_1]
+ movsxd wq, [r6+wq*4]
+ shl hpadd, 2
+ add wq, r6
+ mov hd, hm
+ pxor m4, m4
+ sub hd, hpadd
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ movq xm0, [ypxq+strideq*0]
+ movhps xm0, [ypxq+strideq*1]
+ vpbroadcastq m1, [ypxq+strideq*2]
+ vpbroadcastq m2, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ psllw m0, 3
+ pmaddwd m1, m0, m5
+ mova [acq], m0
+ add acq, 32
+ paddd m4, m1
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vpermq m0, m0, q3333
+ paddd m1, m1
+ mova [acq+32*0], m0
+ vpermq m1, m1, q3333
+ mova [acq+32*1], m0
+ add acq, 32*2
+ paddd m4, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w8:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w8_loop:
+ mova xm2, [ypxq+strideq*0]
+ vinserti128 m2, [ypxq+strideq*1], 1
+ mova xm1, [ypxq+strideq*2]
+ vinserti128 m1, [ypxq+r3 ], 1
+ lea ypxq, [ypxq+strideq*4]
+ psllw m2, 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ vperm2i128 m1, m1, 0x11
+ pslld m0, 2
+ pxor m2, m2
+ vpblendd m0, m2, 0x0f
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w16_wpad2:
+ vpbroadcastw m3, [ypxq+strideq*0+14]
+ vpbroadcastw m0, [ypxq+strideq*1+14]
+ vpblendd m2, m3, 0xf0
+ vpblendd m1, m0, 0xf0
+ jmp .w16_wpad_end
+.w16:
+ mov r5, acq
+.w16_loop:
+ mova m2, [ypxq+strideq*0]
+ mova m1, [ypxq+strideq*1]
+ test wpadd, wpadd
+ jnz .w16_wpad2
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ psllw m2, 3
+ psllw m1, 3
+ mova [acq+32*0], m2
+ pmaddwd m2, m5
+ mova [acq+32*1], m1
+ pmaddwd m0, m1, m5
+ add acq, 32*2
+ paddd m4, m2
+ paddd m4, m0
+ sub hd, 2
+ jg .w16_loop
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m0, m0
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
+.w32:
+ mov r5, acq
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ mova m0, [ypxq+ 0]
+ mova m1, [ypxq+32]
+ add ypxq, strideq
+ psllw m0, 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3
+ paddd m4, m2
+ dec hd
+ jg .w32_loop
+.w32_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+ paddd m2, m2
+.w32_hpad_loop:
+ mova [acq+32*0], m0
+ mova [acq+32*1], m1
+ paddd m4, m2
+ mova [acq+32*2], m0
+ mova [acq+32*3], m1
+ add acq, 32*4
+ sub hpadd, 2
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
+.w32_wpad:
+ mova m0, [ypxq+ 0]
+ cmp wpadd, 4
+ jl .w32_wpad2
+ je .w32_wpad4
+ vpbroadcastw m1, [ypxq+14]
+ vpblendd m0, m1, 0xf0
+ jmp .w32_wpad_end
+.w32_wpad4:
+ vpbroadcastw m1, [ypxq+30]
+ jmp .w32_wpad_end
+.w32_wpad2:
+ vpbroadcastw m1, [ypxq+46]
+ vinserti128 m1, [ypxq+32], 0
+.w32_wpad_end:
+ add ypxq, strideq
+ psllw m0, 3
+ psllw m1, 3
+ pmaddwd m2, m0, m5
+ mova [acq+32*0], m0
+ pmaddwd m3, m1, m5
+ mova [acq+32*1], m1
+ add acq, 32*2
+ paddd m2, m3
+ paddd m4, m2
+ dec hd
+ jg .w32_wpad
+ jmp .w32_hpad
+
+cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ vbroadcasti128 m3, [palq]
+ lea r2, [pal_pred_16bpc_avx2_table]
+ tzcnt wd, wm
+ vbroadcasti128 m4, [pal_pred_shuf]
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ pshufb m3, m4
+ punpckhqdq m4, m3, m3
+ add wq, r2
+DEFINE_ARGS dst, stride, stride3, idx, w, h
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova xm2, [idxq]
+ add idxq, 16
+ pshufb xm1, xm3, xm2
+ pshufb xm2, xm4, xm2
+ punpcklbw xm0, xm1, xm2
+ punpckhbw xm1, xm2
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movu m2, [idxq] ; only 16-byte alignment
+ add idxq, 32
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m0, 1
+ vextracti128 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ vpermq m2, [idxq+ 0], q3120
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpermq m2, [idxq+ 0], q3120
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0+ 0], m0
+ mova [dstq+strideq*0+32], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*1+ 0], m0
+ mova [dstq+strideq*1+32], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ vpermq m2, [idxq+ 0], q3120
+ vpermq m5, [idxq+32], q3120
+ add idxq, 64
+ pshufb m1, m3, m2
+ pshufb m2, m4, m2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+ 0], m0
+ mova [dstq+32], m1
+ pshufb m1, m3, m5
+ pshufb m2, m4, m5
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+64], m0
+ mova [dstq+96], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm
new file mode 100644
index 0000000000..1a307adc98
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_avx512.asm
@@ -0,0 +1,833 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3
+ db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11
+ db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7
+ db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15
+smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+ db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
+ times 4 db 10, 11, 12, 13, 2, 3, -1, -1
+filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
+ times 4 db 26, 27, 28, 29, 14, 15, -1, -1
+filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9
+pw_1: times 2 dw 1
+ dd 10
+filter_rnd: dd 32
+ dd 1
+ dd 8
+ dd 11
+filter_shift: times 2 dw 6
+ dd 0
+ times 2 dw 4
+ dd 9
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
+%macro PAETH 3 ; top, signed_ldiff, ldiff
+ paddw m0, m%2, m2
+ psubw m1, m0, m3 ; tldiff
+ psubw m0, m%1 ; tdiff
+ pabsw m1, m1
+ pabsw m0, m0
+ pcmpgtw k1, m0, m1
+ pminsw m0, m1
+ pcmpgtw k2, m%3, m0
+ vpblendmw m0{k1}, m%1, m3
+ vpblendmw m0{k2}, m2, m0
+%endmacro
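The PAETH macro above is a vectorized form of the usual Paeth selection: compute base = left + top - topleft and return whichever of the three neighbours is closest to it, preferring left, then top, on ties. A scalar sketch of the per-pixel decision (names are illustrative):

#include <stdlib.h>

/* One Paeth-predicted pixel: pick the neighbour closest to left+top-topleft,
 * resolving ties in favour of left, then top, exactly as the k1/k2 masks do. */
static int paeth_sketch(int left, int top, int topleft)
{
    const int base   = left + top - topleft;
    const int ldiff  = abs(base - left);    /* == abs(top  - topleft) */
    const int tdiff  = abs(base - top);     /* == abs(left - topleft) */
    const int tldiff = abs(base - topleft);
    if (ldiff <= tdiff && ldiff <= tldiff)
        return left;
    return tdiff <= tldiff ? top : topleft;
}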
+
+INIT_ZMM avx512icl
+cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
+%define base r6-ipred_paeth_16bpc_avx512icl_table
+ lea r6, [ipred_paeth_16bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastw m3, [tlq] ; topleft
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastq m4, [tlq+2] ; top
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w4_loop:
+ sub tlq, 16
+ vbroadcasti32x4 m2, [tlq]
+ pshufb m2, m7 ; left
+ PAETH 4, 5, 6
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm8, ym0, 1
+ vextracti32x4 xm9, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm8
+ movq [dstq+r6 ], xm9
+ sub hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm8
+ movhps [dstq+r6 ], xm9
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_end:
+ RET
+.w8:
+ vbroadcasti32x4 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ lea r6, [strideq*3]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w8_loop:
+ sub tlq, 8
+ vpbroadcastq m2, [tlq]
+ pshufb m2, m7
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m4, [tlq+2]
+ movsldup m7, [base+ipred_shuf]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w16_loop:
+ sub tlq, 4
+ vpbroadcastd m2, [tlq]
+ pshufb m2, m7
+ PAETH 4, 5, 6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ movu m4, [tlq+2]
+ psubw m5, m4, m3
+ pabsw m6, m5
+.w32_loop:
+ sub tlq, 2
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2]
+ movu m7, [tlq+66]
+ psubw m5, m4, m3
+ psubw m8, m7, m3
+ pabsw m6, m5
+ pabsw m9, m8
+.w64_loop:
+ sub tlq, 2
+ vpbroadcastw m2, [tlq]
+ PAETH 4, 5, 6
+ mova [dstq+64*0], m0
+ PAETH 7, 8, 9
+ mova [dstq+64*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-$$
+ lea r6, [$$]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4]
+ lea weightsq, [base+smooth_weights_1d_16bpc+hq*4]
+ neg hq
+ vpbroadcastw m6, [tlq+hq*2] ; bottom
+ lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq]
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w4_loop:
+ vbroadcasti32x4 m3, [weightsq+hq*2]
+ pshufb m3, m4
+ pmulhrsw m3, m5
+ paddw m3, m6
+ vextracti32x4 xm0, m3, 3
+ vextracti32x4 xm1, ym3, 1
+ vextracti32x4 xm2, m3, 2
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ add hq, 8
+ jg .end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jl .w4_loop
+.end:
+ RET
+.w8:
+ vbroadcasti32x4 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w8_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m4
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 [dstq+strideq*0], m0, 3
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m5, [tlq+2] ; top
+ movsldup m4, [ipred_shuf]
+ psubw m5, m6 ; top - bottom
+.w16_loop:
+ vpbroadcastd m0, [weightsq+hq*2+0]
+ vpbroadcastd m1, [weightsq+hq*2+4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ vextracti32x8 [dstq+strideq*0], m0, 1
+ mova [dstq+strideq*1], ym0
+ vextracti32x8 [dstq+strideq*2], m1, 1
+ mova [dstq+stride3q ], ym1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ movu m5, [tlq+2]
+ psubw m5, m6
+.w32_loop:
+ vpbroadcastw m0, [weightsq+hq*2+0]
+ vpbroadcastw m1, [weightsq+hq*2+2]
+ vpbroadcastw m2, [weightsq+hq*2+4]
+ vpbroadcastw m3, [weightsq+hq*2+6]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w32_loop
+ RET
+.w64:
+ movu m4, [tlq+ 2]
+ movu m5, [tlq+66]
+ psubw m4, m6
+ psubw m5, m6
+.w64_loop:
+ vpbroadcastw m1, [weightsq+hq*2+0]
+ vpbroadcastw m3, [weightsq+hq*2+2]
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
+ lea r6, [$$]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m6, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
+ sub tlq, hq
+ lea stride3q, [strideq*3]
+ lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
+ jmp wq
+.w4:
+ movsldup m4, [base+ipred_shuf]
+ vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2]
+.w4_loop:
+ vbroadcasti32x4 m0, [tlq+hq-16] ; left
+ pshufb m0, m4
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ sub hd, 8*2
+ jl .end
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.end:
+ RET
+.w8:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
+.w8_loop:
+ vpbroadcastq m0, [tlq+hq-8] ; left
+ pshufb m0, m4
+ psubw m0, m6 ; left - right
+ pmulhrsw m0, m5
+ paddw m0, m6
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastd m1, [tlq+hq-8]
+ pshufb m0, m4
+ pshufb m1, m4
+ psubw m0, m6
+ psubw m1, m6
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m6
+ paddw m1, m6
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w16_loop
+ RET
+.w32:
+ movu m5, [base+smooth_weights_1d_16bpc+32*2]
+.w32_loop:
+ vpbroadcastq m3, [tlq+hq-8]
+ punpcklwd m3, m3
+ psubw m3, m6
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hq, 4*2
+ jg .w32_loop
+ RET
+.w64:
+ movu m4, [base+smooth_weights_1d_16bpc+64*2]
+ movu m5, [base+smooth_weights_1d_16bpc+64*3]
+.w64_loop:
+ vpbroadcastw m1, [tlq+hq-2]
+ vpbroadcastw m3, [tlq+hq-4]
+ psubw m1, m6
+ psubw m3, m6
+ pmulhrsw m0, m4, m1
+ pmulhrsw m1, m5
+ pmulhrsw m2, m4, m3
+ pmulhrsw m3, m5
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ mova [dstq+strideq*1+64*0], m2
+ mova [dstq+strideq*1+64*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+ lea r6, [$$]
+ mov wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m13, [tlq+wq*2] ; right
+ tzcnt wd, wd
+ add hd, hd
+ movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
+ mov r5d, 0x55555555
+ sub tlq, hq
+ mova m14, [base+smooth_perm]
+ kmovd k1, r5d
+ vpbroadcastw m0, [tlq] ; bottom
+ mov r5, 0x3333333333333333
+ pxor m15, m15
+ lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
+ kmovq k2, r5
+ lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
+ jmp wq
+.w4:
+ vpbroadcastq m5, [tlq+hq+2]
+ movshdup m3, [base+ipred_shuf]
+ movsldup m4, [base+ipred_shuf]
+ vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4]
+ lea stride3q, [strideq*3]
+ punpcklwd m5, m0 ; top, bottom
+.w4_loop:
+ vbroadcasti32x4 m0, [v_weightsq]
+ vpbroadcastq m2, [tlq+hq-8]
+ mova m1, m13
+ pshufb m0, m3
+ pmaddwd m0, m5
+ pshufb m1{k2}, m2, m4 ; left, right
+ vpdpwssd m0, m1, m6
+ vpermb m0, m14, m0
+ pavgw ym0, ym15
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ add v_weightsq, 4*4
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym5, [tlq+hq+2]
+ movshdup m6, [base+ipred_shuf]
+ movsldup m7, [base+ipred_shuf]
+ pmovzxwd m5, ym5
+ vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4]
+ lea stride3q, [strideq*3]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w8_loop:
+ vpbroadcastq m0, [v_weightsq+0]
+ vpbroadcastq m1, [v_weightsq+8]
+ vpbroadcastd m3, [tlq+hq-4]
+ vpbroadcastd m4, [tlq+hq-8]
+ pshufb m0, m6
+ pmaddwd m0, m5
+ pshufb m1, m6
+ pmaddwd m1, m5
+ mova m2, m13
+ pshufb m2{k2}, m3, m7 ; left, right
+ mova m3, m13
+ pshufb m3{k2}, m4, m7
+ vpdpwssd m0, m2, m8
+ vpdpwssd m1, m3, m8
+ add v_weightsq, 4*4
+ vpermt2b m0, m14, m1
+ pavgw m0, m15
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w8_loop
+ RET
+.w16:
+ pmovzxwd m5, [tlq+hq+2]
+ mova m6, [base+smooth_weights_2d_16bpc+16*4]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+.w16_loop:
+ vpbroadcastd m0, [v_weightsq+0]
+ vpbroadcastd m1, [v_weightsq+4]
+ pmaddwd m0, m5
+ pmaddwd m1, m5
+ mova m2, m13
+ vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right
+ mova m3, m13
+ vpbroadcastw m3{k1}, [tlq+hq-4]
+ vpdpwssd m0, m2, m6
+ vpdpwssd m1, m3, m6
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m1
+ pavgw m0, m15
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ mova m7, [base+smooth_weights_2d_16bpc+32*4]
+ mova m8, [base+smooth_weights_2d_16bpc+32*6]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ vpblendmw m6{k1}, m0, m6
+.w32_loop:
+ vpbroadcastd m2, [v_weightsq+0]
+ vpbroadcastd m3, [v_weightsq+4]
+ pmaddwd m0, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ vpdpwssd m0, m4, m7
+ vpdpwssd m2, m4, m8
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-4]
+ vpdpwssd m1, m4, m7
+ vpdpwssd m3, m4, m8
+ add v_weightsq, 2*4
+ vpermt2b m0, m14, m2
+ vpermt2b m1, m14, m3
+ pavgw m0, m15
+ pavgw m1, m15
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hq, 2*2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxwd m5, [tlq+hq+ 2]
+ pmovzxwd m6, [tlq+hq+34]
+ pmovzxwd m7, [tlq+hq+66]
+ pmovzxwd m8, [tlq+hq+98]
+ mova m9, [base+smooth_weights_2d_16bpc+64*4]
+ vpblendmw m5{k1}, m0, m5 ; top, bottom
+ mova m10, [base+smooth_weights_2d_16bpc+64*5]
+ vpblendmw m6{k1}, m0, m6
+ mova m11, [base+smooth_weights_2d_16bpc+64*6]
+ vpblendmw m7{k1}, m0, m7
+ mova m12, [base+smooth_weights_2d_16bpc+64*7]
+ vpblendmw m8{k1}, m0, m8
+.w64_loop:
+ vpbroadcastd m3, [v_weightsq]
+ mova m4, m13
+ vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
+ pmaddwd m0, m5, m3
+ pmaddwd m2, m6, m3
+ pmaddwd m1, m7, m3
+ pmaddwd m3, m8
+ vpdpwssd m0, m4, m9
+ vpdpwssd m2, m4, m10
+ vpdpwssd m1, m4, m11
+ vpdpwssd m3, m4, m12
+ add v_weightsq, 1*4
+ vpermt2b m0, m14, m2
+ vpermt2b m1, m14, m3
+ pavgw m0, m15
+ pavgw m1, m15
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ sub hd, 1*2
+ jg .w64_loop
+ RET
+
+cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
+ lea r6, [pal_pred_16bpc_avx512icl_table]
+ tzcnt wd, wm
+ mova m2, [pal_pred_perm]
+ movsxd wq, [r6+wq*4]
+ mova xm3, [palq]
+ movifnidn hd, hm
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ pmovzxbw ym0, [idxq]
+ add idxq, 16
+ vpermw ym0, ym0, ym3
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbw m0, [idxq]
+ add idxq, 32
+ vpermw m0, m0, m3
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+ psrlw m1, 8
+ vpermw m1, m1, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+ psrlw m1, 8
+ vpermw m1, m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ vpermb m1, m2, [idxq]
+ add idxq, 64
+ vpermw m0, m1, m3
+ psrlw m1, 8
+ vpermw m1, m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+; The ipred_filter SIMD processes 4x2 blocks in the following order which
+; increases parallelism compared to doing things row by row.
+; w4 w8 w16 w32
+; 1 1 2 1 2 5 6 1 2 5 6 9 a d e
+; 2 2 3 2 3 6 7 2 3 6 7 a b e f
+; 3 3 4 3 4 7 8 3 4 7 8 b c f g
+; 4 4 5 4 5 8 9 4 5 8 9 c d g h
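Put differently, a 4x2 block only needs the pixel row directly above it and the two reconstructed pixels to its left, so a block can start as soon as its upper and left neighbours are finished. A small sketch of that scheduling argument (purely illustrative; the tables above are the schedule the code actually uses, and the w16/w32 cases are additionally walked in 8-pixel-wide column pairs):

#include <stdio.h>

/* Earliest step at which each 4x2 block could run, given that block (r,c)
 * depends only on block (r-1,c) above and block (r,c-1) to its left.
 * For the w8/h8 case this reproduces the second column of the table above. */
int main(void)
{
    enum { ROWS = 4, COLS = 2 };   /* 2 block columns, 4 block rows */
    int step[ROWS][COLS];
    for (int r = 0; r < ROWS; r++)
        for (int c = 0; c < COLS; c++) {
            const int above = r ? step[r - 1][c] : 0;
            const int left  = c ? step[r][c - 1] : 0;
            step[r][c] = 1 + (above > left ? above : left);
            printf("%d%c", step[r][c], c == COLS - 1 ? '\n' : ' ');
        }
    return 0;
}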
+
+cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top
+%define base r6-$$
+ lea r6, [$$]
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ movifnidn hd, hm
+ movu xm0, [tlq-6]
+ pmovsxbw m7, [base+filter_intra_taps+filterq+32*0]
+ pmovsxbw m8, [base+filter_intra_taps+filterq+32*1]
+ mov r5d, r8m ; bitdepth_max
+ movsldup m9, [base+filter_permA]
+ movshdup m10, [base+filter_permA]
+ shr r5d, 11 ; is_12bpc
+ jnz .12bpc
+ psllw m7, 2 ; upshift multipliers so that packusdw
+ psllw m8, 2 ; will perform clipping for free
+.12bpc:
+ vpbroadcastd m5, [base+filter_rnd+r5*8]
+ vpbroadcastd m6, [base+filter_shift+r5*8]
+ sub wd, 8
+ jl .w4
+.w8:
+ call .main4
+ movsldup m11, [filter_permB]
+ lea r5d, [hq*2+2]
+ movshdup m12, [filter_permB]
+ lea topq, [tlq+2]
+ mova m13, [filter_permC]
+ sub hd, 4
+ vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1
+ sub tlq, r5
+%if WIN64
+ push r7
+ push r8
+%endif
+ mov r7, dstq
+ mov r8d, hd
+.w8_loop:
+ movlps xm4, xm0, [tlq+hq*2]
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w8_loop
+ test wd, wd
+ jz .end
+ mov r2d, 0x0d
+ kmovb k1, r2d
+ lea r2, [strideq*3]
+.w16:
+ movd xmm0, [r7+strideq*1+12]
+ vpblendd xmm0, [topq+8], 0x0e ; t1 t2
+ pinsrw xm4, xmm0, [r7+strideq*0+14], 2
+ call .main8
+ add r7, 16
+ vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3
+ mov hd, r8d
+ mov dstq, r7
+ add topq, 16
+.w16_loop:
+ movd xmm1, [dstq+strideq*2-4]
+ punpcklwd xm4, xmm1, xmm0
+ movd xmm0, [dstq+r2-4]
+ shufps xm4{k1}, xmm0, xm0, q3210
+ call .main8
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jge .w16_loop
+ sub wd, 8
+ jg .w16
+.end:
+ vpermb m2, m11, m0
+ mova ym1, ym5
+ vpdpwssd m1, m2, m7
+ vpermb m2, m12, m0
+ vpdpwssd m1, m2, m8
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ vextracti32x8 ym2, m1, 1
+ paddd ym1, ym2
+ packusdw ym1, ym1
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1
+ vextracti32x4 [dstq+strideq*0], m0, 2
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ RET
+.w4_loop:
+ movlps xm0, [tlq-10]
+ lea dstq, [dstq+strideq*2]
+ sub tlq, 4
+.w4:
+ call .main4
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.main4:
+ vpermb m2, m9, m0
+ mova ym1, ym5
+ vpdpwssd m1, m2, m7
+ vpermb m0, m10, m0
+ vpdpwssd m1, m0, m8
+ vextracti32x8 ym0, m1, 1
+ paddd ym0, ym1
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1 ; clip
+ vpsrlvw xm0, xm6
+ ret
+ALIGN function_align
+.main8:
+ vpermb m3, m11, m0
+ mova ym2, ym5
+ vpdpwssd m2, m3, m7
+ vpermb m3, m9, m4
+ mova ym1, ym5
+ vpdpwssd m1, m3, m7
+ vpermb m3, m12, m0
+ vpdpwssd m2, m3, m8
+ vpermb m3, m10, m4
+ vpdpwssd m1, m3, m8
+ vextracti32x8 ym4, m2, 1
+ vextracti32x8 ym3, m1, 1
+ paddd ym2, ym4
+ paddd ym1, ym3
+ packusdw ym1, ym2 ; clip
+ vpsrlvw ym1, ym6
+ vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1
+ vextracti32x4 [dstq+strideq*0], m0, 2
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ ret
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm
new file mode 100644
index 0000000000..07ea9567e1
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred16_sse.asm
@@ -0,0 +1,1923 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1
+pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+
+pb_0_1: times 4 db 0, 1
+pb_2_3: times 4 db 2, 3
+pw_1: times 4 dw 1
+pw_2: times 4 dw 2
+pw_4: times 4 dw 4
+pw_512: times 4 dw 512
+pw_2048: times 4 dw 2048
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
+%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4)
+%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)
+
+JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
+ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
+JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
+JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64
+
+cextern smooth_weights_1d_16bpc
+cextern smooth_weights_2d_16bpc
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_XMM ssse3
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ movd m4, wm
+ tzcnt wd, wm
+ add tlq, 2
+ movifnidn hd, hm
+ pxor m3, m3
+ pavgw m4, m3
+ movd m5, wd
+ movu m0, [tlq]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_16bpc_ssse3_table
+ mov hd, hm
+ movd m4, hm
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ pxor m3, m3
+ sub tlq, hq
+ pavgw m4, m3
+ movd m5, r6d
+ movu m0, [tlq]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m2, [tlq+112]
+ movu m1, [tlq+ 96]
+ paddw m0, m2
+ movu m2, [tlq+ 80]
+ paddw m1, m2
+ movu m2, [tlq+ 64]
+ paddw m0, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+ 48]
+ movu m2, [tlq+ 32]
+ paddw m1, m2
+ paddw m0, m1
+.h16:
+ movu m1, [tlq+ 16]
+ paddw m0, m1
+.h8:
+ movhlps m1, m0
+ paddw m0, m1
+.h4:
+ punpcklwd m0, m3
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ lea stride3q, [strideq*3]
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_16bpc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw m4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m1, m0
+ punpckhwd m0, m3
+ punpcklwd m1, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3
+ jmp .w4_end
+.w4_mul:
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ psrld m0, 2
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+.s4:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 32
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ test hd, 8|32
+ cmovz r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16c:
+ mova m1, m0
+.s16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*2+16*0], m0
+ mova [dstq+strideq*2+16*1], m1
+ mova [dstq+stride3q +16*0], m0
+ mova [dstq+stride3q +16*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m0, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 8
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32c:
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s32:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ mova [dstq+strideq*0+16*2], m2
+ mova [dstq+strideq*0+16*3], m3
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s32
+ RET
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq-112]
+ paddw m0, [tlq- 96]
+ paddw m1, [tlq- 80]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 48]
+ paddw m0, [tlq- 32]
+ paddw m1, [tlq- 16]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+ 18]
+ paddw m1, m2
+ movu m2, [tlq+ 34]
+ paddw m0, m2
+ movu m2, [tlq+ 50]
+ paddw m1, m2
+ movu m2, [tlq+ 66]
+ paddw m0, m2
+ movu m2, [tlq+ 82]
+ paddw m1, m2
+ movu m2, [tlq+ 98]
+ paddw m0, m2
+ movu m2, [tlq+114]
+ paddw m1, m2
+ paddw m0, m1
+ punpcklwd m1, m0, m3
+ punpckhwd m0, m3
+ paddd m0, m1
+ paddd m4, m0
+ punpckhqdq m0, m0
+ paddd m0, m4
+ pshuflw m1, m0, q1032
+ paddd m0, m1
+ psrld m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r2d, 0xAAAB
+ mov r3d, 0x6667
+ cmp hd, 16
+ cmove r2d, r3d
+ movd m1, r2d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w64_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ LEA r5, ipred_dc_128_16bpc_ssse3_table
+ tzcnt wd, wm
+ shr r6d, 11
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_16bpc_ssse3_table
+ movifnidn hd, hm
+ movu m0, [tlq+ 2]
+ movu m1, [tlq+ 18]
+ movu m2, [tlq+ 34]
+ movu m3, [tlq+ 50]
+ cmp wd, 64
+ je .w64
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w64:
+ WIN64_SPILL_XMM 8
+ movu m4, [tlq+ 66]
+ movu m5, [tlq+ 82]
+ movu m6, [tlq+ 98]
+ movu m7, [tlq+114]
+.w64_loop:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ mova [dstq+16*4], m4
+ mova [dstq+16*5], m5
+ mova [dstq+16*6], m6
+ mova [dstq+16*7], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+%define base r5-ipred_h_16bpc_ssse3_table
+ tzcnt wd, wm
+ LEA r5, ipred_h_16bpc_ssse3_table
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m2, [base+pb_0_1]
+ movddup m3, [base+pb_2_3]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ sub tlq, 8
+ movq m3, [tlq]
+ pshuflw m0, m3, q3333
+ pshuflw m1, m3, q2222
+ pshuflw m2, m3, q1111
+ pshuflw m3, m3, q0000
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m1
+ movq [dstq+strideq*2], m2
+ movq [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ sub tlq, 8
+ movq m3, [tlq]
+ punpcklwd m3, m3
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ sub tlq, 4
+ movd m1, [tlq]
+ pshufb m0, m1, m3
+ pshufb m1, m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m0
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*0+16*3], m0
+ mova [dstq+strideq*1+16*0], m1
+ mova [dstq+strideq*1+16*1], m1
+ mova [dstq+strideq*1+16*2], m1
+ mova [dstq+strideq*1+16*3], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+.w64:
+ sub tlq, 2
+ movd m0, [tlq]
+ pshufb m0, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m0
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m0
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m0
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m0
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
+
+cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
+%define base r5-ipred_paeth_16bpc_ssse3_table
+ movifnidn hd, hm
+ pshuflw m4, [tlq], q0000
+ mov leftq, tlq
+ add hd, hd
+ punpcklqdq m4, m4 ; topleft
+ sub leftq, hq
+ and wd, ~7
+ jnz .w8
+ movddup m5, [tlq+2] ; top
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w4_loop:
+ movd m1, [leftq+hq-4]
+ punpcklwd m1, m1
+ punpckldq m1, m1 ; left
+%macro PAETH 0
+ paddw m0, m6, m1
+ psubw m2, m4, m0 ; tldiff
+ psubw m0, m5 ; tdiff
+ pabsw m2, m2
+ pabsw m0, m0
+ pminsw m2, m0
+ pcmpeqw m0, m2
+ pand m3, m5, m0
+ pandn m0, m4
+ por m0, m3
+ pcmpgtw m3, m7, m2
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+%endmacro
+ PAETH
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %define r7d hm
+ %assign regs_used 7
+%elif WIN64
+ movaps r4m, m8
+ PUSH r7
+ %assign regs_used 8
+%endif
+%if ARCH_X86_64
+ movddup m8, [pb_0_1]
+%endif
+ lea tlq, [tlq+wq*2+2]
+ neg wq
+ mov r7d, hd
+.w8_loop0:
+ movu m5, [tlq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+ psubw m6, m5, m4
+ pabsw m7, m6
+.w8_loop:
+ movd m1, [leftq+hq-2]
+%if ARCH_X86_64
+ pshufb m1, m8
+%else
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+%endif
+ PAETH
+ mova [r6], m0
+ add r6, strideq
+ sub hd, 1*2
+ jg .w8_loop
+ mov hd, r7d
+ add wq, 8
+ jl .w8_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 4
+%endif
+
+cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov hd, hm
+ lea weightsq, [weightsq+hq*4]
+ neg hq
+ movd m5, [tlq+hq*2] ; bottom
+ pshuflw m5, m5, q0000
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [tlq+2] ; top
+ lea r3, [strideq*3]
+ psubw m4, m5 ; top - bottom
+.w4_loop:
+ movq m1, [weightsq+hq*2]
+ punpcklwd m1, m1
+ pshufd m0, m1, q1100
+ punpckhdq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ mov hm, hq
+ %define hq hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0, hq
+ movu m4, [tlq+2]
+ add tlq, 16
+ mov r6, dstq
+ add dstq, 16
+ psubw m4, m5
+.w8_loop:
+ movq m3, [weightsq+t0*2]
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ add t0, 4
+ jl .w8_loop
+ sub wd, 8
+ jg .w8_loop0
+ RET
+
+cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
+ LEA weightsq, smooth_weights_1d_16bpc
+ mov wd, wm
+ movifnidn hd, hm
+ movd m5, [tlq+wq*2] ; right
+ sub tlq, 8
+ add hd, hd
+ pshuflw m5, m5, q0000
+ sub tlq, hq
+ punpcklqdq m5, m5
+ cmp wd, 4
+ jne .w8
+ movddup m4, [weightsq+4*2]
+ lea r3, [strideq*3]
+.w4_loop:
+ movq m1, [tlq+hq] ; left
+ punpcklwd m1, m1
+ psubw m1, m5 ; left - right
+ pshufd m0, m1, q3322
+ punpckldq m1, m1
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ paddw m0, m5
+ paddw m1, m5
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movhps [dstq+strideq*2], m1
+ movq [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4*2
+ jg .w4_loop
+ RET
+.w8:
+ lea weightsq, [weightsq+wq*4]
+ neg wq
+%if ARCH_X86_32
+ PUSH r6
+ %assign regs_used 7
+ %define hd hm
+%elif WIN64
+ PUSH r7
+ %assign regs_used 8
+%endif
+.w8_loop0:
+ mov t0d, hd
+ mova m4, [weightsq+wq*2]
+ mov r6, dstq
+ add dstq, 16
+.w8_loop:
+ movq m3, [tlq+t0*(1+ARCH_X86_32)]
+ punpcklwd m3, m3
+ psubw m3, m5
+ pshufd m0, m3, q3333
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ REPX {paddw x, m5}, m0, m1, m2, m3
+ mova [r6+strideq*0], m0
+ mova [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+ mova [r6+strideq*0], m2
+ mova [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+ sub t0d, 4*(1+ARCH_X86_64)
+ jg .w8_loop
+ add wq, 8
+ jl .w8_loop0
+ RET
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 10
+%else
+DECLARE_REG_TMP 3
+%endif
+
+cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
+ h_weights, v_weights, top
+ LEA h_weightsq, smooth_weights_2d_16bpc
+ mov wd, wm
+ mov hd, hm
+ movd m7, [tlq+wq*2] ; right
+ lea v_weightsq, [h_weightsq+hq*8]
+ neg hq
+ movd m6, [tlq+hq*2] ; bottom
+ pshuflw m7, m7, q0000
+ pshuflw m6, m6, q0000
+ cmp wd, 4
+ jne .w8
+ movq m4, [tlq+2] ; top
+ mova m5, [h_weightsq+4*4]
+ punpcklwd m4, m6 ; top, bottom
+ pxor m6, m6
+.w4_loop:
+ movq m1, [v_weightsq+hq*4]
+ sub tlq, 4
+ movd m3, [tlq] ; left
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ pmaddwd m0, m4
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m1, m4
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q0000
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pavgw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+%if ARCH_X86_32
+ lea h_weightsq, [h_weightsq+wq*4]
+ mov t0, tlq
+ mov r1m, tlq
+ mov r2m, hq
+ %define m8 [h_weightsq+16*0]
+ %define m9 [h_weightsq+16*1]
+%else
+%if WIN64
+ movaps r4m, m8
+ movaps r6m, m9
+ PUSH r7
+ PUSH r8
+%endif
+ PUSH r9
+ PUSH r10
+ %assign regs_used 11
+ lea h_weightsq, [h_weightsq+wq*8]
+ lea topq, [tlq+wq*2]
+ neg wq
+ mov r8, tlq
+ mov r9, hq
+%endif
+ punpcklqdq m6, m6
+.w8_loop0:
+%if ARCH_X86_32
+ movu m5, [t0+2]
+ add t0, 16
+ mov r0m, t0
+%else
+ movu m5, [topq+wq*2+2]
+ mova m8, [h_weightsq+wq*4+16*0]
+ mova m9, [h_weightsq+wq*4+16*1]
+%endif
+ mov t0, dstq
+ add dstq, 16
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+.w8_loop:
+ movd m1, [v_weightsq+hq*4]
+ sub tlq, 2
+ movd m3, [tlq] ; left
+ pshufd m1, m1, q0000
+ pmaddwd m0, m4, m1
+ pshuflw m3, m3, q0000
+ pmaddwd m1, m5
+ punpcklwd m3, m7 ; left, right
+ pmaddwd m2, m8, m3
+ pmaddwd m3, m9
+ paddd m0, m2
+ paddd m1, m3
+ psrld m0, 8
+ psrld m1, 8
+ packssdw m0, m1
+ pxor m1, m1
+ pavgw m0, m1
+ mova [t0], m0
+ add t0, strideq
+ inc hq
+ jl .w8_loop
+%if ARCH_X86_32
+ mov t0, r0m
+ mov tlq, r1m
+ add h_weightsq, 16*2
+ mov hq, r2m
+ sub dword wm, 8
+ jg .w8_loop0
+%else
+ mov tlq, r8
+ mov hq, r9
+ add wq, 8
+ jl .w8_loop0
+%endif
+%if WIN64
+ movaps m8, r4m
+ movaps m9, r6m
+%endif
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
+%else
+cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%endif
+%define base r6-$$
+ movifnidn hd, hm
+ movd m6, r8m ; bitdepth_max
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ LEA r6, $$
+ shl filterd, 6
+ movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
+ mova m1, [base+filter_intra_taps+filterq+16*0]
+ mova m2, [base+filter_intra_taps+filterq+16*1]
+ mova m3, [base+filter_intra_taps+filterq+16*2]
+ mova m4, [base+filter_intra_taps+filterq+16*3]
+ pxor m5, m5
+%if ARCH_X86_64
+ punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper
+ punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid
+ punpcklbw m10, m5, m2 ; having to perform sign-extension.
+ punpckhbw m11, m5, m2
+ punpcklbw m12, m5, m3
+ punpckhbw m13, m5, m3
+ punpcklbw m14, m5, m4
+ punpckhbw m15, m5, m4
+%else
+ punpcklbw m7, m5, m1
+ mova m8, m7
+ punpckhbw m7, m5, m1
+ mova m9, m7
+ punpcklbw m7, m5, m2
+ mova m10, m7
+ punpckhbw m7, m5, m2
+ mova m11, m7
+ punpcklbw m7, m5, m3
+ mova m12, m7
+ punpckhbw m7, m5, m3
+ mova m13, m7
+ punpcklbw m7, m5, m4
+ mova m14, m7
+ punpckhbw m7, m5, m4
+ mova m15, m7
+%endif
+ mova m7, [base+filter_shuf]
+ add hd, hd
+ mov r5, dstq
+ pshuflw m6, m6, q0000
+ mov r6, tlq
+ punpcklqdq m6, m6
+ sub tlq, hq
+.left_loop:
+ pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __
+ pshufd m1, m0, q0000
+ pmaddwd m2, m8, m1
+ pmaddwd m1, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m4, m0, q2222
+ pmaddwd m3, m12, m4
+ pmaddwd m4, m13
+ paddd m2, m3
+ paddd m1, m4
+ pshufd m3, m0, q3333
+ pmaddwd m0, m14, m3
+ pmaddwd m3, m15
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 11 ; x >> 3
+ psrad m1, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5 ; (x + 8) >> 4
+ pminsw m0, m6
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movlps m0, [tlq+hq-10]
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2*2
+ jg .left_loop
+ sub wd, 4
+ jz .end
+ sub tld, r6d ; -h*2
+ sub r6, r5 ; tl-dst
+.right_loop0:
+ add r5, 8
+ mov hd, tld
+ movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
+ mov dstq, r5
+.right_loop:
+ pshufd m2, m0, q0000
+ pmaddwd m1, m8, m2
+ pmaddwd m2, m9
+ pshufd m4, m0, q1111
+ pmaddwd m3, m10, m4
+ pmaddwd m4, m11
+ pinsrw m0, [dstq+strideq*0-2], 5
+ paddd m1, m3
+ paddd m2, m4
+ pshufd m0, m0, q2222
+ movddup m4, [dstq+strideq*1-8]
+ pmaddwd m3, m12, m0
+ pmaddwd m0, m13
+ paddd m1, m3
+ paddd m0, m2
+ pshuflw m2, m4, q3333
+ punpcklwd m2, m5
+ pmaddwd m3, m14, m2
+ pmaddwd m2, m15
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 11
+ psrad m0, 11
+ packssdw m0, m1
+ pmaxsw m0, m5
+ pavgw m0, m5
+ pminsw m0, m6
+ movhps [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ palignr m0, m4, 14
+ lea dstq, [dstq+strideq*2]
+ add hd, 2*2
+ jl .right_loop
+ sub wd, 4
+ jg .right_loop0
+.end:
+ RET
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ movd m4, wd
+ tzcnt wd, wd
+ movifnidn hd, hm
+ add tlq, 2
+ movsxd r6, [t0+wq*4]
+ movd m5, wd
+ jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
+
+cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ LEA t0, ipred_cfl_left_16bpc_ssse3_table
+ tzcnt wd, wm
+ lea r6d, [hq*2]
+ movd m4, hd
+ sub tlq, r6
+ tzcnt r6d, hd
+ movd m5, r6d
+ movsxd r6, [t0+r6*4]
+.start:
+ movd m7, r7m
+ movu m0, [tlq]
+ add r6, t0
+ add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
+ movsxd wq, [t0+wq*4]
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ pcmpeqw m3, m3
+ add wq, t0
+ movifnidn acq, acmp
+ pavgw m4, m6
+ punpcklqdq m7, m7
+ jmp r6
+.h32:
+ movu m1, [tlq+48]
+ movu m2, [tlq+32]
+ paddw m0, m1
+ paddw m0, m2
+.h16:
+ movu m1, [tlq+16]
+ paddw m0, m1
+.h8:
+ pshufd m1, m0, q1032
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshuflw m0, m4, q1032
+ paddd m0, m4
+ psrld m0, m5
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+%macro IPRED_CFL 2 ; dst, src
+ pabsw m%1, m%2
+ pmulhrsw m%1, m2
+ psignw m%2, m1
+ psignw m%1, m%2
+ paddw m%1, m0
+ pmaxsw m%1, m6
+ pminsw m%1, m7
+%endmacro
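Per pixel, the macro adds dc to the alpha-scaled ac value and clamps to the valid range: pmulhrsw on |ac| with |alpha| << 9 evaluates to (|ac| * |alpha| + 32) >> 6, and the two psignw restore the sign of ac * alpha, so the rounding is half away from zero. A scalar sketch of the same arithmetic, with hypothetical names:

#include <stdlib.h>

/* One IPRED_CFL lane: dc is the DC prediction (m0), ac a zero-mean value from
 * the cfl_ac functions, alpha the signed CfL scale, bdmax the clamp in m7. */
static int cfl_pred_sketch(int dc, int ac, int alpha, int bdmax)
{
    const int diff = alpha * ac;
    int v = (abs(diff) + 32) >> 6;   /* round half away from zero */
    if (diff < 0)
        v = -v;
    v += dc;
    return v < 0 ? 0 : v > bdmax ? bdmax : v;
}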
+
+cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d
+ tzcnt t0d, t0d
+ movd m5, t0d
+ LEA t0, ipred_cfl_16bpc_ssse3_table
+ tzcnt wd, wd
+ movd m7, r7m
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ psrlw m4, 1
+ pxor m6, m6
+ pshuflw m7, m7, q0000
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ pcmpeqw m3, m3
+ punpcklqdq m7, m7
+ jmp r6
+.h4:
+ movq m0, [tlq-8]
+ jmp wq
+.w4:
+ movq m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ cmp hd, 4
+ jg .w4_mul
+ psrld m0, 3
+ jmp .w4_end
+.w4_mul:
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 16
+ cmove r6d, r2d
+ movd m1, r6d
+ psrld m0, 2
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s4:
+ movd m1, alpham
+ lea r6, [strideq*3]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ movq [dstq+strideq*0], m3
+ movhps [dstq+strideq*1], m3
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4_loop
+ RET
+.h8:
+ mova m0, [tlq-16]
+ jmp wq
+.w8:
+ movu m1, [tlq+2]
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+strideq*0], m3
+ mova [dstq+strideq*1], m4
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s8_loop
+ RET
+.h16:
+ mova m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w16:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ add acq, 16*2
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ add dstq, strideq
+ dec hd
+ jg .s16_loop
+ RET
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-48]
+ paddw m0, [tlq-32]
+ paddw m0, [tlq-16]
+ jmp wq
+.w32:
+ movu m1, [tlq+ 2]
+ movu m2, [tlq+18]
+ paddw m1, m2
+ movu m2, [tlq+34]
+ paddw m1, m2
+ movu m2, [tlq+50]
+ paddw m1, m2
+ paddw m0, m1
+ pmaddwd m0, m3
+ psubd m4, m0
+ pshufd m0, m4, q1032
+ paddd m0, m4
+ pshuflw m4, m0, q1032
+ paddd m0, m4
+ psrld m0, m5
+ cmp hd, 32
+ je .w32_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 8
+ cmove r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+ psrlw m0, 1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq+16*0]
+ mova m5, [acq+16*1]
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*0], m3
+ mova [dstq+16*1], m4
+ mova m4, [acq+16*2]
+ mova m5, [acq+16*3]
+ add acq, 16*4
+ IPRED_CFL 3, 4
+ IPRED_CFL 4, 5
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
+ tzcnt wd, wm
+ LEA t0, ipred_cfl_splat_16bpc_ssse3_table
+ mov r6d, r7m
+ movifnidn hd, hm
+ shr r6d, 11
+ movd m7, r7m
+ movsxd wq, [t0+wq*4]
+ movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
+ pshuflw m7, m7, q0000
+ pxor m6, m6
+ add wq, t0
+ movifnidn acq, acmp
+ punpcklqdq m7, m7
+ jmp wq
+
+cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ paddw m5, m5
+%else
+ movddup m5, [pw_2]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ pmaddwd m2, m5, [ypxq+strideq*2]
+ pmaddwd m3, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m0, m1
+ paddd m2, m3
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .dc
+ punpckhqdq m0, m0
+ pslld m2, 2
+.w4_hpad:
+ mova [acq+16*0], m0
+ paddd m4, m2
+ mova [acq+16*1], m0
+ add acq, 16*2
+ sub hpadd, 4
+ jg .w4_hpad
+ jmp .dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m1, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m2
+ paddd m1, m3
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz .dc
+ pslld m2, 2
+ mova m1, m0
+ jmp .hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m0, m1
+ pshufd m1, m0, q3333
+ paddd m2, m0, m1
+ packssdw m0, m1
+ paddd m4, m2
+ mova [acq], m0
+ add acq, 16
+ dec hd
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*0]
+ paddd m0, m6
+ cmp wpadd, 2
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*1]
+ paddd m3, m6
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+strideq*0+16*2]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*2]
+ paddd m1, m6
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+strideq*0+16*3]
+ pmaddwd m6, m5, [ypxq+strideq*1+16*3]
+ paddd m2, m6
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ paddd m6, m0, m3
+ packssdw m0, m3
+ paddd m6, m1
+ mova [acq+16*0], m0
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz .dc
+ paddd m2, m2
+.hpad:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m0
+ mova [acq+16*3], m1
+ add acq, 16*4
+ sub hpadd, 4
+ jg .hpad
+.dc:
+ sub r5, acq ; -w*h*2
+ pshufd m2, m4, q1032
+ tzcnt r1d, r5d
+ paddd m2, m4
+ sub r1d, 2
+ pshufd m4, m2, q2301
+ movd m0, r1d
+ paddd m2, m4
+ psrld m2, m0
+ pxor m0, m0
+ pavgw m2, m0
+ packssdw m2, m2
+.dc_loop:
+ mova m0, [acq+r5+16*0]
+ mova m1, [acq+r5+16*1]
+ psubw m0, m2
+ psubw m1, m2
+ mova [acq+r5+16*0], m0
+ mova [acq+r5+16*1], m1
+ add r5, 16*2
+ jl .dc_loop
+ RET
+
+cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+ movifnidn hpadd, hpadm
+%if ARCH_X86_32 && PIC
+ pcmpeqw m5, m5
+ pabsw m5, m5
+ psllw m5, 2
+%else
+ movddup m5, [pw_4]
+%endif
+ mov hd, hm
+ shl hpadd, 2
+ pxor m4, m4
+ sub hd, hpadd
+ cmp dword wm, 8
+ mov r5, acq
+ jg .w16
+ je .w8
+ lea r3, [strideq*3]
+.w4_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m3, m5, [ypxq+strideq*1]
+ pmaddwd m1, m5, [ypxq+strideq*2]
+ pmaddwd m2, m5, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ paddd m4, m0
+ packssdw m0, m3
+ paddd m3, m1
+ packssdw m1, m2
+ paddd m4, m2
+ paddd m4, m3
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ pslld m2, 3
+ mova [acq+16*0], m1
+ mova [acq+16*1], m1
+ paddd m4, m2
+ mova [acq+16*2], m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+%if ARCH_X86_32
+ cmp dword wpadm, 0
+%else
+ test wpadd, wpadd
+%endif
+ jnz .w8_wpad1
+.w8_loop:
+ pmaddwd m0, m5, [ypxq+strideq*0+16*0]
+ pmaddwd m2, m5, [ypxq+strideq*0+16*1]
+ pmaddwd m1, m5, [ypxq+strideq*1+16*0]
+ pmaddwd m3, m5, [ypxq+strideq*1+16*1]
+ lea ypxq, [ypxq+strideq*2]
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ mova [acq+16*0], m0
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w8_wpad1:
+ pmaddwd m0, m5, [ypxq+strideq*0]
+ pmaddwd m1, m5, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ pshufd m2, m0, q3333
+ pshufd m3, m1, q3333
+ paddd m4, m0
+ packssdw m0, m2
+ paddd m4, m2
+ paddd m2, m1, m3
+ packssdw m1, m3
+ paddd m4, m2
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ add acq, 16*2
+ sub hd, 2
+ jg .w8_wpad1
+ jmp .w8_hpad
+.w16_wpad3:
+ pshufd m3, m0, q3333
+ mova m1, m3
+ mova m2, m3
+ jmp .w16_wpad_end
+.w16_wpad2:
+ pshufd m1, m3, q3333
+ mova m2, m1
+ jmp .w16_wpad_end
+.w16_wpad1:
+ pshufd m2, m1, q3333
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ WIN64_SPILL_XMM 7
+.w16_loop:
+ pmaddwd m0, m5, [ypxq+16*0]
+ cmp wpadd, 2
+ jg .w16_wpad3
+ pmaddwd m3, m5, [ypxq+16*1]
+ je .w16_wpad2
+ pmaddwd m1, m5, [ypxq+16*2]
+ jp .w16_wpad1
+ pmaddwd m2, m5, [ypxq+16*3]
+.w16_wpad_end:
+ add ypxq, strideq
+ paddd m6, m0, m3
+ packssdw m0, m3
+ mova [acq+16*0], m0
+ paddd m6, m1
+ packssdw m1, m2
+ paddd m2, m6
+ mova [acq+16*1], m1
+ add acq, 16*2
+ paddd m4, m2
+ dec hd
+ jg .w16_loop
+ WIN64_RESTORE_XMM
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+
+cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
+%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
+ LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table
+ tzcnt wd, wm
+ movifnidn hpadd, hpadm
+ pxor m4, m4
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pw_1]
+ add wq, r6
+ mov hd, hm
+ shl hpadd, 2
+ sub hd, hpadd
+ jmp wq
+.w4:
+ lea r3, [strideq*3]
+ mov r5, acq
+.w4_loop:
+ movq m0, [ypxq+strideq*0]
+ movhps m0, [ypxq+strideq*1]
+ movq m1, [ypxq+strideq*2]
+ movhps m1, [ypxq+r3 ]
+ lea ypxq, [ypxq+strideq*4]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ punpckhqdq m1, m1
+ mova [acq+16*0], m1
+ pslld m2, 2
+ mova [acq+16*1], m1
+ punpckhqdq m2, m2
+ mova [acq+16*2], m1
+ paddd m4, m2
+ mova [acq+16*3], m1
+ add acq, 16*4
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w8:
+ mov r5, acq
+.w8_loop:
+ mova m0, [ypxq+strideq*0]
+ mova m1, [ypxq+strideq*1]
+ lea ypxq, [ypxq+strideq*2]
+ psllw m0, 3
+ psllw m1, 3
+ mova [acq+16*0], m0
+ pmaddwd m0, m5
+ mova [acq+16*1], m1
+ pmaddwd m2, m5, m1
+ add acq, 16*2
+ paddd m4, m0
+ paddd m4, m2
+ sub hd, 2
+ jg .w8_loop
+.w8_hpad:
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ pslld m2, 2
+ mova m0, m1
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w16_wpad2:
+ pshufhw m3, m2, q3333
+ pshufhw m1, m0, q3333
+ punpckhqdq m3, m3
+ punpckhqdq m1, m1
+ jmp .w16_wpad_end
+.w16:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+.w16_loop:
+ mova m2, [ypxq+strideq*0+16*0]
+ mova m0, [ypxq+strideq*1+16*0]
+ psllw m2, 3
+ psllw m0, 3
+ test wpadd, wpadd
+ jnz .w16_wpad2
+ mova m3, [ypxq+strideq*0+16*1]
+ mova m1, [ypxq+strideq*1+16*1]
+ psllw m3, 3
+ psllw m1, 3
+.w16_wpad_end:
+ lea ypxq, [ypxq+strideq*2]
+ mova [acq+16*0], m2
+ pmaddwd m2, m5
+ mova [acq+16*1], m3
+ pmaddwd m3, m5
+ paddd m4, m2
+ pmaddwd m2, m5, m0
+ mova [acq+16*2], m0
+ paddd m4, m3
+ pmaddwd m3, m5, m1
+ mova [acq+16*3], m1
+ add acq, 16*4
+ paddd m2, m3
+ paddd m4, m2
+ sub hd, 2
+ jg .w16_loop
+ add hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+ paddd m2, m2
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
+.w32_wpad6:
+ pshufhw m1, m0, q3333
+ punpckhqdq m1, m1
+ mova m2, m1
+ mova m3, m1
+ jmp .w32_wpad_end
+.w32_wpad4:
+ pshufhw m2, m1, q3333
+ punpckhqdq m2, m2
+ mova m3, m2
+ jmp .w32_wpad_end
+.w32_wpad2:
+ pshufhw m3, m2, q3333
+ punpckhqdq m3, m3
+ jmp .w32_wpad_end
+.w32:
+ movifnidn wpadd, wpadm
+ mov r5, acq
+ WIN64_SPILL_XMM 8
+.w32_loop:
+ mova m0, [ypxq+16*0]
+ psllw m0, 3
+ cmp wpadd, 4
+ jg .w32_wpad6
+ mova m1, [ypxq+16*1]
+ psllw m1, 3
+ je .w32_wpad4
+ mova m2, [ypxq+16*2]
+ psllw m2, 3
+ jnp .w32_wpad2
+ mova m3, [ypxq+16*3]
+ psllw m3, 3
+.w32_wpad_end:
+ add ypxq, strideq
+ pmaddwd m6, m5, m0
+ mova [acq+16*0], m0
+ pmaddwd m7, m5, m1
+ mova [acq+16*1], m1
+ paddd m6, m7
+ pmaddwd m7, m5, m2
+ mova [acq+16*2], m2
+ paddd m6, m7
+ pmaddwd m7, m5, m3
+ mova [acq+16*3], m3
+ add acq, 16*4
+ paddd m6, m7
+ paddd m4, m6
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova m5, m6
+ WIN64_RESTORE_XMM
+ SWAP 5, 6
+%endif
+ test hpadd, hpadd
+ jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+.w32_hpad_loop:
+ mova [acq+16*0], m0
+ mova [acq+16*1], m1
+ paddd m4, m6
+ mova [acq+16*2], m2
+ mova [acq+16*3], m3
+ add acq, 16*4
+ dec hpadd
+ jg .w32_hpad_loop
+ jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
+
+cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
+%define base r2-pal_pred_16bpc_ssse3_table
+%if ARCH_X86_32
+ %define hd r2d
+%endif
+ mova m3, [palq]
+ LEA r2, pal_pred_16bpc_ssse3_table
+ tzcnt wd, wm
+ pshufb m3, [base+pal_pred_shuf]
+ movsxd wq, [r2+wq*4]
+ pshufd m4, m3, q1032
+ add wq, r2
+ movifnidn hd, hm
+ jmp wq
+.w4:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m0, [idxq]
+ add idxq, 16
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16
+ RET
+.w32:
+ mova m0, [idxq+16*0]
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova m2, [idxq+16*1]
+ add idxq, 16*2
+ mova [dstq+16*0], m0
+ pshufb m0, m3, m2
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m2
+ punpcklbw m2, m0, m1
+ punpckhbw m0, m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m0
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+.w64:
+ mova m0, [idxq+16*0]
+ pshufb m1, m3, m0
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova m2, [idxq+16*1]
+ mova [dstq+16*0], m0
+ pshufb m0, m3, m2
+ mova [dstq+16*1], m1
+ pshufb m1, m4, m2
+ punpcklbw m2, m0, m1
+ punpckhbw m0, m1
+ mova m1, [idxq+16*2]
+ mova [dstq+16*2], m2
+ pshufb m2, m3, m1
+ mova [dstq+16*3], m0
+ pshufb m0, m4, m1
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ mova m0, [idxq+16*3]
+ add idxq, 16*4
+ mova [dstq+16*4], m1
+ pshufb m1, m3, m0
+ mova [dstq+16*5], m2
+ pshufb m2, m4, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ add dstq, strideq
+ dec hd
+ jg .w64
+ RET
diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm
new file mode 100644
index 0000000000..dd188a7f37
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_avx2.asm
@@ -0,0 +1,5387 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17
+pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
+ db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
+ db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+ db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
+pb_128: times 4 db 128 ; those are just placed here for alignment.
+pb_36_m4: times 2 db 36, -4
+z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13
+z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
+z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
+z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+ dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
+z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64
+ dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64
+z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7
+ db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5
+; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
+filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1
+ db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1
+pb_127_m127: times 2 db 127, -127
+ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
+ db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15
+ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
+ db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0
+pw_64: times 2 dw 64
+
+cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
+ times 9 db 7, -1
+cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ; w=8, w_pad=1 as well as second half of previous one
+cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
+ times 5 db 6, 7
+ ; w=16,w_pad=2
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ times 8 db 14, 15
+ ; w=16,w_pad=3
+ db 0, 1, 2, 3, 4, 5
+ times 13 db 6, 7
+pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+%define pb_0to15 cfl_ac_w16_pad_shuffle
+%define pb_1 (ipred_h_shuf+12)
+%define pb_2 (ipred_h_shuf+20)
+%define pb_3 (ipred_h_shuf+ 4)
+%define pb_4 (ipred_h_shuf+24)
+%define pb_5 (ipred_h_shuf+ 8)
+%define pb_7 (ipred_h_shuf+ 0)
+%define pb_8 (z_upsample2 +12)
+%define pb_12 (z2_y_shuf_h4+20)
+%define pb_14 (z2_y_shuf_h4+ 4)
+%define pb_15 (z_filter_s +32)
+%define pb_27 (z2_y_shuf_h4+ 8)
+%define pb_31 (z2_y_shuf_h4+12)
+%define pb_32 (z2_y_shuf_h4+16)
+%define pb_90 (z2_y_shuf_h4+ 0)
+%define pw_1 (z2_y_shuf_h4+24)
+%define pw_8 (z_filter_k +32)
+
+pw_62: times 2 dw 62
+pw_128: times 2 dw 128
+pw_255: times 2 dw 255
+pw_512: times 2 dw 512
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
+%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4)
+
+JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32
+JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
+JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
+JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
+ lea r5, [ipred_dc_left_avx2_table]
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ shrx r6d, r6d, wd
+ movd xm3, r6d
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov r5d, 0x8000
+ shrx r5d, r5d, r6d
+ movd xm3, r5d
+ lea r5, [ipred_dc_left_avx2_table]
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+.h4:
+ pmaddwd xm0, xm2
+ pmulhrsw xm0, xm3
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+ mova m1, m0
+ jmp wq
+
+cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d
+ tzcnt r5d, r5d
+ movd xm5, r5d
+ lea r5, [ipred_dc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pcmpeqd m3, m3
+ psrlw xm4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd xm0, [tlq-4]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w4:
+ movd xm1, [tlq+1]
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
+ paddw xm0, xm1
+ shrx r6d, r6d, r2d
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ movd xm1, r6d
+ psrlw xm0, 2
+ pmulhuw xm0, xm1
+.w4_end:
+ vpbroadcastb xm0, xm0
+.s4:
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+ movd [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq xm0, [tlq-8]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w8:
+ movq xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ punpckhqdq xm2, xm0, xm0
+ paddw xm0, xm2
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w8_end:
+ vpbroadcastb xm0, xm0
+.s8:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova xm0, [tlq-16]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w16:
+ movu xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w16_end:
+ vpbroadcastb xm0, xm0
+.s16:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w32_end:
+ vpbroadcastb m0, xm0
+.s32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-32]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+33]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m0, m1
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x33345556
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w64_end:
+ vpbroadcastb m0, xm0
+ mova m1, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128]
+ mova m1, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_avx2_table]
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+33]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+%macro IPRED_H 2 ; w, store_type
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mov%2 [dstq+strideq*0], m0
+ mov%2 [dstq+strideq*1], m1
+ mov%2 [dstq+strideq*2], m2
+ mov%2 [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+ALIGN function_align
+%endmacro
+
+INIT_XMM avx2
+cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_h_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4, d
+.w8:
+ IPRED_H 8, q
+.w16:
+ IPRED_H 16, a
+INIT_YMM avx2
+.w32:
+ IPRED_H 32, a
+.w64:
+ vpbroadcastb m0, [tlq-1]
+ vpbroadcastb m1, [tlq-2]
+ vpbroadcastb m2, [tlq-3]
+ sub tlq, 4
+ vpbroadcastb m3, [tlq+0]
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m0
+ mova [dstq+strideq*1+32*0], m1
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m2
+ mova [dstq+strideq*2+32*1], m2
+ mova [dstq+stride3q +32*0], m3
+ mova [dstq+stride3q +32*1], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64
+ RET
+
+%macro PAETH 2 ; top, ldiff
+ pavgb m1, m%1, m3 ; Calculating tldiff normally requires
+ pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it
+ pand m0, m4 ; in 8-bit with some tricks which avoid
+ psubusb m2, m5, m1 ; having to unpack everything to 16-bit.
+ psubb m1, m0
+ psubusb m1, m5
+ por m1, m2
+ paddusb m1, m1
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3
+ psubusb m0, m3, m5
+ por m2, m0 ; tdiff
+ pminub m2, m%2
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+ vpblendvb m0, m%1, m3, m0
+ pminub m1, m2
+ pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff
+ vpblendvb m0, m5, m0, m1
+%endmacro
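
The comments above cover the 8-bit tldiff trick; the selection itself is the ordinary Paeth rule, with ties resolved toward left, then top, as the two blend masks encode. A scalar sketch of what each lane ends up choosing, written for this annotation rather than copied from the C reference:

#include <stdlib.h>

/* Paeth selection per pixel: predict left + top - topleft, then return
 * whichever of the three neighbours is closest to that prediction. */
static int paeth_sample(int left, int top, int topleft)
{
    const int base   = left + top - topleft;
    const int ldiff  = abs(base - left);    /* == abs(top  - topleft) */
    const int tdiff  = abs(base - top);     /* == abs(left - topleft) */
    const int tldiff = abs(base - topleft); /* needs 10 bits if done directly;
                                             * the asm keeps it saturated to 255 */
    if (ldiff <= tdiff && ldiff <= tldiff)
        return left;
    return tdiff <= tldiff ? top : topleft;
}
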
+
+cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
+%define base r5-ipred_paeth_avx2_table
+ lea r5, [ipred_paeth_avx2_table]
+ tzcnt wd, wm
+ vpbroadcastb m5, [tlq] ; topleft
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m4, [base+pb_1]
+ add wq, r5
+ jmp wq
+.w4:
+ vpbroadcastd m6, [tlq+1] ; top
+ mova m8, [base+ipred_h_shuf]
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ sub tlq, 8
+ vpbroadcastq m3, [tlq]
+ pshufb m3, m8 ; left
+ PAETH 6, 7
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ vpbroadcastq m6, [tlq+1]
+ mova m8, [base+ipred_h_shuf]
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ sub tlq, 4
+ vpbroadcastd m3, [tlq]
+ pshufb m3, m8
+ PAETH 6, 7
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m6, [tlq+1]
+ mova xm8, xm4 ; lower half = 1, upper half = 0
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ sub tlq, 2
+ vpbroadcastd m3, [tlq]
+ pshufb m3, m8
+ PAETH 6, 7
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w32_loop:
+ dec tlq
+ vpbroadcastb m3, [tlq]
+ PAETH 6, 7
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m6, [tlq+ 1]
+ movu m7, [tlq+33]
+%if WIN64
+ movaps r4m, xmm9
+%endif
+ psubusb m8, m5, m6
+ psubusb m0, m6, m5
+ psubusb m9, m5, m7
+ psubusb m1, m7, m5
+ por m8, m0
+ por m9, m1
+.w64_loop:
+ dec tlq
+ vpbroadcastb m3, [tlq]
+ PAETH 6, 8
+ mova [dstq+32*0], m0
+ PAETH 7, 9
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+%if WIN64
+ movaps xmm9, r4m
+%endif
+ RET
+
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+ ; w * a = (w - 128) * a + 128 * a
+ ; (256 - w) * b = (127 - w) * b + 129 * b
+ pmaddubsw m0, m%3, m%1
+ pmaddubsw m1, m%4, m%2
+ paddw m0, m%5
+ paddw m1, m%6
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
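
The two identities in the macro comments are what make pmaddubsw applicable: the unsigned weight w (0..255) is stored in smooth_weights as the signed byte pair (w-128, 127-w), and the leftover 128*a + 129*b plus rounding is folded into the add[1-2] operands before the loop. A scalar sketch of one reconstructed sample under those assumptions (illustrative, not taken from the C reference):

/* One smooth sample: the pmaddubsw part plus the precomputed bias
 * reassemble (w*a + (256 - w)*b + 128) >> 8 exactly. */
static int smooth_sample(int w, int a, int b)
{
    const int bias = 128 * a + 129 * b + 128;       /* add[] operand, precomputed */
    const int part = (w - 128) * a + (127 - w) * b; /* what pmaddubsw contributes */
    return (part + bias) >> 8;                      /* == (w*a + (256-w)*b + 128) >> 8 */
}
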
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_avx2_table
+ lea r6, [ipred_smooth_v_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m0, [base+pb_127_m127]
+ vpbroadcastd m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ vpbroadcastb m5, [tlq+hq] ; bottom
+ add wq, r6
+ jmp wq
+.w4:
+ vpbroadcastd m2, [tlq+1]
+ punpcklbw m2, m5 ; top, bottom
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ punpckldq m4, m5, m5
+ punpckhdq m5, m5
+ pmaddubsw m3, m2, m0
+ paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; 128 * top + 129 * bottom + 128
+.w4_loop:
+ vbroadcasti128 m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 1
+ pextrd [dstq+r3 ], xm1, 1
+ cmp hd, -4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm1, 2
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ add hq, 8
+ jl .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ vpbroadcastq m2, [tlq+1]
+ punpcklbw m2, m5
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1
+.w8_loop:
+ vpbroadcastq m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ WIN64_SPILL_XMM 7
+ vbroadcasti128 m3, [tlq+1]
+ mova m6, [base+ipred_v_shuf]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w16_loop:
+ vpbroadcastd m1, [weightsq+hq*2]
+ pshufb m1, m6
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 6
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w32_loop:
+ vpbroadcastw m1, [weightsq+hq*2]
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m0
+ add dstq, strideq
+ inc hq
+ jl .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ WIN64_SPILL_XMM 11
+ movu m4, [tlq+ 1]
+ movu m8, [tlq+33]
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m7, m8, m5
+ punpckhbw m8, m5
+ pmaddubsw m5, m3, m0
+ pmaddubsw m6, m4, m0
+ pmaddubsw m9, m7, m0
+ pmaddubsw m10, m8, m0
+ paddw m2, m1, m3
+ paddw m5, m2
+ paddw m2, m1, m4
+ paddw m6, m2
+ paddw m0, m1, m7
+ paddw m9, m0
+ paddw m1, m8
+ paddw m10, m1
+.w64_loop:
+ vpbroadcastw m2, [weightsq+hq*2]
+ SMOOTH 2, 2, 3, 4, 5, 6
+ mova [dstq+32*0], m0
+ SMOOTH 2, 2, 7, 8, 9, 10
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
+ %assign stack_offset 0
+ %assign stack_size_padded 0
+ %assign regs_used %2
+ %xdefine rstk rsp
+ SETUP_STACK_POINTER %1
+ %if regs_used != %2 && WIN64
+ PUSH r%2
+ %endif
+ ALLOC_STACK %1, %3
+%endmacro
+
+cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_avx2_table
+ lea r6, [ipred_smooth_h_avx2_table]
+ mov wd, wm
+ vpbroadcastb m3, [tlq+wq] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m4, [base+pb_127_m127]
+ vpbroadcastd m5, [base+pw_128]
+ add wq, r6
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 8
+ vpbroadcastq m6, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 8
+ sub tlq, hq
+ lea r3, [strideq*3]
+.w4_loop:
+ vpbroadcastq m2, [tlq+hq]
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+ vbroadcasti128 m6, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+.w8_loop:
+ vpbroadcastd m2, [tlq+hq]
+ pshufb m2, m7
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4
+ paddw m0, m1
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ SETUP_STACK_FRAME 32*4, 7, 8
+ lea r3, [rsp+64*2-4]
+ call .prep ; only worthwhile for w16 and above
+ sub tlq, 2
+ vpbroadcastd xm6, [base+pb_1]
+ mova xm7, [base+ipred_v_shuf+16]
+ vinserti128 m7, [base+ipred_v_shuf+ 0], 1
+ vbroadcasti128 m4, [base+smooth_weights+16*2]
+ vbroadcasti128 m5, [base+smooth_weights+16*3]
+.w16_loop:
+ vpbroadcastd m1, [tlq+hq]
+ vpbroadcastd m2, [r3+hq*2]
+ pshufb m1, m6
+ punpcklbw m1, m3
+ pshufb m2, m7
+ SMOOTH 4, 5, 1, 1, 2, 2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ SETUP_STACK_FRAME 32*4, 7, 6
+ lea r3, [rsp+64*2-2]
+ call .prep
+ dec tlq
+ mova xm4, [base+smooth_weights+16*4]
+ vinserti128 m4, [base+smooth_weights+16*6], 1
+ mova xm5, [base+smooth_weights+16*5]
+ vinserti128 m5, [base+smooth_weights+16*7], 1
+.w32_loop:
+ vpbroadcastb m1, [tlq+hq]
+ punpcklbw m1, m3
+ vpbroadcastw m2, [r3+hq*2]
+ SMOOTH 4, 5, 1, 1, 2, 2
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ SETUP_STACK_FRAME 32*4, 7, 9
+ lea r3, [rsp+64*2-2]
+ call .prep
+ add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
+ dec tlq
+ mova xm5, [r6-16*7]
+ vinserti128 m5, [r6-16*5], 1
+ mova xm6, [r6-16*6]
+ vinserti128 m6, [r6-16*4], 1
+ mova xm7, [r6-16*3]
+ vinserti128 m7, [r6-16*1], 1
+ mova xm8, [r6-16*2]
+ vinserti128 m8, [r6-16*0], 1
+.w64_loop:
+ vpbroadcastb m2, [tlq+hq]
+ punpcklbw m2, m3
+ vpbroadcastw m4, [r3+hq*2]
+ SMOOTH 5, 6, 2, 2, 4, 4
+ mova [dstq+32*0], m0
+ SMOOTH 7, 8, 2, 2, 4, 4
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+ALIGN function_align
+.prep:
+ vpermq m2, [tlq-32*1], q3120
+ punpckhbw m1, m2, m3
+ punpcklbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m1, m5 ; 1 * left + 256 * right + 128
+ paddw m0, m1 ; 128 * left + 129 * right + 128
+ pmaddubsw m1, m2, m4
+ paddw m2, m5
+ paddw m1, m2
+ vpermq m2, [tlq-32*2], q3120
+ mova [rsp+gprsize+32*3], m0
+ mova [rsp+gprsize+32*2], m1
+ punpckhbw m1, m2, m3
+ punpcklbw m2, m3
+ pmaddubsw m0, m1, m4
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m2, m5
+ paddw m1, m2
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*0], m1
+ sub r3, hq
+ sub tlq, hq
+ sub r3, hq
+ ret
+
+%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
+ pmaddubsw m0, m%3, m%1
+ pmaddubsw m1, m%4, m%2
+%ifnum %5
+ paddw m0, m%5
+%else
+ paddw m0, %5
+%endif
+%ifnum %6
+ paddw m1, m%6
+%else
+ paddw m1, %6
+%endif
+ pavgw m0, m2
+ pavgw m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
+
+cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_avx2_table
+ lea r6, [ipred_smooth_avx2_table]
+ mov wd, wm
+ vpbroadcastb m4, [tlq+wq] ; right
+ tzcnt wd, wd
+ mov hd, hm
+ mov r5, tlq
+ sub r5, hq
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m5, [base+pb_127_m127]
+ vpbroadcastb m0, [r5] ; bottom
+ vpbroadcastd m3, [base+pw_255]
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights+hq*2]
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 12
+ mova m10, [base+ipred_h_shuf]
+ vpbroadcastq m11, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_v_shuf]
+ vpbroadcastd m8, [tlq+1]
+ sub tlq, 8
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m8, m0 ; top, bottom
+ pshufd m6, m7, q2200
+ pshufd m7, m7, q3311
+ pmaddubsw m9, m8, m5
+ paddw m3, m8 ; 1 * top + 255 * bottom + 255
+ paddw m9, m3 ; 128 * top + 129 * bottom + 255
+.w4_loop:
+ vpbroadcastq m1, [tlq+hq]
+ pshufb m1, m10
+ punpcklbw m0, m1, m4 ; left, right
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
+ pmaddubsw m3, m1, m5
+ paddw m2, m0 ; 128 * left + 129 * right
+ paddw m3, m1
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ paddw m2, m0
+ paddw m3, m1
+ vbroadcasti128 m1, [v_weightsq]
+ add v_weightsq, 16
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ SMOOTH_2D_END 0, 1, 8, 8, 9, 9
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r3 ], xm1, 2
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+r3 ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 8
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ mova m10, [base+ipred_h_shuf]
+ vbroadcasti128 m11, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_v_shuf]
+ vpbroadcastq m8, [tlq+1]
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m8, m0
+ pshufd m6, m7, q0000
+ pshufd m7, m7, q1111
+ pmaddubsw m9, m8, m5
+ paddw m3, m8
+ paddw m9, m3
+.w8_loop:
+ vpbroadcastd m1, [tlq+hq]
+ pshufb m1, m10
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5
+ pmaddubsw m3, m1, m5
+ paddw m2, m0
+ paddw m3, m1
+ pmaddubsw m0, m11
+ pmaddubsw m1, m11
+ paddw m2, m0
+ paddw m3, m1
+ vpbroadcastq m1, [v_weightsq]
+ add v_weightsq, 8
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ SMOOTH_2D_END 0, 1, 8, 8, 9, 9
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+r3 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ SETUP_STACK_FRAME 32*4, 7, 14
+ vbroadcasti128 m11, [tlq+1]
+ lea r3, [rsp+64*2-4]
+ punpcklbw m10, m11, m0 ; top, bottom
+ punpckhbw m11, m0
+ call .prep_v
+ sub tlq, 2
+ pmaddubsw m12, m10, m5
+ pmaddubsw m13, m11, m5
+ vpbroadcastd xm5, [base+pb_1]
+ mova m9, [base+ipred_v_shuf]
+ vbroadcasti128 m6, [base+smooth_weights+16*2]
+ vbroadcasti128 m7, [base+smooth_weights+16*3]
+ vperm2i128 m8, m9, m9, 0x01
+ paddw m0, m10, m3
+ paddw m3, m11
+ paddw m12, m0
+ paddw m13, m3
+.w16_loop:
+ vpbroadcastd m3, [tlq+hq]
+ vpbroadcastd m0, [r3+hq*2]
+ vpbroadcastd m1, [v_weightsq]
+ add v_weightsq, 4
+ pshufb m3, m5
+ punpcklbw m3, m4 ; left, right
+ pmaddubsw m2, m3, m6
+ pmaddubsw m3, m7
+ pshufb m0, m8
+ pshufb m1, m9
+ paddw m2, m0
+ paddw m3, m0
+ SMOOTH_2D_END 1, 1, 10, 11, 12, 13
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ SETUP_STACK_FRAME 32*4, 7, 11
+ movu m8, [tlq+1]
+ lea r3, [rsp+64*2-2]
+ punpcklbw m7, m8, m0
+ punpckhbw m8, m0
+ call .prep_v
+ dec tlq
+ pmaddubsw m9, m7, m5
+ pmaddubsw m10, m8, m5
+ mova xm5, [base+smooth_weights+16*4]
+ vinserti128 m5, [base+smooth_weights+16*6], 1
+ mova xm6, [base+smooth_weights+16*5]
+ vinserti128 m6, [base+smooth_weights+16*7], 1
+ paddw m0, m7, m3
+ paddw m3, m8
+ paddw m9, m0
+ paddw m10, m3
+.w32_loop:
+ vpbroadcastb m3, [tlq+hq]
+ punpcklbw m3, m4
+ vpbroadcastw m0, [r3+hq*2]
+ vpbroadcastw m1, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m3, m5
+ pmaddubsw m3, m6
+ paddw m2, m0
+ paddw m3, m0
+ SMOOTH_2D_END 1, 1, 7, 8, 9, 10
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ SETUP_STACK_FRAME 32*8, 7, 16
+ movu m13, [tlq+1 ]
+ movu m15, [tlq+33]
+ add r6, smooth_weights+16*15-ipred_smooth_avx2_table
+ lea r3, [rsp+64*2-2]
+ punpcklbw m12, m13, m0
+ punpckhbw m13, m0
+ punpcklbw m14, m15, m0
+ punpckhbw m15, m0
+ call .prep_v
+ dec tlq
+ pmaddubsw m0, m12, m5
+ pmaddubsw m1, m13, m5
+ pmaddubsw m2, m14, m5
+ pmaddubsw m5, m15, m5
+ mova xm8, [r6-16*7]
+ vinserti128 m8, [r6-16*5], 1
+ mova xm9, [r6-16*6]
+ vinserti128 m9, [r6-16*4], 1
+ mova xm10, [r6-16*3]
+ vinserti128 m10, [r6-16*1], 1
+ mova xm11, [r6-16*2]
+ vinserti128 m11, [r6-16*0], 1
+ lea r6, [rsp+32*4]
+ paddw m0, m3
+ paddw m1, m3
+ paddw m2, m3
+ paddw m3, m5
+ paddw m0, m12
+ paddw m1, m13
+ paddw m2, m14
+ paddw m3, m15
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+.w64_loop:
+ vpbroadcastb m5, [tlq+hq]
+ punpcklbw m5, m4
+ vpbroadcastw m6, [r3+hq*2]
+ vpbroadcastw m7, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m5, m8
+ pmaddubsw m3, m5, m9
+ paddw m2, m6
+ paddw m3, m6
+ SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1]
+ mova [dstq+32*0], m0
+ pmaddubsw m2, m5, m10
+ pmaddubsw m3, m5, m11
+ paddw m2, m6
+ paddw m3, m6
+ SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3]
+ mova [dstq+32*1], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+ALIGN function_align
+.prep_v:
+ vpermq m2, [tlq-32*1], q3120
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ pmaddubsw m0, m1, m5 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m2, m5
+ paddw m1, m2
+ vpermq m2, [tlq-32*2], q3120
+ mova [rsp+gprsize+32*3], m0
+ mova [rsp+gprsize+32*2], m1
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ pmaddubsw m0, m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m5
+ paddw m1, m2
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*0], m1
+ sub r3, hq
+ sub tlq, hq
+ sub r3, hq
+ ret
+
+cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ inc tlq
+ movsxd wq, [r6+wq*4]
+ add wq, r6
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ movzx dxd, word [r7+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ vpbroadcastd m3, [pw_512]
+ vpbroadcastd m4, [pw_62]
+ vpbroadcastd m5, [pw_64]
+ jmp wq
+.w4:
+ cmp angleb, 40
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ ALLOC_STACK -32, 8
+ mova xm1, [tlq-1]
+ pshufb xm0, xm1, [z_upsample1]
+ pshufb xm1, [z_upsample2]
+ vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
+ add dxd, dxd ; pw_512 (which is already in m3)
+ pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
+ pextrd [rsp+16], xm1, 3 ; top[max_base_x]
+ pmaddubsw xm1, xm2
+ movd xm7, dxd
+ mov r3d, dxd ; xpos
+ vpbroadcastw m7, xm7
+ paddw xm1, xm0
+ movq xm0, [tlq]
+ pmulhrsw xm1, xm3
+ pslldq m6, m7, 8
+ paddw xm2, xm7, xm7
+ lea r2, [strideq*3]
+ paddw m6, m7
+ packuswb xm1, xm1
+ paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1
+ punpcklbw xm0, xm1
+ psllw m7, 2
+ mova [rsp], xm0
+.w4_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [rsp+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [rsp+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r2 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+ ; The C version uses a lot of branches, but we can do all the comparisons
+ ; in parallel and use popcnt to get the final filter strength value.
+%define base r3-z_filter_t0
+ lea r3, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm2, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m2, xm2
+ pcmpeqb m1, m0, [base+z_filter_wh]
+ pand m1, m2
+ mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
+ pcmpgtb m1, m2
+ pmovmskb r5d, m1
+ ret
+.w4_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -16, 11
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m7, [base+pb_8]
+ vbroadcasti128 m2, [tlq-1]
+ pminub m1, m7, [base+z_filter_s]
+ vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
+ pminub m7, [base+z_filter_s+8]
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
+ pshufb m0, m2, m1
+ shufps m1, m7, q2121
+ pmaddubsw m0, m8
+ pshufb m1, m2, m1
+ pmaddubsw m1, m9
+ pshufb m2, m7
+ pmaddubsw m2, m10
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, m3
+ mov r3d, 9
+ mov tlq, rsp
+ cmp hd, 4
+ cmovne maxbased, r3d
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm6, dxd
+ vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ mov r3d, dxd ; xpos
+ movd xm9, maxbased
+ vpbroadcastw m9, xm9
+ vbroadcasti128 m8, [z1_shuf_w4]
+ psrlw m7, 8 ; top[max_base_x]
+ paddw m10, m6, m6
+ psubw m9, m0 ; max_base_x
+ vpblendd m6, m10, 0xcc
+ mova xm0, xm10
+ paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1
+ paddw m10, m10
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [tlq+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [tlq+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m6 ; base < max_base_x
+ pmulhrsw m0, m3
+ paddw m6, m10 ; xpos += dx
+ lea r5, [dstq+strideq*2]
+ vpblendvb m0, m7, m0, m1
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [r5 +strideq*0], xm0
+ pextrd [r5 +strideq*1], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r3d, maxbased
+ jb .w4_loop
+ packuswb xm7, xm7
+ lea r6, [strideq*3]
+.w4_end_loop:
+ movd [dstq+strideq*0], xm7
+ movd [dstq+strideq*1], xm7
+ movd [dstq+strideq*2], xm7
+ movd [dstq+r6 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+ALIGN function_align
+.w8:
+ lea r3d, [angleq+216]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 8
+ movu xm2, [z_filter_s+6]
+ mova xm0, [tlq-1]
+ movd xm6, hd
+ vinserti128 m0, [tlq+7], 1
+ vpbroadcastb xm6, xm6
+ vbroadcasti128 m1, [z_upsample1]
+ pminub xm6, xm2
+ vpbroadcastd m7, [pb_36_m4]
+ vinserti128 m2, xm6, 1
+ add dxd, dxd
+ pshufb m1, m0, m1
+ pshufb m2, m0, m2
+ movd xm6, dxd
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ vpbroadcastw m6, xm6
+ mov r3d, dxd
+ psrldq m0, 1
+ lea r2, [strideq*3]
+ paddw m7, m6, m6
+ paddw m1, m2
+ vpblendd m6, m7, 0xf0
+ pmulhrsw m1, m3
+ pslldq m2, m7, 8
+ paddw m7, m7
+ paddw m6, m2
+ packuswb m1, m1
+ punpcklbw m0, m1
+ mova [rsp], m0
+.w8_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [rsp+r5], 1
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ punpcklqdq m1, m2, m2 ; frac0 frac1
+ pmaddubsw m0, m1
+ movu xm1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ vinserti128 m1, [rsp+r5], 1
+ punpckhqdq m2, m2 ; frac2 frac3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ paddw m6, m7
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(h+7, 15)
+ jmp .w8_main
+.w8_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 10
+ lea maxbased, [hq+7]
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ popcnt r5d, r5d
+ movu xm2, [tlq]
+ pminub xm1, xm0, [base+z_filter_s+14]
+ vinserti128 m2, [tlq-1], 1
+ vinserti128 m1, [base+z_filter_s+ 0], 1
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pminub xm0, [base+z_filter_s+22]
+ vinserti128 m0, [base+z_filter_s+ 8], 1
+ pshufb m6, m2, m1
+ pmaddubsw m6, m7
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
+ movzx r3d, byte [tlq+15]
+ shufps m1, m0, q2121
+ pshufb m1, m2, m1
+ pmaddubsw m1, m7
+ paddw m1, m6
+ sub r5d, 3
+ jnz .w8_3tap
+ ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
+ ; which also results in an awkward edge case where out[w*2] is
+ ; slightly different from out[max_base_x] when h > w.
+ vpbroadcastd m7, [z_filter_k+4*8]
+ movzx r2d, byte [tlq+14]
+ pshufb m2, m0
+ pmaddubsw m2, m7
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
+ mov [rsp+16], r2b
+ paddw m1, m2
+.w8_3tap:
+ pmulhrsw m1, m3
+ sar r5d, 1
+ mov tlq, rsp
+ add r5d, 17 ; w*2 + (filter_strength == 3)
+ cmp hd, 16
+ cmovns maxbased, r5d
+ mov [tlq+r5], r3b
+ vextracti128 xm0, m1, 1
+ packuswb xm0, xm1
+ mova [tlq], xm0
+.w8_main:
+ movd xm2, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastw m2, xm2
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ psrlw m7, 8
+ psubw m9, m0
+ mov r3d, dxd
+ paddw m6, m2, m2
+ vpblendd m2, m6, 0xf0
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6
+ pand m0, m4, m2
+ psubw m1, m5, m0
+ psllw m0, 8
+ por m1, m0
+ movu xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5], 1
+ pshufb m0, m8
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m2
+ paddw m2, m6
+ pmulhrsw m0, m3
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ sub hd, 2
+ jz .w8_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w8_loop
+ packuswb xm7, xm7
+.w8_end_loop:
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(h+15, 31)
+ jmp .w16_main
+ALIGN function_align
+.w16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 12
+ lea maxbased, [hq+15]
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m1, [base+pb_12]
+ vbroadcasti128 m6, [base+z_filter_s+8]
+ vinserti128 m2, m6, [base+z_filter_s], 0
+ vinserti128 m6, [base+z_filter_s+16], 1
+ mova xm10, [tlq-1]
+ vinserti128 m10, [tlq+3], 1
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
+ vbroadcasti128 m7, [base+z_filter_s+14]
+ vinserti128 m8, m7, [base+z_filter_s+6], 0
+ vinserti128 m7, [base+z_filter_s+22], 1
+ psubw m0, m1
+ movu xm11, [tlq+12]
+ vinserti128 m11, [tlq+16], 1
+ pminub m8, m0
+ pminub m7, m0
+ pshufb m0, m10, m2
+ shufps m2, m6, q2121
+ pmaddubsw m0, m9
+ pshufb m1, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m1, m9
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ movzx r3d, byte [tlq+31]
+ pshufb m2, m10, m2
+ pmaddubsw m2, m9
+ pshufb m8, m11, m8
+ pmaddubsw m8, m9
+ paddw m0, m2
+ paddw m1, m8
+ sub r5d, 3
+ jnz .w16_3tap
+ vpbroadcastd m9, [z_filter_k+4*8]
+ movzx r2d, byte [tlq+30]
+ pshufb m10, m6
+ pmaddubsw m10, m9
+ pshufb m11, m7
+ pmaddubsw m11, m9
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3
+ mov [rsp+32], r2b
+ paddw m0, m10
+ paddw m1, m11
+.w16_3tap:
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ sar r5d, 1
+ mov tlq, rsp
+ add r5d, 33
+ cmp hd, 32
+ cmovns maxbased, r5d
+ mov [tlq+r5], r3b
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [tlq], m0
+.w16_main:
+ movd xm6, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ mov r3d, dxd
+ psubw m9, m0
+ paddw m11, m6, m6
+ psubw m10, m9, m3 ; 64*8 (threshold shifted by 8 columns, for pixels 8-15 of the row)
+ vpblendd m6, m11, 0xf0
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r3+0]
+ movu xm1, [tlq+r3+8]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5+0], 1
+ vinserti128 m1, [tlq+r5+8], 1
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r3d, maxbased
+ jb .w16_loop
+.w16_end_loop:
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+ALIGN function_align
+.w32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 15
+ lea r3d, [hq+31]
+ mov maxbased, 63
+ cmp hd, 32
+ cmovs maxbased, r3d
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vbroadcasti128 m0, [pb_0to15]
+ sub r3d, 29 ; h+2
+ movu xm13, [tlq+29] ; 32-39
+ movd xm1, r3d
+ movu xm14, [tlq+37] ; 40-47
+ sub r3d, 8 ; h-6
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ vpbroadcastb xm1, xm1
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movd xm2, r3d
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ pminub xm1, xm0 ; clip 32x8
+ mova m7, [z_filter_s+0]
+ pshufb xm13, xm1
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb xm2, xm2
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vpblendd m2, m1, 0xf0
+ vinserti128 m7, [z_filter_s+12], 0
+ pminub m2, m0 ; clip 32x16 and 32x(32|64)
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m14, m2
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m8
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m8
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m8
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m7
+ pmaddubsw m12, m9
+ movzx r3d, byte [tlq+63]
+ movzx r2d, byte [tlq+62]
+ paddw m0, m11
+ paddw m2, m12
+ pshufb m13, m7
+ pmaddubsw m13, m9
+ pshufb m14, m7
+ pmaddubsw m14, m9
+ paddw m1, m13
+ paddw m6, m14
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4] ; edge case for 32x64
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ shr r2d, 3
+ mov [rsp+64], r2b
+ mov tlq, rsp
+ mov [tlq+65], r3b
+ mov r3d, 65
+ cmp hd, 64
+ cmove maxbased, r3d
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w32_main:
+ movd xm6, dxd
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m6, xm6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ vpbroadcastw m9, xm9
+ mov r5d, dxd
+ psubw m9, [z_base_inc]
+ mova m11, m6
+ psubw m10, m9, m3 ; 64*8
+.w32_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu m0, [tlq+r3+0]
+ movu m1, [tlq+r3+8]
+ add r5d, dxd
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [dstq], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w32_loop
+ test hb, 1
+ jz .w32_end_loop
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jz .w32_end
+.w32_end_loop:
+ mova [dstq+strideq*0], m7
+ mova [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ RET
+ALIGN function_align
+.w64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -128, 16
+ lea maxbased, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ mova m7, [z_filter_s+0]
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vinserti128 m7, [z_filter_s+12], 0
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ movu xm13, [tlq+29] ; 32-39
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ movu xm14, [tlq+37] ; 40-47
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ shufps m15, m8, m7, q2121
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m15
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m10, [z_filter_k+4*2+12*2]
+ pshufb m11, m15
+ pmaddubsw m11, m10
+ pshufb m12, m7
+ pmaddubsw m12, m10
+ pshufb m13, m7
+ pmaddubsw m13, m10
+ pshufb m14, m7
+ pmaddubsw m14, m10
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ movu xm11, [tlq+ 61] ; 64- 71
+ vinserti128 m11, [tlq+ 75], 1 ; 80- 87
+ movu xm12, [tlq+ 69] ; 72- 79
+ vinserti128 m12, [tlq+ 83], 1 ; 88- 95
+ movu xm13, [tlq+ 93] ; 96-103
+ vinserti128 m13, [tlq+107], 1 ; 112-119
+ movu xm14, [tlq+101] ; 104-111
+ vinserti128 m14, [tlq+115], 1 ; 120-127
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ lea r3d, [hq-20]
+ mov tlq, rsp
+ packuswb m0, m2
+ packuswb m1, m6
+ vpbroadcastd xm2, [pb_14]
+ vbroadcasti128 m6, [pb_0to15]
+ mova [tlq+32*0], m0
+ mova [tlq+32*1], m1
+ movd xm0, r3d
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb m0, xm0
+ paddb m0, m2
+ pminub m0, m6 ; clip 64x16 and 64x32
+ pshufb m12, m0
+ pminub m1, m6 ; clip 64x64
+ pshufb m14, m1
+ pshufb m0, m11, m7
+ pmaddubsw m0, m10
+ pshufb m2, m12, m7
+ pmaddubsw m2, m10
+ pshufb m1, m13, m7
+ pmaddubsw m1, m10
+ pshufb m6, m14, m7
+ pmaddubsw m6, m10
+ pshufb m7, m11, m15
+ pmaddubsw m7, m9
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m0, m7
+ pshufb m7, m13, m15
+ pmaddubsw m7, m9
+ paddw m2, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m1, m7
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m8
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+32*2], m0
+ mova [tlq+32*3], m1
+.w64_main:
+ movd xm12, dxd
+ vpbroadcastb m7, [tlq+maxbaseq]
+ lea r3d, [dxq-64]
+ shl maxbased, 6
+ vpbroadcastw m12, xm12
+ sub r3d, maxbased
+ vbroadcasti128 m8, [z_filter_s+2]
+ movd xm6, r3d
+ mov r5d, dxd
+ mova m10, [pb_1to32]
+ vpbroadcastd m11, [pb_32]
+ vpbroadcastw m6, xm6
+.w64_loop:
+ mov r3d, r5d
+ shr r3d, 6
+ movu m0, [tlq+r3+ 0]
+ movu m1, [tlq+r3+ 8]
+ pand m2, m4, m6
+ psubw m9, m5, m2
+ psllw m2, 8
+ por m9, m2
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ psraw m2, m6, 6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packsswb m2, m2
+ paddb m2, m10
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [dstq+ 0], m0
+ movu m0, [tlq+r3+32]
+ movu m1, [tlq+r3+40]
+ add r5d, dxd
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ paddb m2, m11
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m6, m12
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [dstq+32], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ cmp r5d, maxbased
+ jb .w64_loop
+.w64_end_loop:
+ mova [dstq+ 0], m7
+ mova [dstq+32], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+
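+; ipred_z2: directional prediction for 90 < angle < 180. Samples are
+; interpolated from both the top and the left edge; once base_x steps past
+; the top-left corner, the left-edge result is blended in instead.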
+cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy
+%define base r9-z_filter_t0
+ lea r9, [ipred_z2_avx2_table]
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea dxq, [dr_intra_derivative-90]
+ movsxd wq, [r9+wq*4]
+ movzx dyd, angleb
+ xor angled, 0x400
+ mov r8, dxq
+ sub dxq, dyq
+ add wq, r9
+ add r9, z_filter_t0-ipred_z2_avx2_table
+ mova m2, [tlq-64]
+ mova m0, [tlq-32]
+ mova m1, [tlq]
+ and dyd, ~1
+ and dxq, ~1
+ movzx dyd, word [r8+dyq] ; angle - 90
+ movzx dxd, word [dxq+270] ; 180 - angle
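+ ; dx (top edge) and dy (left edge) are step sizes in 1/64-pel units, looked
+ ; up from dr_intra_derivative for the two component angles noted above.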
+ vpbroadcastd m13, [base+pw_512]
+ vpbroadcastd m14, [base+pw_62]
+ vpbroadcastd m15, [base+pw_64]
+ mova [rsp+ 0], m2
+ mova [rsp+32], m0
+ mova [rsp+64], m1
+ neg dxd
+ neg dyd
+ jmp wq
+.w4:
+ vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6
+ vbroadcasti128 m10, [base+z1_shuf_w4]
+ vbroadcasti128 m11, [base+z2_shuf_h4]
+ lea r2d, [dxq+(65<<6)] ; xpos
+ movd xm5, dyd
+ mov r8d, (63-4)<<6
+ mov dyq, -4
+ pshuflw xm5, xm5, q0000
+ pmullw xm5, [base+z2_ymul]
+ test angled, 0x400
+ jnz .w4_main ; !enable_intra_edge_filter
+ lea r3d, [hq+2]
+ add angled, 1022
+ shl r3d, 6
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm3, [base+pb_4]
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ call .filter_strength
+ jmp .w4_filter_left
+ALIGN function_align
+.filter_strength:
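+ ; Derive the intra-edge filter-strength mask: match the block-size key
+ ; against z_filter_wh, compare the angle against the z_filter_t0 thresholds
+ ; selected by is_sm, and return the byte mask in r3d (the caller popcnts it).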
+ movd xm8, r3d
+ mov r3d, angled
+ movd xm7, angled
+ vpbroadcastb m8, xm8
+ shr r3d, 8 ; is_sm << 1
+ vpbroadcastb m7, xm7
+ pcmpeqb m8, [base+z_filter_wh]
+ mova xm9, [r9+r3*8]
+ pand m0, m8, m7
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ ret
+ALIGN function_align
+.upsample_above: ; w4/w8
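+ ; 2x upsample of the top edge: interpolate between samples with a
+ ; (-1 9 9 -1)/16 filter (pb_36_m4 taps, >>6 rounding via pmulhrsw) and
+ ; interleave the result with the original samples; dx and the base
+ ; increments are doubled to match the finer edge.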
+ pshufb xm2, xm1, [base+z_upsample1-2]
+ pminub xm3, [base+z_filter_s+4]
+ vpbroadcastd xm4, [base+pb_36_m4]
+ vbroadcasti128 m10, [base+pb_0to15]
+ pshufb xm3, xm1, xm3
+ pmaddubsw xm2, xm4
+ pmaddubsw xm3, xm4
+ lea r2d, [r2+dxq+(1<<6)]
+ add dxd, dxd
+ paddw xm2, xm3
+ pmulhrsw xm2, xm13
+ sub r8d, 3<<6
+ paddw m6, m6
+ packuswb xm2, xm2
+ punpcklbw xm1, xm2
+ mova [rsp+gprsize+64], xm1
+ ret
+ALIGN function_align
+.upsample_left: ; h4/h8
+ mov r3d, hd
+ and r3d, 4
+ movd xm2, [rsp+gprsize+64]
+ movddup xm0, [rsp+gprsize+56]
+ movd xm1, r3d
+ palignr xm2, xm0, 1
+ vpbroadcastb xm1, xm1
+ pshufb xm2, [base+z_filter_s+18]
+ vpbroadcastd xm3, [base+pb_36_m4]
+ pmaxub xm1, [base+z_upsample1-2]
+ pshufb xm1, xm0, xm1
+ pmaddubsw xm2, xm3
+ pmaddubsw xm1, xm3
+ paddw xm5, xm5
+ add dyq, dyq
+ paddw xm1, xm2
+ pmulhrsw xm1, xm13
+ vbroadcasti128 m11, [base+z2_upsample]
+ paddw xm5, xm15
+ packuswb xm1, xm1
+ punpcklbw xm0, xm1
+ mova [rsp+gprsize+48], xm0
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ sub angled, 1112 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w4_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm2, [base+pb_4]
+ pminub xm2, [base+z_filter_s]
+ vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ pshufb xm3, xm1, xm2 ; 00 01 12 23
+ pshufd xm2, xm2, q0321
+ pmaddubsw xm0, xm3, xm0
+ pshufb xm2, xm1, xm2 ; 12 23 34 44
+ pmaddubsw xm2, xm4
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
+ punpckhqdq xm3, xm3 ; 34 44 44 44
+ pmaddubsw xm3, xm4
+ movd xm4, r6m ; max_width
+ pminsw xm4, xm15
+ vpbroadcastb xm4, xm4
+ paddw xm0, xm2
+ paddw xm0, xm3
+ pmulhrsw xm0, xm13
+ psubb xm4, [base+pb_1to32]
+ psrlq xm1, 8
+ packuswb xm0, xm0
+ vpblendvb xm0, xm1, xm4
+ movd [rsp+65], xm0
+.w4_no_filter_above:
+ lea r3d, [hq+2]
+ add angled, 973 ; angle + 883
+ shl r3d, 6
+ test r3d, angled
+ jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+ vpbroadcastd xm0, [base+pb_90]
+ psubb xm0, xm7 ; 180 - angle
+ pand xm0, xm8 ; reuse from previous filter_strength call
+ pcmpgtb xm0, xm9
+ pmovmskb r3d, xm0
+.w4_filter_left:
+ test r3d, r3d
+ jz .w4_main
+ popcnt r3d, r3d
+ mov r5d, 10
+ cmp hd, 16
+ movu xm2, [rsp+49]
+ vinserti128 m2, [rsp+43], 1
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ vpbroadcastb m0, xm0
+ pmaxub m0, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0]
+ pshufb m0, m2, m0
+ pmaddubsw m0, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1]
+ pshufb m1, m2, m1
+ pmaddubsw m1, m3
+ vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2]
+ pshufb m2, m4
+ pmaddubsw m2, m3
+ movd xm4, r7m ; max_height
+ pminsw xm4, xm15
+ vpbroadcastb xm4, xm4
+ psubb xm4, [base+pb_16to1]
+ paddw m1, m0
+ paddw m1, m2
+ pmulhrsw m1, m13
+ vextracti128 xm0, m1, 1
+ packuswb xm0, xm1
+ vpblendvb xm0, [rsp+48], xm4
+ mova [rsp+48], xm0
+ jmp .w4_main
+.w4_upsample_left:
+ call .upsample_left
+.w4_main:
+ movd xm0, dxd
+ mova m12, [base+z2_y_shuf_h4]
+ lea r5, [rsp+56] ; left-7
+ vpbroadcastw m0, xm0
+ lea r9, [strideq*3]
+ psraw xm1, xm5, 6
+ pand xm5, xm14 ; frac_y
+ pxor xm2, xm2
+ paddw m7, m0, m0
+ psubw xm4, xm2, xm1 ; base_y
+ vpblendd m0, m7, 0xcc
+ mova xm1, xm7
+ punpcklwd xm4, xm2
+ paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1
+ psubw xm1, xm15, xm5 ; 64-frac_y
+ psllw xm5, 8
+ paddw m7, m7
+ paddw m6, m0
+ por xm5, xm1 ; 64-frac_y, frac_y
+ vpbroadcastq m5, xm5
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ vpbroadcastq m1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vpbroadcastq m2, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movq xm0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movhps xm0, [rsp+r3]
+ vpblendd m1, m2, 0xc0
+ pand m2, m14, m6 ; frac_x
+ vpblendd m0, m1, 0xf0
+ psubw m1, m15, m2 ; 64-frac_x
+ psllw m2, 8
+ pshufb m0, m10
+ por m1, m2 ; 64-frac_x, frac_x
+ pmaddubsw m0, m1
+ cmp r3d, 64
+ jge .w4_toponly
+ mova m1, m7 ; arbitrary negative value
+ vpgatherdq m3, [r5+xm4], m1
+ pshufb m1, m3, m11
+ vpermd m1, m12, m1
+ pmaddubsw m1, m5
+ psraw m2, m6, 15 ; base_x < topleft
+ vpblendvb m0, m1, m2
+.w4_toponly:
+ pmulhrsw m0, m13
+ paddw m6, m7 ; xpos += dx
+ add r5, dyq
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r9 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ sub hd, 4
+ jz .w4_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r8d
+ jge .w4_loop
+.w4_leftonly_loop:
+ mova m1, m7
+ vpgatherdq m2, [r5+xm4], m1
+ add r5, dyq
+ pshufb m0, m2, m11
+ vpermd m0, m12, m0
+ pmaddubsw m0, m5
+ pmulhrsw m0, m13
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r9 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ RET
+.w8:
+ vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6
+ movd xm5, dyd
+ vbroadcasti128 m10, [base+z_filter_s+2]
+ vbroadcasti128 m11, [base+z2_shuf_h4]
+ lea r2d, [dxq+(65<<6)] ; xpos
+ vpbroadcastw xm5, xm5
+ mov r8d, (63-8)<<6
+ mov dyq, -4
+ pmullw xm5, [base+z2_ymul]
+ test angled, 0x400
+ jnz .w8_main
+ lea r3d, [angleq+126]
+ mov r3b, hb
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ vpbroadcastd xm3, [base+pb_8]
+ movhps [rsp+80], xm1
+ call .upsample_above
+ sub angled, 53 ; angle - 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ call .filter_strength
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ sub angled, 90 ; angle - 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w8_no_filter_above
+ popcnt r3d, r3d
+ vpbroadcastd xm3, [base+pb_8]
+ pminub xm3, [base+z_filter_s+8]
+ vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1]
+ pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67
+ pmaddubsw xm0, xm2, xm0
+ pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88
+ shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88
+ pmaddubsw xm2, xm4
+ vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
+ pmaddubsw xm3, xm4
+ movd xm4, r6m ; max_width
+ pminuw xm4, xm15
+ vpbroadcastb xm4, xm4
+ paddw xm0, xm2
+ paddw xm0, xm3
+ pmulhrsw xm0, xm13
+ psubb xm4, [base+pb_1to32]
+ psrldq xm1, 1
+ packuswb xm0, xm0
+ vpblendvb xm0, xm1, xm4
+ movq [rsp+65], xm0
+.w8_no_filter_above:
+ lea r3d, [angleq-51]
+ mov r3b, hb
+ cmp r3d, 8
+ jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+.w8_filter_left:
+ test r3d, r3d
+ jz .w8_main
+ popcnt r3d, r3d
+ vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
+ cmp hd, 32
+ jne .w8_filter_left_h16
+ movu xm2, [rsp+27]
+ vinserti128 m2, [rsp+35], 1
+ vpbroadcastd xm0, [base+pb_5]
+ vbroadcasti128 m3, [base+z_filter_s+ 8]
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ pmaxub m3, m0
+ pshufb m3, m2, m3
+ pmaddubsw m3, m7
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ paddw m3, m1
+ paddw m3, m2
+ pmulhrsw m3, m13
+ jmp .w8_filter_left_top16
+.w8_filter_left_h16:
+ mov r5d, 10
+ cmp hd, 16
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vpbroadcastb m0, xm0
+.w8_filter_left_top16:
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ pmaxub m0, m2
+ movu xm2, [rsp+49]
+ vinserti128 m2, [rsp+43], 1
+ pshufb m0, m2, m0
+ pmaddubsw m0, m7
+ movd xm7, r7m ; max_height
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ pminsw xm7, xm15
+ paddw m1, m0
+ vpbroadcastb m7, xm7
+ paddw m1, m2
+ pmulhrsw m1, m13
+ psubb m7, [base+pb_32to1]
+ packuswb m3, m1
+ vpermq m3, m3, q1320
+ vpblendvb m3, [rsp+32], m7
+ mova [rsp+32], m3
+ jmp .w8_main
+.w8_upsample_left:
+ call .upsample_left
+.w8_main:
+ movd xm3, dxd
+ lea r5, [rsp+56] ; left-7
+ pshufd xm1, xm5, q3120
+ pand xm5, xm14
+ vpbroadcastw m3, xm3
+ pxor xm0, xm0
+ psubw xm2, xm15, xm5
+ psraw xm1, 6
+ lea r9, [strideq*3]
+ paddw m7, m3, m3
+ psubw xm9, xm0, xm1 ; base_y
+ psllw xm5, 8
+ punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5
+ vpblendd m3, m7, 0xf0 ; xpos0 xpos1
+ por xm5, xm2 ; 64-frac_y, frac_y
+ punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7
+ paddw m6, m3
+ vinserti128 m12, m5, xm5, 1
+.w8_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ vinserti128 m0, [rsp+r3], 1
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movu xm1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ vinserti128 m1, [rsp+r3], 1
+ pand m2, m14, m6
+ paddsw m4, m6, m7
+ psubw m5, m15, m2
+ psllw m2, 8
+ pshufb m0, m10
+ por m2, m5
+ pmaddubsw m0, m2
+ pand m2, m14, m4
+ psubw m5, m15, m2
+ psllw m2, 8
+ pshufb m1, m10
+ por m2, m5
+ pmaddubsw m1, m2
+ cmp r3d, 64
+ jge .w8_toponly
+ mova m5, m7
+ vpgatherdq m3, [r5+xm9], m7
+ mova m7, m5
+ vpgatherdq m2, [r5+xm8], m5
+ pshufb m3, m11
+ pshufb m2, m11
+ punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3
+ vpermq m5, m5, q3120 ; y0 y1
+ vpermq m2, m2, q3120 ; y2 y3
+ pmaddubsw m5, m12
+ pmaddubsw m2, m12
+ psraw m6, 15 ; base_x < topleft
+ vpblendvb m0, m5, m6
+ psraw m3, m4, 15
+ vpblendvb m1, m2, m3
+.w8_toponly:
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ paddw m6, m4, m7 ; xpos += dx
+ add r5, dyq
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r9 ], xm1
+ sub hd, 4
+ jz .w8_end
+ lea dstq, [dstq+strideq*4]
+ cmp r2d, r8d
+ jge .w8_loop
+.w8_leftonly_loop:
+ mova m0, m7
+ vpgatherdq m5, [r5+xm9], m7
+ mova m7, m0
+ vpgatherdq m3, [r5+xm8], m0
+ add r5, dyq
+ pshufb m2, m5, m11
+ pshufb m1, m3, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r9 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_leftonly_loop
+.w8_end:
+ RET
+.w16:
+ mov r8d, hd
+ test angled, 0x400
+ jnz .w16_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ call .filter_strength
+ test r3d, r3d
+ jz .w16_no_filter_above
+ popcnt r3d, r3d
+ vbroadcasti128 m6, [tlq+1]
+ mova xm2, [base+z_filter_s]
+ vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de
+ movu xm3, [base+z_filter_s+8]
+ vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff
+ vpblendd m1, m6, 0xf0
+ vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2]
+ pshufb m2, m1, m2
+ pshufb m1, m3
+ pmaddubsw m0, m2, m0
+ shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m4
+ pmaddubsw m1, m5
+ movd xm4, r6m ; max_width
+ pminsw xm4, xm15
+ vpbroadcastb xm4, xm4
+ paddw m0, m2
+ paddw m0, m1
+ pmulhrsw m0, m13
+ psubb xm4, [base+pb_1to32]
+ vextracti128 xm2, m0, 1
+ packuswb xm0, xm2
+ vpblendvb xm0, xm6, xm4
+ movu [rsp+65], xm0
+.w16_no_filter_above:
+ vpbroadcastd m0, [base+pb_90]
+ psubb m0, m7
+ pand m0, m8
+ pcmpgtb m0, m9
+ pmovmskb r3d, m0
+ test r3d, r3d
+ jz .w16_main
+ popcnt r3d, r3d
+ vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0]
+ vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
+ vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
+.w16_filter_left:
+ movd xm6, r7m ; max_height
+ pminsw xm6, xm15
+ vpbroadcastb m6, xm6
+ cmp hd, 32
+ jl .w16_filter_left_h16
+ vpbroadcastd xm0, [base+pb_5]
+ vbroadcasti128 m10, [base+z_filter_s+ 8]
+ vbroadcasti128 m11, [base+z_filter_s+12]
+ vbroadcasti128 m12, [base+z_filter_s+16]
+ je .w16_filter_left_h32
+ movu m3, [tlq-69]
+ movu m5, [tlq-61]
+ pmaxub m1, m10, m0
+ pshufb m1, m3, m1
+ pmaddubsw m1, m7
+ pshufb m2, m3, m11
+ pmaddubsw m2, m8
+ pshufb m3, m12
+ pmaddubsw m3, m9
+ paddw m1, m2
+ pshufb m2, m5, m10
+ pmaddubsw m2, m7
+ pshufb m4, m5, m11
+ pmaddubsw m4, m8
+ pshufb m5, m12
+ pmaddubsw m5, m9
+ paddw m1, m3
+ vpbroadcastd m3, [base+pb_32]
+ paddb m3, [base+pb_32to1]
+ paddw m2, m4
+ paddw m2, m5
+ pmulhrsw m1, m13
+ pmulhrsw m2, m13
+ psubb m3, m6, m3
+ packuswb m1, m2
+ vpblendvb m1, [tlq-64], m3
+ mova [rsp], m1
+ jmp .w16_filter_left_top32
+.w16_filter_left_h32:
+ pmaxub m10, m0
+.w16_filter_left_top32:
+ movu xm2, [tlq-37]
+ vinserti128 m2, [tlq-29], 1
+ pshufb m3, m2, m10
+ pshufb m1, m2, m11
+ pshufb m2, m12
+ pmaddubsw m3, m7
+ pmaddubsw m1, m8
+ pmaddubsw m2, m9
+ paddw m3, m1
+ paddw m3, m2
+ pmulhrsw m3, m13
+ jmp .w16_filter_left_top16
+.w16_filter_left_h16:
+ mov r5d, 10
+ cmp hd, 16
+ cmovs r5d, hd
+ xor r5d, 15 ; h == 16 ? 5 : 15 - h
+ movd xm0, r5d
+ vpbroadcastb m0, xm0
+.w16_filter_left_top16:
+ movu xm2, [tlq-15]
+ vinserti128 m2, [tlq-21], 1
+ vbroadcasti128 m1, [base+z_filter_s+12]
+ vbroadcasti128 m4, [base+z_filter_s+16]
+ vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab
+ vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd
+ vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef
+ pmaxub m0, m5
+ pshufb m0, m2, m0
+ pmaddubsw m0, m7
+ pshufb m1, m2, m1
+ pmaddubsw m1, m8
+ pshufb m2, m4
+ pmaddubsw m2, m9
+ psubb m6, [base+pb_32to1]
+ paddw m1, m0
+ paddw m1, m2
+ pmulhrsw m1, m13
+ packuswb m3, m1
+ vpermq m3, m3, q1320
+ vpblendvb m3, [tlq-32], m6
+ mova [rsp+32], m3
+.w16_main:
+ movd xm1, dyd
+ vbroadcasti128 m10, [base+z_filter_s+2]
+ movd xm7, dxd
+ vbroadcasti128 m11, [base+z2_shuf_h2]
+ vpbroadcastw m1, xm1
+ vpbroadcastw m7, xm7
+ mov r7, dstq
+ pmullw m0, m1, [base+z2_ymul]
+ psllw xm1, 4
+ paddw m6, m7, [base+z2_base_inc]
+ lea r9d, [dxq+(65<<6)] ; xpos
+ movd [rsp+156], xm1
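+ ; Outer loop over 16-pixel-wide column strips (w32/w64 reuse this path):
+ ; the number of extra strips is kept in the upper bits of r8d, h in the
+ ; low byte and restored per strip via movzx.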
+.w16_loop0:
+ mov r2d, r9d
+ mova [rsp+160], m0
+ lea r5, [rsp+60] ; left-3
+ mova [rsp+192], m6
+ pxor m1, m1
+ psraw m2, m0, 6
+ pand m0, m14
+ psubw m9, m1, m2 ; base_y
+ psubw m12, m15, m0
+ punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11
+ psllw m0, 8
+ punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15
+ por m12, m0 ; 64-frac_y, frac_y
+.w16_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movu xm0, [rsp+r2]
+ vinserti128 m0, [rsp+r2+8], 1
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movu xm1, [rsp+r3]
+ vinserti128 m1, [rsp+r3+8], 1
+ pand m2, m14, m6
+ paddsw m5, m6, m7
+ psubw m3, m15, m2
+ psllw m2, 8
+ pshufb m0, m10
+ por m2, m3
+ pmaddubsw m0, m2
+ pand m2, m14, m5
+ psubw m3, m15, m2
+ psllw m2, 8
+ pshufb m1, m10
+ por m2, m3
+ pmaddubsw m1, m2
+ cmp r3d, 64
+ jge .w16_toponly
+ punpckhwd m2, m5, m5 ; mask out unnecessary loads
+ vpgatherdd m4, [r5+m9], m2
+ punpcklwd m2, m5, m5
+ vpgatherdd m3, [r5+m8], m2
+ pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1
+ pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1
+ punpcklqdq m2, m3, m4 ; y0
+ punpckhqdq m3, m4 ; y1
+ pmaddubsw m2, m12
+ pmaddubsw m3, m12
+ psraw m6, 15 ; base_x < topleft
+ vpblendvb m0, m2, m6
+ psraw m6, m5, 15
+ vpblendvb m1, m3, m6
+.w16_toponly:
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ paddw m6, m5, m7 ; xpos += dx
+ sub r5, 2
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ sub hd, 2
+ jz .w16_end
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, (63-16)<<6
+ jge .w16_loop
+.w16_leftonly_loop:
+ mova m0, m7
+ vpgatherdd m4, [r5+m9], m7
+ mova m7, m0
+ vpgatherdd m3, [r5+m8], m0
+ sub r5, 2
+ pshufb m2, m4, m11
+ pshufb m1, m3, m11
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pmaddubsw m0, m12
+ pmaddubsw m1, m12
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_leftonly_loop
+.w16_end:
+ sub r8d, 1<<8
+ jl .w16_ret
+ vpbroadcastd m0, [rsp+156]
+ paddw m0, [rsp+160] ; base_y += 16*dy
+ paddw m6, m13, [rsp+192]
+ add r7, 16
+ add r9d, 16<<6
+ movzx hd, r8b
+ mov dstq, r7
+ paddw m6, m13 ; base_x += 16*64
+ jmp .w16_loop0
+.w16_ret:
+ RET
+.w32:
+ mova m2, [tlq+32]
+ lea r8d, [hq+(1<<8)]
+ mova [rsp+96], m2
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastd m7, [base+z_filter_k+4*2+12*0]
+ vpbroadcastd m8, [base+z_filter_k+4*2+12*1]
+ vpbroadcastd m9, [base+z_filter_k+4*2+12*2]
+ mova xm5, [base+z_filter_s]
+ vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc
+ vinserti128 m1, [tlq+11], 1
+ movu xm6, [base+z_filter_s+12]
+ vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff
+ movu xm3, [tlq+ 6]
+ vinserti128 m3, [tlq+17], 1
+ movd xm0, r6m ; max_width
+ pminsw xm0, xm15
+ vpbroadcastb m10, xm0
+.w32_filter_above:
+ pshufb m0, m1, m5
+ shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de
+ pmaddubsw m0, m7
+ pshufb m2, m1, m4
+ shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m8
+ pshufb m1, m5
+ pmaddubsw m1, m9
+ paddw m0, m2
+ paddw m0, m1
+ pshufb m1, m3, m4
+ pmaddubsw m1, m7
+ pshufb m2, m3, m5
+ pmaddubsw m2, m8
+ pshufb m3, m6
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m1, m3
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ psubb m10, [base+pb_1to32]
+ packuswb m0, m1
+ vpblendvb m0, [tlq+1], m10
+ movu [rsp+65], m0
+ jmp .w16_filter_left
+.w64:
+ mova m2, [tlq+32]
+ mov r3d, [tlq+64]
+ lea r8d, [hq+(3<<8)]
+ mova [rsp+ 96], m2
+ mov [rsp+128], r3d
+ test angled, 0x400
+ jnz .w16_main
+ vpbroadcastd m7, [base+z_filter_k+4*2+12*0]
+ vpbroadcastd m8, [base+z_filter_k+4*2+12*1]
+ vpbroadcastd m9, [base+z_filter_k+4*2+12*2]
+ movu xm6, [base+z_filter_s+ 4]
+ vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc
+ movu xm3, [tlq+30]
+ vinserti128 m3, [tlq+43], 1
+ movu xm5, [base+z_filter_s+16]
+ vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff
+ pshufb m0, m3, m6
+ shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de
+ pmaddubsw m0, m7
+ pshufb m2, m3, m4
+ shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff
+ pmaddubsw m2, m8
+ pshufb m3, m6
+ pmaddubsw m3, m9
+ paddw m0, m2
+ paddw m0, m3
+ movu xm2, [tlq+36]
+ vinserti128 m2, [tlq+49], 1
+ pshufb m4, m2, m4
+ pmaddubsw m4, m7
+ pshufb m3, m2, m6
+ pmaddubsw m3, m8
+ pshufb m2, m5
+ pmaddubsw m2, m9
+ movd xm5, r6m ; max_width
+ pminsw xm5, xm15
+ vpbroadcastb m10, xm5
+ paddw m3, m4
+ paddw m2, m3
+ vpbroadcastd m3, [base+pb_32]
+ pmulhrsw m0, m13
+ pmulhrsw m2, m13
+ mova xm5, [base+z_filter_s]
+ vinserti128 m5, [base+z_filter_s+6], 1
+ psubb m3, m10, m3
+ psubb m3, [base+pb_1to32]
+ vinserti128 m1, [tlq+13], 1
+ packuswb m0, m2
+ vpblendvb m0, [tlq+33], m3
+ movu xm3, [tlq+ 6]
+ vinserti128 m3, [tlq+19], 1
+ movu [rsp+97], m0
+ jmp .w32_filter_above
+
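+; ipred_z3: directional prediction for 180 < angle < 270, using only the left
+; edge. The prediction is computed column-wise and transposed into the
+; destination, mirroring the top-edge-only z1 code above.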
+cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z3_avx2_table]
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ lea r7, [dr_intra_derivative+45*2-1]
+ dec tlq
+ movsxd hq, [r6+hq*4]
+ sub angled, 180
+ add hq, r6
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ movzx dyd, word [r7+dyq]
+ vpbroadcastd m3, [pw_512]
+ vpbroadcastd m4, [pw_62]
+ vpbroadcastd m5, [pw_64]
+ mov org_wd, wd
+ jmp hq
+.h4:
+ lea r7, [strideq*3]
+ cmp angleb, 40
+ jae .h4_no_upsample
+ lea r4d, [angleq-1024]
+ sar r4d, 7
+ add r4d, wd
+ jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+ ALLOC_STACK -32, 9
+ movu xm8, [tlq-7]
+ pshufb xm0, xm8, [z_upsample1-4]
+ vpbroadcastb xm2, xm8
+ pshufb xm1, xm8, [z_filter_s+2]
+ mova [rsp+16], xm2 ; top[max_base_y]
+ vpbroadcastd xm2, [pb_36_m4]
+ add dyd, dyd
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ movd xm7, dyd
+ mov r2d, dyd
+ vpbroadcastw m7, xm7
+ paddw xm1, xm0
+ pmulhrsw xm1, xm3
+ pslldq m6, m7, 8
+ paddw xm2, xm7, xm7
+ paddw m6, m7
+ packuswb xm1, xm1
+ paddw m6, m2
+ punpcklbw xm1, xm8
+ mova xm8, [z_transpose4]
+ psllw m7, 2
+ pshufb xm1, [pb_15to0]
+ mova [rsp], xm1
+.h4_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ vpbroadcastq m1, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ vpbroadcastq m2, [rsp+r4]
+ lea r4d, [r2+dyq]
+ shr r2d, 6
+ movq xm0, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6
+ movhps xm0, [rsp+r4]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2
+ psllw m2, 8
+ por m1, m2
+ pmaddubsw m0, m1
+ paddw m6, m7
+ pmulhrsw m0, m3
+ vextracti128 xm1, m0, 1
+ packuswb xm1, xm0
+ pshufb xm1, xm8
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r7 ], xm1, 3
+ add dstq, 4
+ sub wd, 4
+ jg .h4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+ lea r4, [z_filter_t0]
+ movd xm0, maxbased
+ movd xm2, angled
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0
+ vpbroadcastb m2, xm2
+ pcmpeqb m1, m0, [base+z_filter_wh]
+ pand m1, m2
+ mova xm2, [r4+angleq*8]
+ pcmpgtb m1, m2
+ pmovmskb r5d, m1
+ ret
+.h4_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -16, 12
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea maxbased, [wq+3]
+ call .filter_strength
+ mov maxbased, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m7, [base+pb_7]
+ vbroadcasti128 m2, [tlq-14]
+ pmaxub m1, m7, [base+z_filter_s-4]
+ vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
+ pmaxub m7, [base+z_filter_s+4]
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
+ pshufb m0, m2, m1
+ shufps m1, m7, q2121
+ pmaddubsw m0, m8
+ pshufb m1, m2, m1
+ pmaddubsw m1, m9
+ pshufb m2, m7
+ pmaddubsw m2, m10
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, m3
+ mov r4d, 9
+ lea tlq, [rsp+15]
+ cmp wd, 4
+ cmovne maxbased, r4d
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [rsp], xm0
+.h4_main:
+ movd xm6, dyd
+ vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+ mov r4, tlq
+ sub tlq, 4
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63] ; ypos
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf_w4]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psrlw m7, 8 ; top[max_base_y]
+ paddw m10, m6, m6
+ psubw m9, m0 ; max_base_y
+ vpblendd m6, m10, 0xcc
+ mova xm0, xm10
+ paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1
+ paddw m10, m10
+ mova xm11, [z_transpose4]
+.h4_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base0
+ vpbroadcastq m1, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base1
+ vpbroadcastq m2, [tlq+r5]
+ lea r5, [r4+dyq]
+ sar r4, 6 ; base2
+ movq xm0, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6 ; base3
+ movhps xm0, [tlq+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac
+ vpblendd m0, m1, 0xf0
+ psubw m1, m5, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m6 ; base < max_base_y
+ pmulhrsw m0, m3
+ paddw m6, m10 ; ypos += dy
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ packuswb xm1, xm0
+ pshufb xm1, xm11 ; transpose
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r7 ], xm1, 3
+ sub wd, 4
+ jz .h4_end
+ add dstq, 4
+ cmp r4d, maxbased
+ jg .h4_loop
+ packuswb xm7, xm7
+.h4_end_loop:
+ movd [dstq+strideq*0], xm7
+ movd [dstq+strideq*1], xm7
+ movd [dstq+strideq*2], xm7
+ movd [dstq+r7 ], xm7
+ add dstq, 4
+ sub wd, 4
+ jg .h4_end_loop
+.h4_end:
+ RET
+ALIGN function_align
+.h8:
+ lea r4d, [angleq+216]
+ mov r4b, wb
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 8
+ and r4d, 4
+ mova xm0, [tlq-15]
+ vinserti128 m0, [tlq- 9], 1
+ movd xm1, r4d
+ movu xm2, [z_filter_s+2]
+ vinserti128 m2, [z_filter_s+6], 1
+ vpbroadcastb xm1, xm1 ; w & 4
+ vpbroadcastd m7, [pb_36_m4]
+ pmaxub xm1, [z_upsample1-4] ; clip 4x8
+ vinserti128 m1, [z_upsample1], 1
+ add dyd, dyd
+ pshufb m1, m0, m1
+ pshufb m2, m0, m2
+ vinserti128 m0, [tlq-7], 1
+ movd xm6, dyd
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ vpbroadcastw m6, xm6
+ mov r2d, dyd
+ lea r5, [strideq*3]
+ paddw m7, m6, m6
+ paddw m1, m2
+ vpblendd m6, m7, 0xf0
+ pmulhrsw m1, m3
+ pslldq m2, m7, 8
+ paddw m7, m7
+ paddw m6, m2
+ vbroadcasti128 m2, [pb_15to0]
+ packuswb m1, m1
+ punpcklbw m1, m0
+ pshufb m1, m2
+ vextracti128 [rsp+ 0], m1, 1
+ mova [rsp+16], xm1
+.h8_upsample_loop:
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base0
+ movu xm0, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base1
+ vinserti128 m0, [rsp+r4], 1
+ lea r4d, [r2+dyq]
+ shr r2d, 6 ; base2
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ punpcklqdq m1, m2, m2 ; frac0 frac1
+ pmaddubsw m0, m1
+ movu xm1, [rsp+r2]
+ lea r2d, [r4+dyq]
+ shr r4d, 6 ; base3
+ vinserti128 m1, [rsp+r4], 1
+ punpckhqdq m2, m2 ; frac2 frac3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ paddw m6, m7
+ pmulhrsw m1, m3
+ lea r4, [dstq+strideq*4]
+ psllw m1, 8
+ por m0, m1
+ vextracti128 xm1, m0, 1
+ punpcklbw xm2, xm0, xm1
+ punpckhbw xm0, xm1
+ movd [dstq+strideq*0], xm2
+ pextrd [dstq+strideq*1], xm2, 1
+ pextrd [dstq+strideq*2], xm2, 2
+ pextrd [dstq+r5 ], xm2, 3
+ movd [r4 +strideq*0], xm0
+ pextrd [r4 +strideq*1], xm0, 1
+ pextrd [r4 +strideq*2], xm0, 2
+ pextrd [r4 +r5 ], xm0, 3
+ add dstq, 4
+ sub wd, 4
+ jg .h8_upsample_loop
+ RET
+.h8_no_intra_edge_filter:
+ and maxbased, 7
+ or maxbased, 8 ; imin(w+7, 15)
+ jmp .h8_main
+.h8_no_upsample:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 10
+ lea maxbased, [wq+7]
+ test angled, 0x400
+ jnz .h8_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd xm6, [base+pb_15]
+ pcmpeqb xm1, xm1
+ psubusb xm6, xm0
+ psubb xm6, xm1 ; w == 4 ? 5 : 1
+ movu xm2, [tlq-16]
+ pmaxub xm1, xm6, [base+z_filter_s]
+ vinserti128 m2, [tlq-14], 1
+ vinserti128 m1, [base+z_filter_s+12], 1
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
+ pmaxub xm6, [base+z_filter_s+ 8]
+ vinserti128 m6, [base+z_filter_s+20], 1
+ pshufb m0, m2, m1
+ pmaddubsw m0, m7
+ vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
+ movzx r4d, byte [tlq-15]
+ shufps m1, m6, q2121
+ pshufb m1, m2, m1
+ pmaddubsw m1, m7
+ paddw m0, m1
+ sub r5d, 3
+ jnz .h8_3tap
+ vpbroadcastd m7, [z_filter_k+4*8]
+ movzx r2d, byte [tlq-14]
+ pshufb m2, m6
+ pmaddubsw m2, m7
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+15], r2b
+ paddw m0, m2
+.h8_3tap:
+ pmulhrsw m0, m3
+ sar r5d, 1
+ lea tlq, [rsp+31]
+ add r5d, 17
+ cmp wd, 16
+ cmovns maxbased, r5d
+ neg r5
+ mov [tlq+r5], r4b
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [tlq-15], xm0
+.h8_main:
+ movd xm2, dyd
+ vbroadcasti128 m0, [z_base_inc]
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m2, xm2
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psrlw m7, 8
+ psubw m9, m0
+ paddw m6, m2, m2
+ vpblendd m2, m6, 0x0f
+.h8_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6
+ pand m0, m4, m2
+ psubw m1, m5, m0
+ psllw m0, 8
+ por m1, m0
+ vbroadcasti128 m0, [tlq+r4]
+ lea r4, [r5+dyq]
+ sar r5, 6
+ vinserti128 m0, [tlq+r5], 0
+ sub rsp, 8*2
+ pshufb m0, m8
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m2
+ paddw m2, m6
+ pmulhrsw m0, m3
+ vpblendvb m0, m7, m0, m1
+ vextracti128 xm1, m0, 1
+ psllw xm0, 8
+ por xm0, xm1 ; interleave rows (partial transpose)
+ mova [rsp], xm0
+ sub wd, 2
+ jz .h8_transpose
+ cmp r4d, maxbased
+ jg .h8_loop
+ packuswb xm0, xm7, xm7
+.h8_end_loop:
+ sub rsp, 8*2
+ mova [rsp], xm0
+ sub wd, 2
+ jg .h8_end_loop
+.h8_transpose:
+ mova xm2, [rsp+16*1]
+ sub org_wd, 8
+ lea r2, [strideq*3]
+ lea r6, [dstq+org_wq]
+ cmovns dstq, r6
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+ lea r6, [dstq+strideq*4]
+ jge .h8_w8
+ add rsp, 16*2
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+r2 ], xm1, 3
+ movd [r6 +strideq*0], xm2
+ pextrd [r6 +strideq*1], xm2, 1
+ pextrd [r6 +strideq*2], xm2, 2
+ pextrd [r6 +r2 ], xm2, 3
+ jmp .h8_end
+.h8_w8_loop:
+ mova xm0, [rsp+16*0]
+ mova xm2, [rsp+16*1]
+ punpcklwd xm1, xm2, xm0
+ punpckhwd xm2, xm0
+.h8_w8: ; w8/w16/w32
+ mova xm0, [rsp+16*2]
+ mova xm4, [rsp+16*3]
+ add rsp, 16*4
+ punpcklwd xm3, xm4, xm0
+ punpckhwd xm4, xm0
+ punpckldq xm0, xm3, xm1
+ punpckhdq xm3, xm1
+ punpckldq xm1, xm4, xm2
+ punpckhdq xm4, xm2
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+r2 ], xm3
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ movq [r6 +strideq*2], xm4
+ movhps [r6 +r2 ], xm4
+ sub dstq, 8
+ sub r6, 8
+ sub org_wd, 8
+ jge .h8_w8_loop
+.h8_end:
+ RET
+.h16_no_intra_edge_filter:
+ and maxbased, 15
+ or maxbased, 16 ; imin(w+15, 31)
+ jmp .h16_main
+ALIGN function_align
+.h16:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 12
+ lea maxbased, [wq+15]
+ test angled, 0x400
+ jnz .h16_no_intra_edge_filter
+ call .filter_strength
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ popcnt r5d, r5d
+ vpbroadcastd m11, [base+pb_27]
+ vpbroadcastd m1, [base+pb_1]
+ vbroadcasti128 m6, [base+z_filter_s+12]
+ vinserti128 m2, m6, [base+z_filter_s+4], 0
+ vinserti128 m6, [base+z_filter_s+20], 1
+ movu xm10, [tlq-18]
+ vinserti128 m10, [tlq-14], 1
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
+ vbroadcasti128 m7, [base+z_filter_s+8]
+ vinserti128 m8, m7, [base+z_filter_s+0], 0
+ vinserti128 m7, [base+z_filter_s+16], 1
+ psubusb m11, m0
+ por m1, m11
+ movu xm11, [tlq-32]
+ vinserti128 m11, [tlq-28], 1
+ pmaxub m8, m1
+ pmaxub m7, m1
+ pshufb m0, m10, m2
+ shufps m2, m6, q2121
+ pmaddubsw m0, m9
+ pshufb m1, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m1, m9
+ vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
+ movzx r4d, byte [tlq-31]
+ pshufb m2, m10, m2
+ pmaddubsw m2, m9
+ pshufb m8, m11, m8
+ pmaddubsw m8, m9
+ paddw m0, m2
+ paddw m1, m8
+ sub r5d, 3
+ jnz .h16_3tap
+ vpbroadcastd m9, [z_filter_k+4*8]
+ movzx r2d, byte [tlq-30]
+ pshufb m10, m6
+ pmaddubsw m10, m9
+ pshufb m11, m7
+ pmaddubsw m11, m9
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4]
+ shr r2d, 3
+ mov [rsp+31], r2b
+ paddw m0, m10
+ paddw m1, m11
+.h16_3tap:
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ sar r5d, 1
+ lea tlq, [rsp+63]
+ add r5d, 33
+ cmp wd, 32
+ cmovns maxbased, r5d
+ neg r5
+ mov [tlq+r5], r4b
+ packuswb m0, m1
+ vpermq m0, m0, q2031
+ mova [tlq-31], m0
+.h16_main:
+ movd xm6, dyd
+ vbroadcasti128 m0, [z_base_inc]
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psubw m9, m0
+ paddw m11, m6, m6
+ psubw m10, m9, m3 ; 64*8
+ vpblendd m6, m11, 0xf0
+.h16_loop:
+ lea r5, [r4+dyq]
+ sar r4, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r4-0]
+ movu xm1, [tlq+r4-8]
+ lea r4, [r5+dyq]
+ sar r5, 6
+ vinserti128 m0, [tlq+r5-0], 1
+ vinserti128 m1, [tlq+r5-8], 1
+ sub rsp, 32
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ vpermq m0, m0, q3120
+ mova [rsp], m0
+ sub wd, 2
+ jz .h16_transpose
+ cmp r4d, maxbased
+ jg .h16_loop
+ mova m0, m7
+.h16_end_loop:
+ sub rsp, 32
+ mova [rsp], m7
+ sub wd, 2
+ jg .h16_end_loop
+.h16_transpose:
+ mova m2, [rsp+32*1]
+ sub org_wd, 8
+ lea r2, [strideq*3]
+ lea r6, [dstq+org_wq]
+ cmovns dstq, r6
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ lea r3, [strideq*5]
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ lea r4, [strideq+r2*2] ; stride*7
+ jge .h16_w8
+ add rsp, 32*2
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ vextracti128 xm0, m0, 1
+ movd [dstq+strideq*4], xm1
+ pextrd [dstq+r3 ], xm1, 1
+ pextrd [dstq+r2*2 ], xm1, 2
+ pextrd [dstq+r4 ], xm1, 3
+ lea dstq, [dstq+strideq*8]
+ vextracti128 xm1, m1, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ movd [dstq+strideq*4], xm1
+ pextrd [dstq+r3 ], xm1, 1
+ pextrd [dstq+r2*2 ], xm1, 2
+ pextrd [dstq+r4 ], xm1, 3
+ jmp .h16_end
+.h16_w8_loop:
+ mova m0, [rsp+32*0]
+ mova m2, [rsp+32*1]
+ punpcklbw m1, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+.h16_w8:
+ mova m2, [rsp+32*2]
+ mova m4, [rsp+32*3]
+ lea r6, [dstq+strideq*8]
+ add rsp, 32*4
+ punpcklbw m3, m4, m2
+ punpckhbw m4, m2
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ punpckldq m4, m2, m0
+ punpckhdq m2, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ movq [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm4
+ vextracti128 xm4, m4, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+r2 ], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*4], xm0
+ movhps [dstq+r3 ], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+r2*2 ], xm3
+ movhps [dstq+r4 ], xm3
+ vextracti128 xm3, m3, 1
+ movq [r6+strideq*0], xm4
+ movhps [r6+strideq*1], xm4
+ movq [r6+strideq*2], xm2
+ movhps [r6+r2 ], xm2
+ movq [r6+strideq*4], xm0
+ movhps [r6+r3 ], xm0
+ movq [r6+r2*2 ], xm3
+ movhps [r6+r4 ], xm3
+ sub dstq, 8
+ sub org_wd, 8
+ jge .h16_w8_loop
+.h16_end:
+ RET
+ALIGN function_align
+.h32:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 15
+ lea maxbased, [wq+31]
+ and maxbased, 31
+ or maxbased, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ vbroadcasti128 m0, [pb_0to15]
+ mov r4d, 21
+ mov r5d, 3
+ movu xm11, [tlq-66] ; 56-63
+ vinserti128 m11, [tlq-52], 1 ; 40-47
+ sub r4d, wd ; 21-w
+ cmovns r5d, r4d
+ movu xm12, [tlq-58] ; 48-55
+ vinserti128 m12, [tlq-44], 1 ; 32-39
+ sub r4d, 8 ; 13-w
+ movd xm1, r5d
+ movu xm13, [tlq-34] ; 24-31
+ vinserti128 m13, [tlq-20], 1 ; 8-15
+ movd xm2, r4d
+ vpbroadcastb m1, xm1
+ movu xm14, [tlq-28] ; 16-23
+ vinserti128 m14, [tlq-14], 1 ; 0- 7
+ vpbroadcastb m2, xm2
+ pmaxsb m1, m0 ; clip 16x32 and (32|64)x32
+ movu m7, [z_filter_s+4]
+ pshufb m11, m1
+ vinserti128 m8, m7, [z_filter_s+8], 1
+ vinserti128 m7, [z_filter_s+16], 0
+ pmaxsb m2, m0 ; clip 8x32
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m12, m2
+ pshufb m0, m11, m8
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ shufps m8, m7, q1021
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m10, m11, m8
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m8
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m8
+ pmaddubsw m10, m9
+ shufps m8, m7, q2121
+ paddw m1, m10
+ pshufb m10, m14, m8
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ movzx r4d, byte [tlq-63]
+ movzx r2d, byte [tlq-62]
+ paddw m0, m11
+ paddw m2, m12
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m7
+ pmaddubsw m14, m9
+ paddw m1, m13
+ paddw m6, m14
+ sub r2d, r4d
+ lea r2d, [r2+r4*8+4] ; edge case for 64x32
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ shr r2d, 3
+ mov [rsp+31], r2b
+ lea tlq, [rsp+95]
+ mov [tlq-65], r4b
+ mov r4d, 65
+ cmp wd, 64
+ cmove maxbased, r4d
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-63], m0
+ mova [tlq-31], m1
+.h32_main:
+ movd xm6, dyd
+ mov r4, tlq
+ sub tlq, 8
+ neg dyq
+ vpbroadcastw m6, xm6
+ sub r4, maxbaseq
+ shl maxbased, 6
+ vpbroadcastb m7, [r4]
+ lea r4, [dyq+63]
+ movd xm9, maxbased
+ not maxbased
+ vbroadcasti128 m8, [z3_shuf]
+ add maxbased, 64
+ vpbroadcastw m9, xm9
+ psubw m9, [z_base_inc]
+ mova m11, m6
+ psubw m10, m9, m3 ; 64*8
+.h32_loop:
+ mov r5, r4
+ sar r5, 6
+ pand m1, m4, m6
+ psubw m2, m5, m1
+ psllw m1, 8
+ por m2, m1
+ movu xm0, [tlq+r5- 0]
+ vinserti128 m0, [tlq+r5-16], 1
+ movu xm1, [tlq+r5- 8]
+ vinserti128 m1, [tlq+r5-24], 1
+ sub rsp, 32
+ add r4, dyq
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ pcmpgtw m1, m9, m6
+ pcmpgtw m2, m10, m6
+ packsswb m1, m2
+ paddw m6, m11
+ vpblendvb m0, m7, m0, m1
+ mova [rsp], m0
+ dec wd
+ jz .h32_transpose
+ cmp r4d, maxbased
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 32
+ mova [rsp], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ lea dstq, [dstq+org_wq-8]
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ lea r4, [strideq+r2*2] ; stride*7
+.h32_w8_loop:
+ mova m7, [rsp+32*0]
+ mova m6, [rsp+32*1]
+ mova m5, [rsp+32*2]
+ mova m4, [rsp+32*3]
+ mova m3, [rsp+32*4]
+ mova m2, [rsp+32*5]
+ mova m1, [rsp+32*6]
+ mova m0, [rsp+32*7]
+ lea r6, [dstq+strideq*8]
+ add rsp, 32*8
+ punpcklbw m8, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklwd m7, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpckldq m6, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m3
+ punpckhdq m8, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ movq [dstq+strideq*0], xm6
+ movhps [dstq+strideq*1], xm6
+ vextracti128 xm6, m6, 1
+ movq [dstq+strideq*2], xm7
+ movhps [dstq+r2 ], xm7
+ vextracti128 xm7, m7, 1
+ movq [dstq+strideq*4], xm2
+ movhps [dstq+r3 ], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+r2*2 ], xm8
+ movhps [dstq+r4 ], xm8
+ vextracti128 xm8, m8, 1
+ movq [r6+strideq*0], xm3
+ movhps [r6+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [r6+strideq*2], xm1
+ movhps [r6+r2 ], xm1
+ vextracti128 xm1, m1, 1
+ movq [r6+strideq*4], xm5
+ movhps [r6+r3 ], xm5
+ vextracti128 xm5, m5, 1
+ movq [r6+r2*2 ], xm0
+ movhps [r6+r4 ], xm0
+ lea r6, [r6+strideq*8]
+ vextracti128 xm0, m0, 1
+ movq [r6+strideq*0], xm6
+ movhps [r6+strideq*1], xm6
+ movq [r6+strideq*2], xm7
+ movhps [r6+r2 ], xm7
+ movq [r6+strideq*4], xm2
+ movhps [r6+r3 ], xm2
+ movq [r6+r2*2 ], xm8
+ movhps [r6+r4 ], xm8
+ lea r6, [r6+strideq*8]
+ movq [r6+strideq*0], xm3
+ movhps [r6+strideq*1], xm3
+ movq [r6+strideq*2], xm1
+ movhps [r6+r2 ], xm1
+ movq [r6+strideq*4], xm5
+ movhps [r6+r3 ], xm5
+ movq [r6+r2*2 ], xm0
+ movhps [r6+r4 ], xm0
+ sub dstq, 8
+ sub org_wd, 8
+ jg .h32_w8_loop
+ RET
+ALIGN function_align
+.h64:
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -128, 16
+ lea maxbased, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ mov r4d, 21
+ vpbroadcastb xm11, [tlq-127]
+ vpblendd xm11, [tlq-130], 0x0e ; 120-127
+ sub r4d, wd ; 21-w
+ mov r5d, 3
+ vinserti128 m11, [tlq-116], 1 ; 104-111
+ movu m7, [z_filter_s+4]
+ cmp wd, 32
+ cmove r4d, r5d
+ vinserti128 m8, m7, [z_filter_s+8], 1
+ vbroadcasti128 m6, [pb_0to15]
+ movd xm1, r4d
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ movu xm12, [tlq-122] ; 112-119
+ vinserti128 m12, [tlq-108], 1 ; 96-103
+ vpbroadcastb m1, xm1
+ movu xm13, [tlq- 98] ; 88- 95
+ vinserti128 m13, [tlq- 84], 1 ; 72- 79
+ movu xm14, [tlq- 90] ; 80- 87
+ vinserti128 m14, [tlq- 76], 1 ; 64- 71
+ vinserti128 m7, [z_filter_s+16], 0
+ pshufb m0, m11, m8
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pmaxsb m1, m6 ; clip (16|32)x64
+ pshufb m13, m1
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ shufps m15, m8, m7, q1021
+ pshufb m10, m11, m15
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m15
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2]
+ shufps m10, m8, m7, q2132
+ pshufb m11, m10
+ pmaddubsw m11, m9
+ pshufb m12, m10
+ pmaddubsw m12, m9
+ pshufb m13, m10
+ pmaddubsw m13, m9
+ pshufb m14, m10
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ movu xm11, [tlq-66] ; 56-63
+ vinserti128 m11, [tlq-52], 1 ; 40-47
+ movu xm12, [tlq-58] ; 48-55
+ vinserti128 m12, [tlq-44], 1 ; 32-39
+ movu xm13, [tlq-34] ; 24-31
+ vinserti128 m13, [tlq-20], 1 ; 8-15
+ movu xm14, [tlq-28] ; 16-23
+ vinserti128 m14, [tlq-14], 1 ; 0- 7
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ lea tlq, [rsp+127]
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-127], m0
+ mova [tlq- 95], m1
+ pshufb m0, m11, m10
+ pmaddubsw m0, m9
+ pshufb m2, m12, m10
+ pmaddubsw m2, m9
+ pshufb m1, m13, m10
+ pmaddubsw m1, m9
+ pshufb m6, m14, m7
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1]
+ pshufb m7, m11, m15
+ pmaddubsw m7, m9
+ paddw m0, m7
+ pshufb m7, m12, m15
+ pmaddubsw m7, m9
+ paddw m2, m7
+ pshufb m7, m13, m15
+ pmaddubsw m7, m9
+ paddw m1, m7
+ pshufb m7, m14, m10
+ pmaddubsw m7, m9
+ paddw m6, m7
+ vpbroadcastd m9, [z_filter_k+4*2+12*0]
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m15
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq-63], m0
+ mova [tlq-31], m1
+.h64_main:
+ movd xm12, dyd
+ neg maxbaseq
+ vbroadcasti128 m8, [z3_shuf]
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6
+ vpbroadcastw m12, xm12
+ lea r5d, [dyq+maxbaseq-64]
+ neg dyq
+ or maxbased, 63
+ lea r4, [dyq+63]
+ movd xm6, r5d
+ mova xm10, [pb_1to32+16]
+ vinserti128 m10, [pb_1to32], 1
+ vpbroadcastd m11, [pb_32]
+ vpbroadcastw m6, xm6
+.h64_loop:
+ mov r5, r4
+ sar r5, 6
+ movu m0, [tlq+r5-24]
+ movu m1, [tlq+r5-32]
+ pand m2, m4, m6
+ psubw m9, m5, m2
+ psllw m2, 8
+ por m9, m2
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ psraw m2, m6, 6
+ sub rsp, 64
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packsswb m2, m2
+ paddb m2, m10
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [rsp+32], m0
+ movu m0, [tlq+r5-56]
+ movu m1, [tlq+r5-64]
+ add r4, dyq
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ paddb m2, m11
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m6, m12
+ packuswb m0, m1
+ vpblendvb m0, m7, m0, m2
+ mova [rsp], m0
+ dec wd
+ jz .h64_transpose
+ cmp r4d, maxbased
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 64
+ mova [rsp+32], m7
+ mova [rsp+ 0], m7
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ lea r2, [strideq*3]
+ lea r3, [strideq*5]
+ imul r5, strideq, -8
+ lea dstq, [dstq+org_wq-16]
+ lea r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0:
+ lea r6, [rsp+16*3]
+.h64_transpose_loop:
+ mova xm0, [r6+64*15]
+ vinserti128 m0, [r6+64* 7], 1
+ mova xm1, [r6+64*14]
+ vinserti128 m1, [r6+64* 6], 1
+ mova xm2, [r6+64*13]
+ vinserti128 m2, [r6+64* 5], 1
+ mova xm3, [r6+64*12]
+ vinserti128 m3, [r6+64* 4], 1
+ mova xm4, [r6+64*11]
+ vinserti128 m4, [r6+64* 3], 1
+ mova xm5, [r6+64*10]
+ vinserti128 m5, [r6+64* 2], 1
+ mova xm6, [r6+64* 9]
+ vinserti128 m6, [r6+64* 1], 1
+ mova xm7, [r6+64* 8]
+ vinserti128 m7, [r6+64* 0], 1
+ sub r6, 16
+ punpcklbw m8, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklwd m7, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpckldq m6, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m3
+ punpckhdq m8, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ vpermq m6, m6, q3120
+ vpermq m7, m7, q3120
+ vpermq m2, m2, q3120
+ vpermq m8, m8, q3120
+ vpermq m3, m3, q3120
+ vpermq m1, m1, q3120
+ vpermq m5, m5, q3120
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm6
+ vextracti128 [dstq+strideq*1], m6, 1
+ mova [dstq+strideq*2], xm7
+ vextracti128 [dstq+r2 ], m7, 1
+ mova [dstq+strideq*4], xm2
+ vextracti128 [dstq+r3 ], m2, 1
+ mova [dstq+r2*2 ], xm8
+ vextracti128 [dstq+r4 ], m8, 1
+ sub dstq, r5
+ mova [dstq+strideq*0], xm3
+ vextracti128 [dstq+strideq*1], m3, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r2 ], m1, 1
+ mova [dstq+strideq*4], xm5
+ vextracti128 [dstq+r3 ], m5, 1
+ mova [dstq+r2*2 ], xm0
+ vextracti128 [dstq+r4 ], m0, 1
+ sub dstq, r5
+ cmp r6, rsp
+ jae .h64_transpose_loop
+ add rsp, 64*16
+ lea dstq, [dstq+r5*8-16]
+ sub org_wd, 16
+ jg .h64_transpose_loop0
+.h64_end:
+ RET
+
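+; Editor's note (derived from the code below): FILTER_XMM/FILTER_YMM apply the
+; 7-tap filter-intra kernel. The shuffled source supplies the (p0 p1), (p2 p3),
+; (p4 p5), (p6 __) pixel pairs, registers 2-5 hold the matching taps, register
+; 1 holds pw_8, and the result is shifted right by 4 and packed to bytes.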
+%macro FILTER_XMM 4 ; dst, src, tmp, shuf
+%ifnum %4
+ pshufb xm%2, xm%4
+%else
+ pshufb xm%2, %4
+%endif
+ pshufd xm%1, xm%2, q0000 ; p0 p1
+ pmaddubsw xm%1, xm2
+ pshufd xm%3, xm%2, q1111 ; p2 p3
+ pmaddubsw xm%3, xm3
+ paddw xm%1, xm1
+ paddw xm%1, xm%3
+ pshufd xm%3, xm%2, q2222 ; p4 p5
+ pmaddubsw xm%3, xm4
+ paddw xm%1, xm%3
+ pshufd xm%3, xm%2, q3333 ; p6 __
+ pmaddubsw xm%3, xm5
+ paddw xm%1, xm%3
+ psraw xm%1, 4
+ packuswb xm%1, xm%1
+%endmacro
+
+%macro FILTER_YMM 4 ; dst, src, tmp, shuf
+ pshufb m%2, m%4
+ pshufd m%1, m%2, q0000
+ pmaddubsw m%1, m2
+ pshufd m%3, m%2, q1111
+ pmaddubsw m%3, m3
+ paddw m%1, m1
+ paddw m%1, m%3
+ pshufd m%3, m%2, q2222
+ pmaddubsw m%3, m4
+ paddw m%1, m%3
+ pshufd m%3, m%2, q3333
+ pmaddubsw m%3, m5
+ paddw m%1, m%3
+ psraw m%1, 4
+ vperm2i128 m%3, m%1, m%1, 0x01
+ packuswb m%1, m%3
+%endmacro
+
+; The ipred_filter SIMD code processes 4x2 blocks in the following order,
+; which increases parallelism compared to doing things row by row. One
+; redundant block is calculated for w8 and w16, two for w32.
+; w4 w8 w16 w32
+; 1 1 2 1 2 3 5 1 2 3 5 b c d f
+; 2 2 3 2 4 5 7 2 4 5 7 c e f h
+; 3 3 4 4 6 7 9 4 6 7 9 e g h j
+; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
+; 5 8 8 i
+
+cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
+%define base r6-ipred_filter_avx2_table
+ lea r6, [filter_intra_taps]
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ add filterq, r6
+ lea r6, [ipred_filter_avx2_table]
+ movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m1, [base+pw_8]
+ vbroadcasti128 m2, [filterq+16*0]
+ vbroadcasti128 m3, [filterq+16*1]
+ vbroadcasti128 m4, [filterq+16*2]
+ vbroadcasti128 m5, [filterq+16*3]
+ add wq, r6
+ mov hd, hm
+ jmp wq
+.w4:
+ WIN64_SPILL_XMM 9
+ mova xm8, [base+filter_shuf2]
+ sub tlq, 3
+ sub tlq, hq
+ jmp .w4_loop_start
+.w4_loop:
+ pinsrd xm0, xm6, [tlq+hq], 0
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER_XMM 6, 0, 7, 8
+ movd [dstq+strideq*0], xm6
+ pextrd [dstq+strideq*1], xm6, 1
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 10
+ mova m8, [base+filter_shuf1]
+ FILTER_XMM 7, 0, 6, [base+filter_shuf2]
+ vpbroadcastd m0, [tlq+4]
+ vpbroadcastd m6, [tlq+5]
+ sub tlq, 4
+ sub tlq, hq
+ vpbroadcastq m7, xm7
+ vpblendd m7, m6, 0x20
+.w8_loop:
+ vpbroadcastd xm6, [tlq+hq]
+ palignr m6, m0, 12
+ vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm6, xm7
+ call .main
+ vpblendd xm6, xm7, 0x0c
+ pshufd xm6, xm6, q3120
+ movq [dstq+strideq*0], xm6
+ movhps [dstq+strideq*1], xm6
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign xmm_regs_used 15
+ %assign stack_size_padded 0x98
+ SUB rsp, stack_size_padded
+%endif
+ sub hd, 2
+ TAIL_CALL .w16_main, 0
+.w16_main:
+%if WIN64
+ movaps [rsp+0xa8], xmm6
+ movaps [rsp+0xb8], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+%endif
+ FILTER_XMM 12, 0, 7, [base+filter_shuf2]
+ vpbroadcastd m0, [tlq+5]
+ vpblendd m0, [tlq-12], 0x14
+ mova m8, [base+filter_shuf1]
+ vpbroadcastq m7, xm12
+ vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ call .main ; c0 d0 a1 b1 a1 b1 c0 d0
+ movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ vinserti128 m14, m8, [base+filter_shuf3], 0
+ vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
+ FILTER_XMM 6, 9, 10, 14
+ vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
+ vpbroadcastd m9, [tlq+13]
+ vpbroadcastd m10, [tlq+12]
+ psrld m11, m8, 4
+ vpblendd m6, m9, 0x20 ; top
+ sub tlq, 6
+ sub tlq, hq
+.w16_loop:
+ vpbroadcastd xm9, [tlq+hq]
+ palignr m9, m0, 12
+ vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm13, xm7
+ call .main ; e0 f0 c1 d1 c1 d1 e0 f0
+ vpblendd m9, m12, m10, 0xf0
+ vpblendd m12, m6, 0xc0
+ pshufd m9, m9, q3333
+ vpblendd m9, m6, 0xee
+ vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
+ vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
+ vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
+ mova [dstq+strideq*0], xm9
+ vextracti128 [dstq+strideq*1], m9, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+ pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
+ vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
+ shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+ shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm6
+ ret
+ALIGN function_align
+.w32:
+ sub rsp, stack_size_padded
+ sub hd, 2
+ lea r3, [dstq+16]
+ lea r5d, [hq-2]
+ call .w16_main
+ add tlq, r5
+ mov dstq, r3
+ lea r3, [strideq-4]
+ lea r4, [r3+strideq*2]
+ movq xm0, [tlq+21]
+ pinsrd xm0, [dstq-4], 2
+ pinsrd xm0, [dstq+r3*1], 3
+ FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0
+ movq xm7, [dstq+r3*2]
+ pinsrd xm7, [dstq+r4], 2
+ palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
+ vpbroadcastd m0, [tlq+28]
+ vpbroadcastd m9, [tlq+29]
+ vbroadcasti128 m8, [base+filter_shuf1+16]
+ vpblendd m0, m9, 0x20
+ vpblendd m0, m7, 0x0f
+ vpbroadcastq m7, xm12
+ vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ call .main ; c0 d0 a1 b1 a1 b1 c0 d0
+ add r3, 2
+ lea r4, [r4+strideq*2]
+ movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
+ FILTER_XMM 6, 9, 10, 14
+ vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
+ vpbroadcastd m9, [tlq+37]
+ vpbroadcastd m10, [tlq+36]
+ vpblendd m6, m9, 0x20 ; top
+.w32_loop:
+ movq xm9, [dstq+r3*4]
+ pinsrd xm9, [dstq+r4], 2
+.w32_loop_last:
+ palignr m9, m0, 12
+ vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ mova xm13, xm7 ; c0 d0
+ call .main ; e0 f0 c1 d1 c1 d1 e0 f0
+ vpblendd m9, m12, m10, 0xf0
+ vpblendd m12, m6, 0xc0
+ pshufd m9, m9, q3333
+ vpblendd m9, m6, 0xee
+ vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
+ vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
+ vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
+ mova [dstq+strideq*0], xm9
+ vextracti128 [dstq+strideq*1], m9, 1
+ lea dstq, [dstq+strideq*2]
+ sub r5d, 2
+ jg .w32_loop
+ jz .w32_loop_last
+ vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
+ pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
+ vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
+ shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
+ shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm6
+ RET
+ALIGN function_align
+.main:
+ FILTER_YMM 7, 0, 9, 8
+ ret
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
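+; m0 = dc, m1 = alpha, m2 = |alpha| << 9 (set up by the callers); the macro
+; computes dc + apply_sign((|alpha * ac| + 32) >> 6, alpha * ac), using
+; pmulhrsw with |alpha| << 9 as a rounded shift by 6.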
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
+
+cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ lea t0, [ipred_cfl_left_avx2_table]
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ shrx r6d, r6d, wd
+ movd xm3, r6d
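+ ; xm3 = 0x8000 >> log2(w): pmulhrsw factor yielding a rounded sum/w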
+ movsxd r6, [t0+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov t0d, 0x8000
+ shrx t0d, t0d, r6d
+ movd xm3, t0d
+ lea t0, [ipred_cfl_left_avx2_table]
+ movsxd r6, [t0+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+.h4:
+ pmaddwd xm0, xm2
+ pmulhrsw xm0, xm3
+ vpbroadcastw m0, xm0
+ jmp wq
+
+cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd xm4, t0d
+ tzcnt t0d, t0d
+ movd xm5, t0d
+ lea t0, [ipred_cfl_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+4*4]
+ pcmpeqd m3, m3
+ psrlw xm4, 1
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h4:
+ movd xm0, [tlq-4]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w4:
+ movd xm1, [tlq+1]
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
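+ ; 0x5556 ~ 65536/3, 0x3334 ~ 65536/5: completes the divide by w+h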
+ paddw xm0, xm1
+ shrx r6d, r6d, r2d
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ movd xm1, r6d
+ psrlw xm0, 2
+ pmulhuw xm0, xm1
+.w4_end:
+ vpbroadcastw m0, xm0
+.s4:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ IPRED_CFL 4
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+r6 ], xm5, 1
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ movq xm0, [tlq-8]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w8:
+ movq xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ punpckhqdq xm2, xm0, xm0
+ paddw xm0, xm2
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w8_end:
+ vpbroadcastw m0, xm0
+.s8:
+ vpbroadcastw m1, alpham
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+ movhps [dstq+strideq*2], xm4
+ movhps [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova xm0, [tlq-16]
+ pmaddubsw xm0, xm3
+ jmp wq
+.w16:
+ movu xm1, [tlq+1]
+ vextracti128 xm2, m0, 1
+ pmaddubsw xm1, xm3
+ psubw xm0, xm4
+ paddw xm0, xm2
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vpermq m4, m4, q3120
+ mova [dstq+strideq*0], xm4
+ vextracti128 [dstq+strideq*1], m4, 1
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ psubw xm0, xm4
+ paddw xm0, xm1
+ punpckhqdq xm1, xm0, xm0
+ paddw xm0, xm1
+ psrlq xm1, xm0, 32
+ paddw xm0, xm1
+ pmaddwd xm0, xm3
+ psrlw xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+.w32_end:
+ vpbroadcastw m0, xm0
+.s32:
+ vpbroadcastw m1, alpham
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+32]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ vpermq m4, m4, q3120
+ mova [dstq], m4
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ lea t0, [ipred_cfl_splat_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [t0+wq*4]
+ vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp wq
+
+cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ mov ac_bakq, acq
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ vpbroadcastd m2, [pb_2]
+ pxor m4, m4
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq xm0, [yq]
+ movq xm1, [yq+strideq]
+ movhps xm0, [yq+strideq*2]
+ movhps xm1, [yq+stride3q]
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ paddw xm0, xm1
+ mova [acq], xm0
+ paddw xm4, xm0
+ lea yq, [yq+strideq*4]
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ vpermq m0, m0, q1111
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova xm0, [yq]
+ mova xm1, [yq+strideq]
+ vinserti128 m0, [yq+strideq*2], 1
+ vinserti128 m1, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w8_hpad
+.w8_wpad:
+ vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+ movq xm0, [yq]
+ movq xm1, [yq+strideq]
+ vinserti128 m0, [yq+strideq*2], 1
+ vinserti128 m1, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufb m0, m3
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w8_hpad:
+ vpermq m0, m0, q3232
+.w8_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad_loop
+ jmp .calc_avg
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad_loop
+.w16_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_420_avx2_table]
+ shl wpadd, 2
+ mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+ ipred_cfl_ac_420_avx2_table+wpadq*8-32]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w16_pad3:
+ vpbroadcastq m0, [yq]
+ vpbroadcastq m1, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad2:
+ vbroadcasti128 m0, [yq]
+ vbroadcasti128 m1, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ ; fall-through
+.w16_wpad_end:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufb m0, m3
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jz .w16_wpad_done
+ jmp iptrq
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ dec hpadd
+ jg .w16_hpad_loop
+ ; fall-through
+
+.calc_avg:
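+ ; average = (sum + sz/2) >> log2(sz), subtracted from every ac entry below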
+ vpbroadcastd m2, [pw_1]
+ pmaddwd m0, m4, m2
+ vextracti128 xm1, m0, 1
+ tzcnt r1d, szd
+ paddd xm0, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
+cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ mov ac_bakq, acq
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ vpbroadcastd m2, [pb_4]
+ pxor m4, m4
+ pxor m5, m5
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq xm1, [yq]
+ movhps xm1, [yq+strideq]
+ movq xm0, [yq+strideq*2]
+ movhps xm0, [yq+stride3q]
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm2
+ mova [acq], xm1
+ mova [acq+16], xm0
+ paddw xm4, xm0
+ paddw xm5, xm1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ vpermq m0, m0, q1111
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova xm1, [yq]
+ vinserti128 m1, [yq+strideq], 1
+ mova xm0, [yq+strideq*2]
+ vinserti128 m0, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w8_hpad
+.w8_wpad:
+ vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
+.w8_wpad_loop:
+ movq xm1, [yq]
+ vinserti128 m1, [yq+strideq], 1
+ movq xm0, [yq+strideq*2]
+ vinserti128 m0, [yq+stride3q], 1
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pshufb m0, m3
+ pshufb m1, m3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w8_hpad:
+ vpermq m0, m0, q3232
+.w8_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad_loop
+ jmp .calc_avg
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad_loop
+.w16_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_422_avx2_table]
+ shl wpadd, 2
+ mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
+ ipred_cfl_ac_422_avx2_table+wpadq*8-32]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w16_pad3:
+ vpbroadcastq m1, [yq]
+ vpbroadcastq m0, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad2:
+ vbroadcasti128 m1, [yq]
+ vbroadcasti128 m0, [yq+strideq]
+ jmp .w16_wpad_end
+.w16_pad1:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ ; fall-through
+.w16_wpad_end:
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pshufb m0, m3
+ pshufb m1, m3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jz .w16_wpad_done
+ jmp iptrq
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddw m4, m0
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ ; fall-through
+
+.calc_avg:
+ vpbroadcastd m2, [pw_1]
+ pmaddwd m5, m5, m2
+ pmaddwd m0, m4, m2
+ paddd m0, m5
+ vextracti128 xm1, m0, 1
+ tzcnt r1d, szd
+ paddd xm0, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
+cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
+ movifnidn hpadd, hpadm
+ movifnidn wd, wm
+ mov hd, hm
+ mov szd, wd
+ imul szd, hd
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ vpbroadcastd m5, [pw_1]
+ tzcnt r8d, wd
+ lea r5, [ipred_cfl_ac_444_avx2_table]
+ movsxd r8, [r5+r8*4+12]
+ add r5, r8
+
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
+ mov ac_bakq, acq
+ jmp r5
+
+.w4:
+ lea stride3q, [strideq*3]
+ pxor xm2, xm2
+.w4_loop:
+ movd xm1, [yq]
+ movd xm0, [yq+strideq*2]
+ pinsrd xm1, [yq+strideq], 1
+ pinsrd xm0, [yq+stride3q], 1
+ punpcklbw xm1, xm2
+ punpcklbw xm0, xm2
+ psllw xm1, 3
+ psllw xm0, 3
+ mova [acq], xm1
+ mova [acq+16], xm0
+ paddw xm1, xm0
+ paddw xm4, xm1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_mul
+ pshufd xm0, xm0, q3232
+ paddw xm1, xm0, xm0
+.w4_hpad_loop:
+ mova [acq], xm0
+ mova [acq+16], xm0
+ paddw xm4, xm1
+ add acq, 32
+ sub hpadd, 4
+ jg .w4_hpad_loop
+ jmp .calc_avg_mul
+
+.w8:
+ lea stride3q, [strideq*3]
+ pxor m2, m2
+.w8_loop:
+ movq xm1, [yq]
+ movq xm0, [yq+strideq*2]
+ vinserti128 m1, [yq+strideq], 1
+ vinserti128 m0, [yq+stride3q], 1
+ punpcklbw m1, m2
+ punpcklbw m0, m2
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ paddw m4, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_mul
+ vpermq m0, m0, q3232
+ paddw m1, m0, m0
+.w8_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddw m4, m1
+ add acq, 64
+ sub hpadd, 4
+ jg .w8_hpad_loop
+ jmp .calc_avg_mul
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ pmovzxbw m1, [yq]
+ pmovzxbw m0, [yq+strideq]
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ pmaddwd m1, m5
+ paddd m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w16_hpad
+.w16_wpad:
+ mova m3, [cfl_ac_444_w16_pad1_shuffle]
+.w16_wpad_loop:
+ vpbroadcastq m1, [yq]
+ vpbroadcastq m0, [yq+strideq]
+ pshufb m1, m3
+ pshufb m0, m3
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m1, m0
+ pmaddwd m1, m5
+ paddd m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_wpad_loop
+ test hpadd, hpadd
+ jz .calc_avg
+.w16_hpad:
+ paddw m1, m0, m0
+ pmaddwd m1, m5
+.w16_hpad_loop:
+ mova [acq], m0
+ mova [acq+32], m0
+ paddd m4, m1
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ jmp .calc_avg
+
+.w32:
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ pmovzxbw m1, [yq]
+ pmovzxbw m0, [yq+16]
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m2, m1, m0
+ pmaddwd m2, m5
+ paddd m4, m2
+ add yq, strideq
+ add acq, 64
+ dec hd
+ jg .w32_loop
+ test hpadd, hpadd
+ jz .calc_avg
+ jmp .w32_hpad_loop
+.w32_wpad:
+ DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
+ lea iptrq, [ipred_cfl_ac_444_avx2_table]
+ add wpadd, wpadd
+ mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
+ movsxd wpadq, [iptrq+wpadq+4]
+ add iptrq, wpadq
+ jmp iptrq
+.w32_pad3:
+ vpbroadcastq m1, [yq]
+ pshufb m1, m3
+ vpermq m0, m1, q3232
+ jmp .w32_wpad_end
+.w32_pad2:
+ pmovzxbw m1, [yq]
+ pshufhw m0, m1, q3333
+ vpermq m0, m0, q3333
+ jmp .w32_wpad_end
+.w32_pad1:
+ pmovzxbw m1, [yq]
+ vpbroadcastq m0, [yq+16]
+ pshufb m0, m3
+ ; fall-through
+.w32_wpad_end:
+ psllw m1, 3
+ psllw m0, 3
+ mova [acq], m1
+ mova [acq+32], m0
+ paddw m2, m1, m0
+ pmaddwd m2, m5
+ paddd m4, m2
+ add yq, strideq
+ add acq, 64
+ dec hd
+ jz .w32_wpad_done
+ jmp iptrq
+.w32_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg
+.w32_hpad_loop:
+ mova [acq], m1
+ mova [acq+32], m0
+ paddd m4, m2
+ add acq, 64
+ dec hpadd
+ jg .w32_hpad_loop
+ jmp .calc_avg
+
+.calc_avg_mul:
+ pmaddwd m4, m5
+.calc_avg:
+ vextracti128 xm1, m4, 1
+ tzcnt r1d, szd
+ paddd xm0, xm4, xm1
+ movd xm2, r1d
+ movd xm3, szd
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+ psrad xm3, 1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm3
+ paddd xm0, xm1
+ psrad xm0, xm2
+ vpbroadcastw m0, xm0
+.sub_loop:
+ mova m1, [ac_bakq]
+ psubw m1, m0
+ mova [ac_bakq], m1
+ add ac_bakq, 32
+ sub szd, 16
+ jg .sub_loop
+ RET
+
+cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ vbroadcasti128 m4, [palq]
+ lea r2, [pal_pred_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ packuswb m4, m4
+ add wq, r2
+ lea r2, [strideq*3]
+ jmp wq
+.w4:
+ pshufb xm0, xm4, [idxq]
+ add idxq, 16
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ pshufb xm0, xm4, [idxq+16*0]
+ pshufb xm1, xm4, [idxq+16*1]
+ add idxq, 16*2
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ pshufb m0, m4, [idxq+32*0]
+ pshufb m1, m4, [idxq+32*1]
+ add idxq, 32*2
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r2 ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ pshufb m0, m4, [idxq+32*0]
+ pshufb m1, m4, [idxq+32*1]
+ pshufb m2, m4, [idxq+32*2]
+ pshufb m3, m4, [idxq+32*3]
+ add idxq, 32*4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r2 ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+ALIGN function_align
+.w64:
+ pshufb m0, m4, [idxq+32*0]
+ pshufb m1, m4, [idxq+32*1]
+ pshufb m2, m4, [idxq+32*2]
+ pshufb m3, m4, [idxq+32*3]
+ add idxq, 32*4
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64
+ RET
+
+%endif
diff --git a/third_party/dav1d/src/x86/ipred_avx512.asm b/third_party/dav1d/src/x86/ipred_avx512.asm
new file mode 100644
index 0000000000..38c86b54f5
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_avx512.asm
@@ -0,0 +1,1432 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
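+; each weight w is stored as the signed byte pair (w-128, 127-w) for use
+; with pmaddubsw on interleaved (top, bottom) or (left, right) pixel pairs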
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __
+filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10
+ db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6
+ db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0
+ db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0
+ db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
+ db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16
+ db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0
+ db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0
+ db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8
+ db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4
+ db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0
+ db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0
+ db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8
+ db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4
+ db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0
+ db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0
+ db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14
+ db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12
+ db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0
+ db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0
+filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31
+ db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131
+ db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147
+ db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163
+filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31
+smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9
+ db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13
+ db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11
+ db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15
+smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
+ db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
+smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
+ db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95
+ db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111
+ db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
+ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
+ db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+
+pb_127_m127: times 2 db 127, -127
+pb_128: times 4 db 128
+pw_128: times 2 dw 128
+pw_255: times 2 dw 255
+
+%define pb_1 (ipred_h_shuf+24)
+%define pb_2 (ipred_h_shuf+20)
+%define pb_3 (ipred_h_shuf+16)
+%define pd_8 (filter_taps+128)
+
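+; each table entry is the 32-bit offset of a .label from the table base
+; (the table minus 2*4); callers movsxd the entry and add the base back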
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4)
+
+JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
+JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h
+ lea r5, [ipred_dc_left_8bpc_avx512icl_table]
+ movd xm0, wm
+ tzcnt wd, wm
+ inc tlq
+ movifnidn hd, hm
+ movu ym1, [tlq]
+ movd xmm3, wd
+ movsxd r6, [r5+wq*4]
+ vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ vpdpbusd ym0, ym1, ym2
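+ ; VNNI dot-product against pb_1 sums the top row into dwords
+ ; (xm0 already holds the w/2 rounding bias)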
+ add r6, r5
+ add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_left_8bpc_avx512icl_table]
+ mov hd, hm
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movd xm0, hm
+ movu ym1, [tlq]
+ movd xmm3, r6d
+ movsxd r6, [r5+r6*4]
+ vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ vpdpbusd ym0, ym1, ym2
+ add r6, r5
+ add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu ym1, [tlq+32] ; unaligned when jumping here from dc_top
+ vpdpbusd ym0, ym1, ym2
+.h32:
+ vextracti32x4 xm1, ym0, 1
+ paddd xm0, xm1
+.h16:
+ punpckhqdq xm1, xm0, xm0
+ paddd xm0, xm1
+.h8:
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+.h4:
+ vpsrlvd xm0, xmm3
+ lea stride3q, [strideq*3]
+ vpbroadcastb m0, xm0
+ jmp wq
+
+cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm0, r5d
+ tzcnt r5d, r5d
+ movd xmm4, r5d
+ lea r5, [ipred_dc_8bpc_avx512icl_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1]
+ psrld xm0, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd xmm1, [tlq-4]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w4:
+ movd xmm1, [tlq+1]
+ vpdpbusd xm0, xmm1, xm3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xmm0, xm0, 3
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq xmm1, xm0, xm0
+ lea r2d, [hq*2]
+ mov r6d, 0x55563334
+ paddd xmm1, xm0
+ shrx r6d, r6d, r2d
+ psrlq xmm0, xmm1, 32
+ paddd xmm0, xmm1
+ movd xmm1, r6d
+ psrld xmm0, 2
+ pmulhuw xmm0, xmm1
+.w4_end:
+ vpbroadcastb xm0, xmm0
+.s4:
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm0
+ movd [dstq+strideq*2], xm0
+ movd [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+.h8:
+ movq xmm1, [tlq-8]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w8:
+ movq xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+ paddd xmm2, xm2, xm0
+ punpckhqdq xmm0, xmm2, xmm2
+ paddd xmm0, xmm2
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmove r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w8_end:
+ vpbroadcastb xm0, xmm0
+.s8:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+.h16:
+ mova xmm1, [tlq-16]
+ vpdpbusd xm0, xmm1, xm3
+ jmp wq
+.w16:
+ movu xmm1, [tlq+1]
+ vextracti32x4 xm2, ym0, 1
+ vpdpbusd xm0, xmm1, xm3
+ paddd xmm2, xm2, xm0
+ punpckhqdq xmm0, xmm2, xmm2
+ paddd xmm0, xmm2
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w16_end:
+ vpbroadcastb xm0, xmm0
+.s16:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+.h32:
+ mova ym1, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ jmp wq
+.w32:
+ movu ym1, [tlq+1]
+ vpdpbusd ym0, ym1, ym3
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
+ punpckhqdq xmm0, xmm1, xmm1
+ paddd xmm0, xmm1
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x33345556
+ shrx r6d, r6d, r2d
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w32_end:
+ vpbroadcastb ym0, xmm0
+.s32:
+ mova [dstq+strideq*0], ym0
+ mova [dstq+strideq*1], ym0
+ mova [dstq+strideq*2], ym0
+ mova [dstq+stride3q ], ym0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+.h64:
+ mova ym1, [tlq-64]
+ mova ym2, [tlq-32]
+ vpdpbusd ym0, ym1, ym3
+ vpdpbusd ym0, ym2, ym3
+ jmp wq
+.w64:
+ movu ym1, [tlq+ 1]
+ movu ym2, [tlq+33]
+ vpdpbusd ym0, ym1, ym3
+ vpdpbusd ym0, ym2, ym3
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
+ punpckhqdq xmm0, xmm1, xmm1
+ paddd xmm0, xmm1
+ psrlq xmm1, xmm0, 32
+ paddd xmm0, xmm1
+ vpsrlvd xmm0, xmm4
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x33345556
+ shrx r6d, r6d, hd
+ movd xmm1, r6d
+ pmulhuw xmm0, xmm1
+.w64_end:
+ vpbroadcastb m0, xmm0
+.s64:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_dc_splat_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ movu m0, [tlq+1]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
+%define base r6-ipred_h_8bpc_avx512icl_table
+ lea r6, [ipred_h_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea stride3q, [strideq*3]
+ sub tlq, hq
+ add wq, r6
+ jmp wq
+.w4:
+ mova xmm1, [base+ipred_h_shuf+16]
+.w4_loop:
+ movd xmm0, [tlq+hq-4]
+ pshufb xmm0, xmm1
+ movd [dstq+strideq*0], xmm0
+ pextrd [dstq+strideq*1], xmm0, 1
+ pextrd [dstq+strideq*2], xmm0, 2
+ pextrd [dstq+stride3q ], xmm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8:
+ movsldup xmm2, [base+ipred_h_shuf+16]
+ movshdup xmm3, [base+ipred_h_shuf+16]
+.w8_loop:
+ movd xmm1, [tlq+hq-4]
+ pshufb xmm0, xmm1, xmm2
+ pshufb xmm1, xmm3
+ movq [dstq+strideq*0], xmm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm0
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m1, [base+smooth_shuf]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ pshufb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ vpbroadcastd ym3, [base+pb_1]
+ vpord m2, m3, [base+pb_2] {1to16}
+.w32_loop:
+ vpbroadcastd m1, [tlq+hq-4]
+ pshufb m0, m1, m2
+ pshufb m1, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64:
+ vpbroadcastd m4, [base+pb_3]
+ vpbroadcastd m5, [base+pb_2]
+ vpbroadcastd m6, [base+pb_1]
+ pxor m7, m7
+.w64_loop:
+ vpbroadcastd m3, [tlq+hq-4]
+ pshufb m0, m3, m4
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64_loop
+ RET
+
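+; inputs: m4 = left, m5 = topleft, m6 = top, m7 = |top - topleft| (ldiff),
+; m8 = pb_1; the selected Paeth prediction is returned in m0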
+%macro PAETH 0
+ psubusb m1, m5, m4
+ psubusb m0, m4, m5
+ por m1, m0 ; tdiff
+ pavgb m2, m6, m4
+ vpcmpub k1, m1, m7, 1 ; tdiff < ldiff
+ vpblendmb m0{k1}, m4, m6
+ vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8
+ psubusb m3, m5, m2
+ psubb m2, m4
+ psubusb m2, m5
+ por m2, m3
+ pminub m1, m7
+ paddusb m2, m2
+ por m2, m4 ; min(tldiff, 255)
+ vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff
+ vmovdqu8 m0{k1}, m5
+%endmacro
+
+cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3
+ lea r6, [ipred_paeth_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ vpbroadcastb m5, [tlq] ; topleft
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1]
+ lea topq, [tlq+1]
+ sub tlq, hq
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+INIT_YMM avx512icl
+.w4:
+ vpbroadcastd m6, [topq]
+ mova m9, [ipred_h_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ vpbroadcastq m4, [tlq+hq-8]
+ pshufb m4, m9 ; left
+ PAETH
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
+ sub hd, 8
+ jl .w4_ret
+ vextracti32x4 xm0, m0, 1
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.w4_ret:
+ RET
+INIT_ZMM avx512icl
+.w8:
+ vpbroadcastq m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ vpbroadcastq m4, [tlq+hq-8]
+ pshufb m4, m9
+ PAETH
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ sub hd, 8
+ jl .w8_ret
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ jg .w8_loop
+.w8_ret:
+ RET
+.w16:
+ vbroadcasti32x4 m6, [topq]
+ movsldup m9, [smooth_shuf]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ vpbroadcastd m4, [tlq+hq-4]
+ pshufb m4, m9
+ PAETH
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m6, [topq]
+ mova ym9, ym8
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w32_loop:
+ vpbroadcastd m4, [tlq+hq-2]
+ pshufb m4, m9
+ PAETH
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ movu m6, [topq]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w64_loop:
+ vpbroadcastb m4, [tlq+hq-1]
+ PAETH
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
+%define base r6-ipred_smooth_v_8bpc_avx512icl_table
+ lea r6, [ipred_smooth_v_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m0, [base+pb_127_m127]
+ vpbroadcastd m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ vpbroadcastb m4, [tlq+hq] ; bottom
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastd m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+ punpcklbw m2, m4 ; top, bottom
+ pmaddubsw m3, m2, m0
+ paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; 128 * top + 129 * bottom + 128
+.w4_loop:
+ vbroadcasti32x4 m0, [weightsq+hq*2]
+ pshufb m0, m5
+ pmaddubsw m0, m2, m0
+ paddw m0, m3
+ vpermb m0, m6, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ add hq, 8
+ jg .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jl .w4_loop
+.ret:
+ RET
+.w8:
+ vpbroadcastq m2, [tlq+1]
+ movshdup m5, [smooth_shuf]
+ mova ym6, [smooth_endA]
+ punpcklbw m2, m4
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1
+.w8_loop:
+ vpbroadcastq m0, [weightsq+hq*2]
+ pshufb m0, m5
+ pmaddubsw m0, m2, m0
+ paddw m0, m3
+ vpermb m0, m6, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w16_loop:
+ vpbroadcastq m1, [weightsq+hq*2]
+ pshufb m1, m6
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m7, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m3, [tlq+1]
+ movshdup m6, [smooth_shuf]
+ mova m7, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w32_loop:
+ vpbroadcastd m1, [weightsq+hq*2]
+ pshufb m1, m6
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m7, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ movu m3, [tlq+1]
+ mova m6, [smooth_endB]
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+.w64_loop:
+ vpbroadcastw m1, [weightsq+hq*2]
+ pmaddubsw m0, m2, m1
+ pmaddubsw m1, m3, m1
+ paddw m0, m4
+ paddw m1, m5
+ vpermt2b m0, m6, m1
+ mova [dstq], m0
+ add dstq, strideq
+ inc hq
+ jl .w64_loop
+ RET
+
+cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
+%define base r5-ipred_smooth_h_8bpc_avx512icl_table
+ lea r5, [ipred_smooth_h_8bpc_avx512icl_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ vpbroadcastb m4, [tlq+r6] ; right
+ mov hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m5, [base+pb_127_m127]
+ vpbroadcastd m6, [base+pw_128]
+ sub tlq, hq
+ add wq, r5
+ vpmovb2m k1, m6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movsldup m3, [smooth_shuf]
+ vpbroadcastq m7, [smooth_weights+4*2]
+ mova ym8, [smooth_endA]
+.w4_loop:
+ vpbroadcastq m0, [tlq+hq-8]
+ mova m2, m4
+ vpshufb m2{k1}, m0, m3 ; left, right
+ pmaddubsw m0, m2, m5
+ pmaddubsw m1, m2, m7
+ paddw m2, m6
+ paddw m0, m2
+ paddw m0, m1
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ sub hd, 8
+ jl .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.ret:
+ RET
+.w8:
+ movsldup m3, [smooth_shuf]
+ vbroadcasti32x4 m7, [smooth_weights+8*2]
+ mova ym8, [smooth_endA]
+.w8_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ mova m2, m4
+ vpshufb m2{k1}, m0, m3
+ pmaddubsw m0, m2, m5
+ pmaddubsw m1, m2, m7
+ paddw m2, m6
+ paddw m0, m2
+ paddw m0, m1
+ vpermb m0, m8, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ movsldup m7, [smooth_shuf]
+ vbroadcasti32x4 m8, [smooth_weights+16*2]
+ vbroadcasti32x4 m9, [smooth_weights+16*3]
+ mova m10, [smooth_endB]
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ mova m3, m4
+ vpshufb m3{k1}, m0, m7
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m8
+ pmaddubsw m1, m3, m9
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m10, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ mova m10, [smooth_endA]
+ vpbroadcastd ym7, [pb_1]
+ vbroadcasti32x8 m8, [smooth_weights+32*2]
+ vbroadcasti32x8 m9, [smooth_weights+32*3]
+ vshufi32x4 m10, m10, q3120
+.w32_loop:
+ vpbroadcastd m0, [tlq+hq-2]
+ mova m3, m4
+ vpshufb m3{k1}, m0, m7
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m8
+ pmaddubsw m1, m3, m9
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m10, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ mova m7, [smooth_weights+64*2]
+ mova m8, [smooth_weights+64*3]
+ mova m9, [smooth_endA]
+.w64_loop:
+ mova m3, m4
+ vpbroadcastb m3{k1}, [tlq+hq-1]
+ pmaddubsw m2, m3, m5
+ pmaddubsw m0, m3, m7
+ pmaddubsw m1, m3, m8
+ paddw m3, m6
+ paddw m2, m3
+ paddw m0, m2
+ paddw m1, m2
+ vpermt2b m0, m9, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
+%define base r5-ipred_smooth_8bpc_avx512icl_table
+ lea r5, [ipred_smooth_8bpc_avx512icl_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+ vpbroadcastb m6, [tlq+r6] ; right
+ sub tlq, hq
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m7, [base+pb_127_m127]
+ vpbroadcastb m0, [tlq] ; bottom
+ vpbroadcastd m1, [base+pw_255]
+ add wq, r5
+ lea v_weightsq, [base+smooth_weights+hq*2]
+ vpmovb2m k1, m1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vpbroadcastd m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+ vpbroadcastq m9, [smooth_weights+4*2]
+ mova ym11, [smooth_endA]
+
+ punpcklbw m8, m0 ; top, bottom
+ pmaddubsw m10, m8, m7
+ paddw m1, m8 ; 1 * top + 256 * bottom + 255
+ paddw m10, m1 ; 128 * top + 129 * bottom + 255
+.w4_loop:
+ vpbroadcastq m1, [tlq+hq-8]
+ vbroadcasti32x4 m0, [v_weightsq]
+ add v_weightsq, 16
+ mova m2, m6
+ vpshufb m2{k1}, m1, m4 ; left, right
+ pmaddubsw m1, m2, m7 ; 127 * left - 127 * right
+ pshufb m0, m5
+ pmaddubsw m0, m8, m0
+ paddw m1, m2 ; 128 * left + 129 * right
+ pmaddubsw m2, m9
+ paddw m0, m10
+ paddw m1, m2
+ pavgw m0, m1
+ vpermb m0, m11, m0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq+strideq*0], xm0
+ movd [dstq+strideq*1], xm1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm1, 2
+ sub hd, 8
+ jl .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 1
+ pextrd [dstq+strideq*1], xm1, 1
+ pextrd [dstq+strideq*2], xm0, 3
+ pextrd [dstq+stride3q ], xm1, 3
+ lea dstq, [dstq+strideq*4]
+ jg .w4_loop
+.ret:
+ RET
+.w8:
+ vpbroadcastq m8, [tlq+hq+1]
+ movsldup m4, [smooth_shuf]
+ movshdup m5, [smooth_shuf]
+ vbroadcasti32x4 m9, [smooth_weights+8*2]
+ mova ym11, [smooth_endA]
+ punpcklbw m8, m0
+ pmaddubsw m10, m8, m7
+ paddw m1, m8
+ paddw m10, m1
+.w8_loop:
+ vpbroadcastd m1, [tlq+hq-4]
+ vpbroadcastq m0, [v_weightsq]
+ add v_weightsq, 8
+ mova m2, m6
+ vpshufb m2{k1}, m1, m4
+ pmaddubsw m1, m2, m7
+ pshufb m0, m5
+ pmaddubsw m0, m8, m0
+ paddw m1, m2
+ pmaddubsw m2, m9
+ paddw m0, m10
+ paddw m1, m2
+ pavgw m0, m1
+ vpermb m0, m11, m0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m9, [tlq+hq+1]
+ movsldup m5, [smooth_shuf]
+ movshdup m10, [smooth_shuf]
+ vbroadcasti32x4 m11, [smooth_weights+16*2]
+ vbroadcasti32x4 m12, [smooth_weights+16*3]
+ mova m15, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m13, m8, m7
+ pmaddubsw m14, m9, m7
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m13, m0
+ paddw m14, m1
+.w16_loop:
+ vpbroadcastd m0, [tlq+hq-4]
+ vpbroadcastq m1, [v_weightsq]
+ add v_weightsq, 8
+ mova m4, m6
+ vpshufb m4{k1}, m0, m5
+ pmaddubsw m2, m4, m7
+ pshufb m1, m10
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m11
+ pmaddubsw m4, m12
+ paddw m0, m13
+ paddw m1, m14
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m15, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ vbroadcasti32x8 m9, [tlq+hq+1]
+ movshdup m10, [smooth_shuf]
+ mova m12, [smooth_weights+32*2]
+ vpbroadcastd ym5, [pb_1]
+ mova m15, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m13, m8, m7
+ pmaddubsw m14, m9, m7
+ vshufi32x4 m11, m12, m12, q2020
+ vshufi32x4 m12, m12, q3131
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m13, m0
+ paddw m14, m1
+.w32_loop:
+ vpbroadcastd m0, [tlq+hq-2]
+ vpbroadcastd m1, [v_weightsq]
+ add v_weightsq, 4
+ mova m4, m6
+ vpshufb m4{k1}, m0, m5
+ pmaddubsw m2, m4, m7
+ pshufb m1, m10
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m11
+ pmaddubsw m4, m12
+ paddw m0, m13
+ paddw m1, m14
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m15, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ movu m9, [tlq+hq+1]
+ mova m11, [smooth_weights+64*2]
+ mova m2, [smooth_weights+64*3]
+ mova m14, [smooth_endB]
+ punpcklbw m8, m9, m0
+ punpckhbw m9, m0
+ pmaddubsw m12, m8, m7
+ pmaddubsw m13, m9, m7
+ vshufi32x4 m10, m11, m2, q2020
+ vshufi32x4 m11, m2, q3131
+ paddw m0, m1, m8
+ paddw m1, m9
+ paddw m12, m0
+ paddw m13, m1
+.w64_loop:
+ mova m4, m6
+ vpbroadcastb m4{k1}, [tlq+hq-1]
+ vpbroadcastw m1, [v_weightsq]
+ add v_weightsq, 2
+ pmaddubsw m2, m4, m7
+ pmaddubsw m0, m8, m1
+ pmaddubsw m1, m9, m1
+ paddw m2, m4
+ pmaddubsw m3, m4, m10
+ pmaddubsw m4, m11
+ paddw m0, m12
+ paddw m1, m13
+ paddw m3, m2
+ paddw m4, m2
+ pavgw m0, m3
+ pavgw m1, m4
+ vpermt2b m0, m14, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
+ lea r6, [pal_pred_8bpc_avx512icl_table]
+ tzcnt wd, wm
+ vbroadcasti32x4 m4, [palq]
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ packuswb m4, m4
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ pshufb xmm0, xm4, [idxq]
+ add idxq, 16
+ movd [dstq+strideq*0], xmm0
+ pextrd [dstq+strideq*1], xmm0, 1
+ pextrd [dstq+strideq*2], xmm0, 2
+ pextrd [dstq+stride3q ], xmm0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pshufb xmm0, xm4, [idxq+16*0]
+ pshufb xmm1, xm4, [idxq+16*1]
+ add idxq, 16*2
+ movq [dstq+strideq*0], xmm0
+ movhps [dstq+strideq*1], xmm0
+ movq [dstq+strideq*2], xmm1
+ movhps [dstq+stride3q ], xmm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pshufb m0, m4, [idxq]
+ add idxq, 64
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ pshufb m0, m4, [idxq+64*0]
+ pshufb m1, m4, [idxq+64*1]
+ add idxq, 64*2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w32
+ RET
+.w64:
+ pshufb m0, m4, [idxq+64*0]
+ pshufb m1, m4, [idxq+64*1]
+ pshufb m2, m4, [idxq+64*2]
+ pshufb m3, m4, [idxq+64*3]
+ add idxq, 64*4
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w64
+ RET
+
+; The ipred_filter code processes 4x2 blocks in the following order,
+; which increases parallelism compared to doing things row by row.
+; Some redundant blocks are calculated for w > 4.
+; w4 w8 w16 w32
+; 1 1 2 1 2 3 4 1 2 3 4 9 a b c
+; 2 2 3 2 3 4 5 2 3 4 5 a b c d
+; 3 3 4 3 4 5 6 3 4 5 6 b c d e
+; 4 4 5 4 5 6 7 4 5 6 7 c d e f
+; 5 5 6 5 6 7 8 5 6 7 8 d e f g
+; 6 6 7 6 7 8 9 6 7 8 9 e f g h
+; 7 7 8 7 8 9 a 7 8 9 a f g h i
+; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___
+; 9 9 a b h i j
+; a b i j
+; b j
+
+cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt
+%define base r6-filter_taps
+ lea r6, [filter_taps]
+%ifidn fltd, fltm
+ movzx fltd, fltb
+%else
+ movzx fltd, byte fltm
+%endif
+ vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0
+ movifnidn hd, hm
+ shl fltd, 6
+ vpbroadcastd m6, [base+pd_8]
+ vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __
+ vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4
+ vbroadcasti32x4 m8, [r6+fltq+16*1]
+ vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __
+ vbroadcasti32x4 m10, [r6+fltq+16*3]
+ mova xmm0, xm6
+ vpdpbusd xmm0, xmm2, xm7
+ mova xmm1, xm6
+ vpdpbusd xmm1, xmm2, xm8
+ vpdpbusd xmm0, xmm3, xm9
+ vpdpbusd xmm1, xmm3, xm10
+ packssdw xmm0, xmm1
+ cmp wd, 8
+ jb .w4
+ vpbroadcastd ym2, [tlq+5]
+ mova m11, [base+filter_perm]
+ mov r5, 0xffffffffffff000f
+ psrldq xmm2, 1 ; __ t0
+ kmovq k1, r5 ; 0x000f
+ psraw xm5, xmm0, 4
+ packuswb xmm2, xm5 ; __ t0 a0 b0
+ pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1
+ je .w8
+ kxnorb k3, k3, k3 ; 0x00ff
+ vpbroadcastd xm3, [tlq-4]
+ kandnq k2, k3, k1 ; 0xffffffffffff0000
+ vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __
+ mova ym0, ym6
+ vpdpbusd ym0, ym2, ym7
+ mova ym1, ym6
+ vpdpbusd ym1, ym2, ym8
+ pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t1
+ vpbroadcastd m2, [tlq+9]
+ vpdpbusd ym0, ym3, ym9
+ vpdpbusd ym1, ym3, ym10
+ vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __
+ kunpckbw k4, k1, k3 ; 0x0fff
+ packssdw ym0, ym1
+ psraw ym0, 4 ; c0 d0 a1 b1
+ packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1
+ pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2
+ vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ psrldq m0, m2, 1 ; __ d0 __ b1 __ t2
+ vpbroadcastd m2, [tlq+13]
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ mova m12, [base+filter_end]
+ lea r5d, [hq-6]
+ mov r6, dstq
+ cmovp hd, r5d ; w == 16 ? h : h - 6
+ packssdw m4, m1
+ psraw m4, 4 ; e0 f0 c1 d1 a2 b2
+ packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2
+ pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3
+.w16_loop:
+ vpbroadcastd xm3, [tlq-8]
+ vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __
+ mova m1, m6
+ vpdpbusd m1, m2, m7
+ mova m0, m6
+ vpdpbusd m0, m2, m8
+ sub tlq, 2
+ vpdpbusd m1, m3, m9
+ vpdpbusd m0, m3, m10
+ packssdw m1, m0
+ mova m0, m4
+ psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3
+ packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3
+ pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
+ vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3
+ vextracti32x4 [dstq+strideq*0], m5, 2
+ vextracti32x4 [dstq+strideq*1], m5, 3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ cmp wd, 16
+ je .ret
+ mova xm13, [filter_perm+16]
+ mova xmm3, [r6+strideq*0]
+ punpckhdq xmm3, [r6+strideq*1]
+ vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3
+ pinsrb xm3, xmm3, [tlq+r5+16], 7
+ pshufb xm3, xm13
+ vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __
+ mova m0, m6
+ vpdpbusd m0, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kunpckbw k5, k3, k1 ; 0xff0f
+ lea r3, [strideq*3]
+ vpdpbusd m0, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m0, m1
+ psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3
+ packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
+ vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3
+ vpbroadcastd ym2, [tlq+r5+21]
+ pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3
+ vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3
+ vextracti32x4 [dstq+strideq*0], m5, 2
+ vextracti32x4 [dstq+strideq*1], m5, 3
+ punpckhqdq xmm3, [r6+r3]
+ pinsrb xmm3, [r6+strideq*2+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kxnord k3, k3, k4 ; 0xfffff0ff
+ lea r4, [strideq*5]
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m4, m1
+ psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3
+ packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3
+ vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3
+ vpbroadcastd m2, [tlq+r5+25]
+ pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3
+ vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3
+ vextracti32x4 [dstq+strideq*2], m5, 2
+ vextracti32x4 [dstq+r3 ], m5, 3
+ punpckhqdq xmm3, [r6+r4]
+ pinsrb xmm3, [r6+strideq*4+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb gb fb __
+ mova m0, m6
+ vpdpbusd m0, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ kunpckwd k1, k1, k2 ; 0x000f0000
+ vpdpbusd m0, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m0, m1
+ psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3
+ packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3
+ vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3
+ vpbroadcastd m2, [tlq+r5+29]
+ pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7
+ vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3
+ vextracti32x4 [dstq+strideq*4], m5, 2
+ vextracti32x4 [dstq+r4 ], m5, 3
+ lea r0, [strideq+r3*2]
+.w32_loop:
+ punpckhqdq xmm3, [r6+r0]
+ pinsrb xmm3, [r6+r3*2+15], 11
+ pshufb xm3, xmm3, xm13
+ vpermb m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __
+.w32_loop_tail:
+ mova m4, m6
+ vpdpbusd m4, m2, m7
+ mova m1, m6
+ vpdpbusd m1, m2, m8
+ vpdpbusd m4, m3, m9
+ vpdpbusd m1, m3, m10
+ packssdw m4, m1
+ mova m1, m0
+ psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7
+ packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7
+ pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7
+ vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7
+ vextracti32x4 [r6+strideq*0+16], m5, 2
+ vextracti32x4 [r6+strideq*1+16], m5, 3
+ lea r6, [r6+strideq*2]
+ sub r5d, 2
+ jg .w32_loop
+ vpermb m3, m11, m1
+ cmp r5d, -6
+ jg .w32_loop_tail
+.ret:
+ RET
+.w8:
+ vpermb ym3, ym11, ymm2
+.w8_loop:
+ vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __
+ mova ym0, ym6
+ vpdpbusd ym0, ym2, ym7
+ mova ym1, ym6
+ vpdpbusd ym1, ym2, ym8
+ sub tlq, 2
+ vpdpbusd ym0, ym3, ym9
+ vpdpbusd ym1, ym3, ym10
+ mova ym3, ym5
+ packssdw ym0, ym1
+ psraw ym5, ym0, 4 ; c0 d0 a1 b1
+ packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1
+ pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1
+ vpermb ym3, ym11, ym3 ; a0 a1 b0 b1
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w4_loop:
+ vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __
+ mova xmm0, xm6
+ vpdpbusd xmm0, xmm2, xm7
+ mova xmm1, xm6
+ vpdpbusd xmm1, xmm2, xm8
+ sub tlq, 2
+ vpdpbusd xmm0, xmm3, xm9
+ vpdpbusd xmm1, xmm3, xm10
+ packssdw xmm0, xmm1
+.w4:
+ psraw xmm0, 4 ; a0 b0
+ packuswb xmm0, xmm0
+ movd [dstq+strideq*0], xmm0
+ pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0
+ movd [dstq+strideq*1], xmm2
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm
new file mode 100644
index 0000000000..67e90b79ae
--- /dev/null
+++ b/third_party/dav1d/src/x86/ipred_sse.asm
@@ -0,0 +1,5409 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro SMOOTH_WEIGHT_TABLE 1-*
+ %rep %0
+ db %1-128, 127-%1
+ %rotate 1
+ %endrep
+%endmacro
+
+; sm_weights[], but modified to precalculate x and 256-x with offsets to
+; enable efficient use of pmaddubsw (which requires signed values)
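+; Each weight w is stored as the signed byte pair (w-128, 127-w), so one
+; pmaddubsw against an unsigned (a, b) pixel pair gives (w-128)*a + (127-w)*b;
+; adding the 128*a + 129*b bias term (see the SMOOTH macro below) recovers
+; w*a + (256-w)*b while keeping the intermediates within signed 16-bit range.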
+smooth_weights: SMOOTH_WEIGHT_TABLE \
+ 0, 0, 255, 128, 255, 149, 85, 64, \
+ 255, 197, 146, 105, 73, 50, 37, 32, \
+ 255, 225, 196, 170, 145, 123, 102, 84, \
+ 68, 54, 43, 33, 26, 20, 17, 16, \
+ 255, 240, 225, 210, 196, 182, 169, 157, \
+ 145, 133, 122, 111, 101, 92, 83, 74, \
+ 66, 59, 52, 45, 39, 34, 29, 25, \
+ 21, 17, 14, 12, 10, 9, 8, 8, \
+ 255, 248, 240, 233, 225, 218, 210, 203, \
+ 196, 189, 182, 176, 169, 163, 156, 150, \
+ 144, 138, 133, 127, 121, 116, 111, 106, \
+ 101, 96, 91, 86, 82, 77, 73, 69, \
+ 65, 61, 57, 54, 50, 47, 44, 41, \
+ 38, 35, 32, 29, 27, 25, 22, 20, \
+ 18, 16, 15, 13, 12, 10, 9, 8, \
+ 7, 6, 6, 5, 5, 4, 4, 4
+
+ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7
+z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
+filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
+z_filter_wh4: db 7, 7, 19, 7
+z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
+pd_32768: dd 32768
+z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64
+z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
+z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
+ db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
+z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11
+z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
+z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
+z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
+z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
+pw_m1to4: dw -1, -2, -3, -4
+z_filter_k: times 4 db 0, 16
+ times 4 db 0, 20
+ times 4 db 8, 16
+ times 4 db 32, 16
+ times 4 db 24, 20
+ times 4 db 16, 16
+ times 4 db 0, 0
+ times 4 db 0, 0
+pw_8: times 8 db 8, 0
+pb_3: times 16 db 3
+pb_16: times 16 db 16
+pw_62: times 8 dw 62
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_512: times 8 dw 512
+pw_m256: times 8 dw -256
+pb_2: times 8 db 2
+pb_4: times 8 db 4
+pb_8: times 8 db 8
+pb_128: times 8 db 128
+pb_m16: times 8 db -16
+pw_128: times 4 dw 128
+pw_255: times 4 dw 255
+pb_36_m4: times 4 db 36, -4
+pb_127_m127: times 4 db 127, -127
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4)
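+; ipred_dc's jump table (and ipred_cfl's) packs its h*, w* and s* entries back
+; to back; the *_splat aliases point 10*4 (resp. 8*4) bytes past the base so
+; that ipred_v, dc_left, dc_top and dc_128 can dispatch straight into the
+; shared .s* store loops, while the s*-10*4 / s*-8*4 terms in the tables below
+; compensate for the shifted base address.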
+
+JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
+JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
+ s4-8*4, s8-8*4, s16-8*4, s32-8*4
+JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32
+
+cextern dr_intra_derivative
+cextern filter_intra_taps
+
+SECTION .text
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro IPRED_SET 3 ; width, row offset, pshuflw imm8
+ pshuflw m1, m0, %3 ; broadcast one left pixel to 8 bytes
+ punpcklqdq m1, m1
+ mova [dstq + %2], m1
+%if %1 > 16
+ mova [dstq + 16 + %2], m1
+%endif
+%if %1 > 32
+ mova [dstq + 32 + %2], m1
+ mova [dstq + 48 + %2], m1
+%endif
+%endmacro
+
+%macro IPRED_H 1 ; width
+ sub tlq, 4
+ movd m0, [tlq] ; get 4 bytes of topleft data
+ punpcklbw m0, m0 ; duplicate each byte
+%if %1 == 4
+ pshuflw m1, m0, q2233
+ movd [dstq+strideq*0], m1
+ psrlq m1, 32
+ movd [dstq+strideq*1], m1
+ pshuflw m0, m0, q0011
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+stride3q ], m0
+
+%elif %1 == 8
+ punpcklwd m0, m0
+ punpckhdq m1, m0, m0
+ punpckldq m0, m0
+ movq [dstq+strideq*1], m1
+ movhps [dstq+strideq*0], m1
+ movq [dstq+stride3q ], m0
+ movhps [dstq+strideq*2], m0
+%else
+ IPRED_SET %1, 0, q3333
+ IPRED_SET %1, strideq, q2222
+ IPRED_SET %1, strideq*2, q1111
+ IPRED_SET %1, stride3q, q0000
+%endif
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w%1
+ RET
+%endmacro
+
+INIT_XMM ssse3
+cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_h_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4
+.w8:
+ IPRED_H 8
+.w16:
+ IPRED_H 16
+.w32:
+ IPRED_H 32
+.w64:
+ IPRED_H 64
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movu m0, [tlq+ 1]
+ movu m1, [tlq+17]
+ movu m2, [tlq+33]
+ movu m3, [tlq+49]
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd m4, r5d
+ tzcnt r5d, r5d
+ movd m5, r5d
+ LEA r5, ipred_dc_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+20]
+ pcmpeqd m3, m3
+ psrlw m4, 1 ; dc = (width + height) >> 1;
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4
+ paddw m0, m1
+ pmaddwd m0, m3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s4:
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s8:
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pxor m1, m1
+ pshufb m0, m1
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+.s32:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q], m0
+ mova [dstq+stride3q+16], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-64]
+ mova m1, [tlq-48]
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-32]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-16]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 1]
+ movu m2, [tlq+17]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+33]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+49]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w64_end:
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq+32], m2
+ mova [dstq+strideq+48], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_left_ssse3_table
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, r6d
+ psrld m3, m2
+ movsxd r6, [r5+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ movu m1, [tlq+48] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m2
+ pmulhrsw m0, m3
+ lea stride3q, [strideq*3]
+ pxor m1, m1
+ pshufb m0, m1
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h
+ LEA r5, ipred_dc_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+ movd m2, wd
+ psrld m3, m2
+ movsxd r6, [r5+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, r5
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
+ ; w * a = (w - 128) * a + 128 * a
+ ; (256 - w) * b = (127 - w) * b + 129 * b
+ ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b]
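+ ; e.g. w=192, a=10, b=20: (192-128)*10 + (127-192)*20 = -660,
+ ; 128*10 + 129*20 = 3860, and -660 + 3860 = 3200 = 192*10 + 64*20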
+ pmaddubsw m6, m%3, m%1
+ pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b
+ paddw m6, m%5
+ paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128]
+ psrlw m6, 8
+ psrlw m0, 8
+ packuswb m6, m0
+%endmacro
+
+cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights
+%define base r6-ipred_smooth_v_ssse3_table
+ LEA r6, ipred_smooth_v_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ lea weightsq, [base+smooth_weights+hq*4]
+ neg hq
+ movd m5, [tlq+hq]
+ pxor m2, m2
+ pshufb m5, m2
+ add wq, r6
+ jmp wq
+.w4:
+ movd m2, [tlq+1]
+ punpckldq m2, m2
+ punpcklbw m2, m5 ; top, bottom
+ lea r3, [strideq*3]
+ mova m4, [base+ipred_v_shuf]
+ mova m5, m4
+ punpckldq m4, m4
+ punpckhdq m5, m5
+ pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom
+ paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok
+ paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128
+.w4_loop:
+ movu m1, [weightsq+hq*2]
+ pshufb m0, m1, m4 ; m2, m3, m4 and m5 stay constant in the loop
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movd [dstq+strideq*0], m6
+ pshuflw m1, m6, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m6, m6
+ movd [dstq+strideq*2], m6
+ psrlq m6, 32
+ movd [dstq+r3 ], m6
+ lea dstq, [dstq+strideq*4]
+ add hq, 4
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movq m2, [tlq+1]
+ punpcklbw m2, m5
+ mova m5, [base+ipred_v_shuf]
+ lea r3, [strideq*3]
+ pshufd m4, m5, q0000
+ pshufd m5, m5, q1111
+ pmaddubsw m3, m2, m0
+ paddw m1, m2
+ paddw m3, m1 ; m3 feeds the loop below
+.w8_loop:
+ movq m1, [weightsq+hq*2]
+ pshufb m0, m1, m4
+ pshufb m1, m5
+ SMOOTH 0, 1, 2, 2, 3, 3
+ movq [dstq+strideq*0], m6
+ movhps [dstq+strideq*1], m6
+ lea dstq, [dstq+strideq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m5
+ punpckhbw m3, m5
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1 ; m4 and m5 feed the loop below
+.w16_loop:
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add dstq, strideq
+ add hq, 1
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32:
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w32_loop_init:
+ mov r3d, 2
+.w32_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub tlq, 32
+ add hq, 1
+ jl .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+%if WIN64
+ movaps [rsp+24], xmm7
+ %define xmm_regs_used 8
+%endif
+ mova m7, m5
+.w64_loop_init:
+ mov r3d, 4
+.w64_loop:
+ movddup m0, [base+pb_127_m127]
+ movddup m1, [base+pw_128]
+ movu m3, [tlq+1]
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ pmaddubsw m4, m2, m0
+ pmaddubsw m5, m3, m0
+ paddw m0, m1, m2
+ paddw m1, m3
+ paddw m4, m0
+ paddw m5, m1
+ movd m1, [weightsq+hq*2]
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ SMOOTH 1, 1, 2, 3, 4, 5
+ mova [dstq], m6
+ add tlq, 16
+ add dstq, 16
+ dec r3d
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub tlq, 64
+ add hq, 1
+ jl .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h
+%define base r6-ipred_smooth_h_ssse3_table
+ LEA r6, ipred_smooth_h_ssse3_table
+ mov wd, wm
+ movd m3, [tlq+wq]
+ pxor m1, m1
+ pshufb m3, m1 ; right
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+pb_127_m127]
+ movddup m5, [base+pw_128]
+ add wq, r6
+ jmp wq
+.w4:
+ movddup m6, [base+smooth_weights+4*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ lea r3, [strideq*3]
+.w4_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m6, [base+smooth_weights+8*2]
+ mova m7, [base+ipred_h_shuf]
+ sub tlq, 4
+ sub tlq, hq
+ punpckldq m7, m7
+.w8_loop:
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m7
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m6
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m6, [base+smooth_weights+16*2]
+ mova m7, [base+smooth_weights+16*3]
+ sub tlq, 1
+ sub tlq, hq
+.w16_loop:
+ pxor m1, m1
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m1
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m6
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w32_loop_init:
+ mov r5, 2
+ lea r3, [base+smooth_weights+16*4]
+.w32_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w32_loop
+ lea dstq, [dstq-32+strideq]
+ sub hd, 1
+ jg .w32_loop_init
+ RET
+ALIGN function_align
+.w64:
+ sub tlq, 1
+ sub tlq, hq
+ pxor m6, m6
+.w64_loop_init:
+ mov r5, 4
+ lea r3, [base+smooth_weights+16*8]
+.w64_loop:
+ mova m7, [r3]
+ add r3, 16
+ movd m2, [tlq+hq] ; left
+ pshufb m2, m6
+ punpcklbw m1, m2, m3 ; left, right
+ punpckhbw m2, m3
+ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
+ paddw m0, m1 ; 128 * left + 129 * right
+ pmaddubsw m1, m7
+ paddw m1, m5
+ paddw m0, m1
+ pmaddubsw m1, m2, m4
+ paddw m1, m2
+ mova m7, [r3]
+ add r3, 16
+ pmaddubsw m2, m7
+ paddw m2, m5
+ paddw m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 16
+ dec r5
+ jg .w64_loop
+ lea dstq, [dstq-64+strideq]
+ sub hd, 1
+ jg .w64_loop_init
+ RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3
+ pmaddubsw m6, m%3, m%1
+ mova m0, m6
+ pmaddubsw m6, m%4, m%2
+ mova m1, m6
+%ifnum %5
+ paddw m0, m%5
+%else
+ paddw m0, %5
+%endif
+%ifnum %6
+ paddw m1, m%6
+%else
+ paddw m1, %6
+%endif
+%ifnum %7
+%else
+ mova m3, %7
+%endif
+ pavgw m0, m2
+ pavgw m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+%endmacro
+
+%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5]
+ mova m1, [rsp+16*%1] ; top
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pmaddubsw m2, m1, m5
+ mova [rsp+16*%2], m1
+ paddw m1, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m1 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%3], m2
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*%4], m6
+ paddw m6, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*%5], m2
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m3, m2
+ pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, %7
+ paddw m2, m3, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ mova m7, [rsp+16*%9]
+ pshufb m1, m7
+ mova [rsp+16*%8], m3
+ mova m4, [rsp+16*%2]
+ mova m5, [rsp+16*%3]
+ mova m3, [rsp+16*%4]
+ mova m7, [rsp+16*%5]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8]
+ mova [dstq], m0
+ movddup m3, [base+pw_255] ; recovery
+ mova m0, [rsp+16*%10] ; recovery
+ mova m4, [rsp+16*%11] ; recovery
+ mova m5, [rsp+16*%12] ; recovery
+%endmacro
+
+cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights
+%define base r6-ipred_smooth_ssse3_table
+ mov wd, wm
+ mov hd, hm
+ LEA r6, ipred_smooth_ssse3_table
+ movd m4, [tlq+wq] ; right
+ pxor m2, m2
+ pshufb m4, m2
+ tzcnt wd, wd
+ mov r5, tlq
+ sub r5, hq
+ movsxd wq, [r6+wq*4]
+ movddup m5, [base+pb_127_m127]
+ movd m0, [r5]
+ pshufb m0, m2 ; bottom
+ movddup m3, [base+pw_255]
+ add wq, r6
+ lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
+ jmp wq
+.w4:
+ mova m7, [base+ipred_v_shuf]
+ movd m1, [tlq+1] ; left
+ pshufd m1, m1, q0000
+ sub tlq, 4
+ lea r3, [strideq*3]
+ sub tlq, hq
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m6, m7, q1100
+ pshufd m7, m7, q3322
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 256 * bottom + 255
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width];
+ punpcklqdq m1, m1
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w4_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ punpcklbw m0, m1, m4 ; left, right
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
+ pmaddubsw m3, m1, m5
+ paddw m2, m0 ; 128 * left + 129 * right
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 8
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r3 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova m7, [base+ipred_v_shuf]
+ movq m1, [tlq+1] ; left
+ punpcklqdq m1, m1
+ sub tlq, 4
+ sub tlq, hq
+ punpcklbw m1, m0
+ pshufd m6, m7, q0000
+ pshufd m7, m7, q1111
+ pmaddubsw m2, m1, m5
+ paddw m3, m1
+ paddw m2, m3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width];
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m6
+ mova [rsp+16*5], m5
+.w8_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+ipred_h_shuf]
+ pshufd m1, m1, q1100
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ pmaddubsw m2, m0, m5
+ pmaddubsw m3, m1, m5
+ paddw m2, m0
+ paddw m3, m1
+ mova m4, [rsp+16*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m4
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 4
+ pshufb m0, m1, m6
+ pshufb m1, m7
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3
+ mova m4, [rsp+16*3]
+ mova m6, [rsp+16*4]
+ mova m5, [rsp+16*5]
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ mova m7, [base+ipred_v_shuf]
+ movu m1, [tlq+1] ; left
+ sub tlq, 4
+ sub tlq, hq
+ punpckhbw m6, m1, m0 ; top, bottom
+ punpcklbw m1, m0 ; top, bottom
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ pmaddubsw m2, m6, m5
+ mova [rsp+16*5], m6
+ paddw m6, m3 ; 1 * top + 256 * bottom + 255
+ paddw m2, m6 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*6], m2
+ pmaddubsw m2, m1, m5
+ paddw m3, m1 ; 1 * top + 256 * bottom + 255
+ mova [rsp+16*0], m1
+ paddw m2, m3 ; 128 * top + 129 * bottom + 255
+ mova [rsp+16*1], m2
+ mova [rsp+16*3], m4
+ mova [rsp+16*4], m5
+.w16_loop:
+ movd m1, [tlq+hq] ; left
+ pshufb m1, [base+pb_3] ; topleft[-(1 + y)]
+ punpcklbw m1, m4 ; left, right
+ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right
+ paddw m2, m1 ; 128 * left + 129 * right
+ mova m0, m1
+ mova m3, m2
+ pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width];
+ pmaddubsw m1, [base+smooth_weights+16*3]
+ paddw m2, m0
+ paddw m3, m1
+ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height];
+ add v_weightsq, 2
+ mova m7, [rsp+16*2]
+ pshufb m1, m7
+ mova [rsp+16*7], m3
+ mova m4, [rsp+16*0]
+ mova m5, [rsp+16*1]
+ mova m3, [rsp+16*5]
+ mova m7, [rsp+16*6]
+ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7]
+ mova m4, [rsp+16*3]
+ mova m5, [rsp+16*4]
+ mova [dstq], m0
+ lea dstq, [dstq+strideq]
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w32_loop:
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5
+ lea dstq, [dstq-16+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m1, [tlq+1] ; top topleft[1 + x]
+ movu m2, [tlq+17] ; top
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ movu m1, [tlq+33] ; top
+ movu m2, [tlq+49] ; top
+ mova [rsp+16*11], m1
+ mova [rsp+16*12], m2
+ sub tlq, 4
+ sub tlq, hq
+ mova m7, [base+ipred_v_shuf]
+ pshufd m7, m7, q0000
+ mova [rsp+16*2], m7
+ mova [rsp+16*3], m0
+ mova [rsp+16*4], m4
+ mova [rsp+16*5], m5
+.w64_loop:
+ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5
+ add dstq, 16
+ SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5
+ lea dstq, [dstq-48+strideq]
+ add v_weightsq, 2
+ sub hd, 1
+ jg .w64_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+%else
+cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define strideq r3
+ %define stridemp dword [rsp+16*12]
+ mov stridemp, r1
+ LEA r1, $$
+%endif
+ tzcnt wd, wm
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ inc tlq
+ movsxd wq, [base+ipred_z1_ssse3_table+wq*4]
+ mov dxd, angled
+ and dxd, 0x7e
+ add angled, 165 ; ~90
+ lea wq, [base+wq+ipred_z1_ssse3_table]
+ movzx dxd, word [base+dr_intra_derivative+dxq]
+ xor angled, 0x4ff ; d = 90 - angle
+ jmp wq
+.w4:
+ lea r3d, [angleq+88]
+ test r3d, 0x480
+ jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r3d, 9
+ add r3d, hd
+ cmp r3d, 8
+ jg .w4_no_upsample ; h > 8 || (w == h && is_sm)
+ mova m1, [tlq-1]
+ pshufb m0, m1, [base+z_upsample1]
+ pshufb m1, [base+z_upsample2]
+ movddup m2, [base+pb_36_m4]
+ add dxd, dxd
+ pmaddubsw m0, m2
+ pshufd m7, m1, q3333
+ movd [rsp+16], m7 ; top[max_base_x]
+ pmaddubsw m1, m2
+ movd m6, dxd
+ mov r5d, dxd ; xpos
+ pshufb m6, [base+pw_256]
+ paddw m1, m0
+ movq m0, [tlq]
+ pmulhrsw m1, m10
+ paddw m7, m6, m6
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ packuswb m1, m1
+ punpcklbw m0, m1
+ movifnidn strideq, stridemp
+ mova [rsp], m0
+.w4_upsample_loop:
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movq m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movhps m0, [rsp+r2]
+ pand m2, m8, m6 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_upsample_loop
+ RET
+.w4_no_upsample:
+ mov r3d, 7 ; max_base
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea r3d, [hq+3]
+ movd m0, r3d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r3d, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ mova m3, [tlq-1]
+ imul r5d, 0x55555555
+ movu m7, [base+z_filter_s+8]
+ shr r5d, 30 ; filter_strength
+ movddup m0, [base+pb_8]
+ pminub m7, m0
+ pshufb m0, m3, [base+z_filter_s]
+ movddup m4, [base+z_filter_k-8+r5*8+24*0]
+ pshufb m3, m7
+ movddup m5, [base+z_filter_k-8+r5*8+24*1]
+ shufps m2, m0, m3, q2121
+ movddup m6, [base+z_filter_k-8+r5*8+24*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m2, m4
+ pmaddubsw m2, m5
+ paddd m5, m6
+ pmaddubsw m4, m3, m5
+ pmaddubsw m3, m6
+ paddw m0, m2
+ paddw m1, m4
+ paddw m0, m3
+ pshufd m1, m1, q3333
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ mov r5d, 9
+ mov tlq, rsp
+ cmp hd, 4
+ cmovne r3d, r5d
+ packuswb m0, m1
+ mova [tlq], m0
+.w4_main:
+ add tlq, r3
+ movd m5, dxd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd ; xpos
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ mova m3, [base+z1_shuf_w4]
+ paddw m6, m5, m5
+ psubw m4, m0 ; max_base_x
+ punpcklqdq m5, m6 ; xpos0 xpos1
+.w4_loop:
+ lea r3, [r5+dxq]
+ sar r5, 6 ; base0
+ movq m0, [tlq+r5]
+ lea r5, [r3+dxq]
+ sar r3, 6 ; base1
+ movhps m0, [tlq+r3]
+ pand m2, m8, m5 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1
+ movifnidn strideq, stridemp
+ pcmpgtw m1, m4, m5 ; base < max_base_x
+ pmulhrsw m0, m10
+ paddw m5, m6 ; xpos += dx
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ test r5d, r5d
+ jl .w4_loop
+ packuswb m7, m7
+.w4_end_loop:
+ movd [dstq+strideq*0], m7
+ movd [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8:
+ lea r3d, [angleq+88]
+ and r3d, ~0x7f
+ or r3d, hd
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ mova m5, [base+z_upsample1]
+ movu m3, [base+z_filter_s+6]
+ movd m4, hd
+ mova m0, [tlq-1]
+ movu m1, [tlq+7]
+ pxor m7, m7
+ pshufb m4, m7
+ movddup m7, [base+pb_36_m4]
+ pminub m4, m3
+ add dxd, dxd
+ pshufb m2, m0, m5
+ pmaddubsw m2, m7
+ pshufb m0, m3
+ pmaddubsw m0, m7
+ movd m6, dxd
+ pshufb m3, m1, m5
+ pmaddubsw m3, m7
+ pshufb m1, m4
+ pmaddubsw m1, m7
+ pshufb m6, [base+pw_256]
+ mov r5d, dxd
+ paddw m2, m0
+ paddw m7, m6, m6
+ paddw m3, m1
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ movu m1, [tlq]
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ packuswb m2, m3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ movifnidn strideq, stridemp
+ mova [rsp+16*0], m0
+ mova [rsp+16*1], m1
+.w8_upsample_loop:
+ lea r2d, [r5+dxq]
+ shr r5d, 6 ; base0
+ movu m0, [rsp+r5]
+ lea r5d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu m1, [rsp+r2]
+ pand m2, m8, m6
+ psubw m3, m9, m2
+ psllw m2, 8
+ por m3, m2
+ punpcklqdq m2, m3, m3 ; frac0
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3 ; frac1
+ pmaddubsw m1, m3
+ paddw m6, m7
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_upsample:
+ lea r3d, [hq+7]
+ movd m0, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ test angled, 0x400
+ jnz .w8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ movd m3, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x55555555
+ movu m1, [tlq+16*1]
+ shr r5d, 30 ; filter_strength
+ movd m2, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ sub r5, 3
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ pshufb m2, m7
+ mova [tlq-16*2], m3
+ movq [tlq+r3-15], m2
+ call .filter_edge
+ sar r5d, 1
+ add r5d, 17
+ cmp hd, 8
+ cmova r3d, r5d
+.w8_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ shl r3d, 6
+ movu m3, [base+z_filter_s+2]
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd
+ pshufb m7, [base+pw_m256]
+ sub r5, r3
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z_base_inc]
+ mova m6, m5
+.w8_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m0, [tlq+r3]
+ pand m1, m8, m5
+ psubw m2, m9, m1
+ psllw m1, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [dstq], m0
+ dec hd
+ jz .w8_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w8_loop
+ packuswb m7, m7
+.w8_end_loop:
+ movq [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16:
+ lea r3d, [hq+15]
+ movd m0, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ test angled, 0x400
+ jnz .w16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ imul r5d, 0x24924924
+ movu m1, [tlq+16*1]
+ shr r5d, 30
+ movd m2, [tlq+30]
+ adc r5, -4 ; filter_strength-3
+ movd m3, [tlq+r3]
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*1], m0
+ pxor m7, m7
+ mova [tlq+16*0], m1
+ pshufb m4, m7
+ movd [rsp], m2
+ pshufb m3, m7
+ mova [tlq-16*2], m4
+ movd [tlq+r3-16], m3
+ call .filter_edge
+ cmp hd, 16
+ jle .w16_main
+ pshuflw m0, [rsp], q0000
+ sar r5, 1
+ movd m1, [base+z_filter_k_tail+4+r5*4]
+ lea r3d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+32], m0
+.w16_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq]
+ movd m4, r3d
+ shl r3d, 6
+ pshufb m5, [base+pw_256]
+ pxor m6, m6
+ pshufb m7, m6
+ mov r5d, dxd
+ pshufb m4, m6
+ sub r5, r3
+ psubb m4, [base+pb_0to15]
+ mova m6, m5
+.w16_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m3, m5, 6
+ packsswb m3, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ paddw m5, m6
+ pcmpgtb m2, m4, m3
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq], m0
+ dec hd
+ jz .w16_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w16_loop
+.w16_end_loop:
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32:
+ lea r3d, [hq+31]
+ and r3d, 31
+ or r3d, 32 ; imin(h+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ movd m6, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ movd m4, [tlq+62]
+ movd m5, [tlq+r3]
+ lea tlq, [rsp+16*6]
+ mova [tlq-16*3], m0
+ pxor m7, m7
+ mova [tlq-16*2], m1
+ pshufb m6, m7
+ mova [tlq-16*1], m2
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq+16*0], m3
+ movd [rsp], m4
+ pshufb m5, m7
+ mova [tlq-16*4], m6
+ movd [tlq+r3-48], m5
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ cmp hd, 32
+ jle .w32_main
+ pshuflw m0, [rsp], q0000
+ movd m1, [base+z_filter_k_tail+4]
+ add r3d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq+64], m0
+.w32_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ pshufb m5, [base+pw_256]
+ sub r5, r3
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ mova m6, m5
+.w32_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*1], m0
+ dec hd
+ jz .w32_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w32_loop
+.w32_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64:
+ lea r3d, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ movd m4, [tlq-1]
+ movu m0, [tlq+16*0]
+ movu m1, [tlq+16*1]
+ movu m2, [tlq+16*2]
+ movu m3, [tlq+16*3]
+ mova [rsp+16*3], m0
+ pxor m7, m7
+ mova [rsp+16*4], m1
+ pshufb m4, m7
+ mova [rsp+16*5], m2
+ mova [rsp+16*6], m3
+ mova [rsp+16*2], m4
+ movu m0, [tlq+16*4]
+ movu m1, [tlq+16*5]
+ movu m2, [tlq+16*6]
+ movu m3, [tlq+16*7]
+ movd m4, [tlq+r3]
+ lea tlq, [rsp+16*10]
+ mova [tlq-16*3], m0
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*2], m1
+ pshufb m4, m7
+ mova [tlq-16*1], m2
+ mova [tlq+16*0], m3
+ movd [tlq+r3-16*7], m4
+ cmp hd, 64
+ jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
+ call .filter_edge
+.w64_filter96:
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+.w64_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd
+ pshufb m0, m6
+ sub r5, r3
+ pshufb m5, [base+pw_256]
+ pshufb m7, m6
+ psubb m0, [base+pb_0to15]
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ paddb m0, m1
+ mova [rsp+16*2], m0
+ paddb m0, m1
+ mova [rsp+16*3], m0
+ mova m6, m5
+.w64_loop:
+ mov r3, r5
+ sar r3, 6
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0
+ psllw m0, 8
+ por m3, m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*2+0]
+ movu m2, [tlq+r3+16*2+1]
+ mova [dstq+16*1], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*2], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*3+0]
+ movu m2, [tlq+r3+16*3+1]
+ mova [dstq+16*2], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*3], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*3], m0
+ dec hd
+ jz .w64_end
+ movifnidn strideq, stridemp
+ add dstq, strideq
+ add r5, dxq
+ jl .w64_loop
+.w64_end_loop:
+ mova [dstq+16*0], m7
+ mova [dstq+16*1], m7
+ mova [dstq+16*2], m7
+ mova [dstq+16*3], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+ALIGN function_align
+.filter_edge: ; 32 pixels/iteration
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m2, [tlq-18]
+ movu m1, [tlq-17]
+ movu m3, [tlq- 2]
+ movu m4, [tlq- 1]
+ punpcklbw m0, m2, m1
+ pmaddubsw m0, m7
+ punpckhbw m2, m1
+ pmaddubsw m2, m7
+ punpcklbw m1, m3, m4
+ pmaddubsw m1, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m5, [tlq-16]
+ movu m6, [tlq-15]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ mova m5, [tlq+ 0]
+ movu m6, [tlq+ 1]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m1, m4
+ paddw m3, m5
+ test r5d, r5d
+ jnz .filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m5, [tlq-14]
+ movu m6, [tlq+ 2]
+ punpcklbw m4, m5, m5
+ pmaddubsw m4, m7
+ punpckhbw m5, m5
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ punpcklbw m5, m6, m6
+ pmaddubsw m5, m7
+ punpckhbw m6, m6
+ pmaddubsw m6, m7
+ paddw m1, m5
+ paddw m3, m6
+.filter_end:
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m10}, m0, m2, m1, m3
+%else
+ mova m4, m10
+ REPX {pmulhrsw x, m4 }, m0, m2, m1, m3
+%endif
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [tlq+16*0], m0
+ mova [tlq+16*1], m1
+ ret
+
+%if ARCH_X86_64
+cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
+ %define base r7-$$
+ %define maxwm r6m
+ %define maxhm r7m
+ lea r7, [$$]
+ mov hd, hm
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ lea r9d, [wq-4]
+ mova m10, [base+pw_512]
+ shl r9d, 6
+ mova m11, [base+z1_shuf_w4]
+ or r9d, hd
+ mova m12, [base+z2_h_shuf]
+%else
+cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define m11 [rsp+16*16]
+ %define m12 [rsp+16*17]
+ %define r8 [rsp+16*6+4*1]
+ %define r9b byte [rsp+16*18+4*0]
+ %define r9d dword [rsp+16*18+4*0]
+ %define r10d dword [rsp+16*18+4*1]
+ %define r11d dword [rsp+16*18+4*2]
+ %define maxwm [rsp+16*18+4*3]
+ %define maxhm [rsp+16*19+4*0]
+ %define stridemp [rsp+16*19+4*1]
+ %define strideq r3
+ %define dyd r4
+ %define dyq r4
+ mov stridemp, r1
+ mov r1d, r6m
+ mov r4d, r7m
+ mov maxwm, r1d
+ mov maxhm, r4d
+ LEA r1, $$
+ lea hd, [wq-4]
+ mova m0, [base+z1_shuf_w4]
+ shl hd, 6
+ mova m1, [base+z2_h_shuf]
+ or hd, hm
+ mova m11, m0
+ mov r9d, hd
+ mova m12, m1
+%endif
+ tzcnt wd, wd
+ movifnidn angled, anglem
+ movsxd wq, [base+ipred_z2_ssse3_table+wq*4]
+%if ARCH_X86_64
+ movzx dxd, angleb
+%else
+ movzx dxd, byte anglem
+%endif
+ xor angled, 0x400
+ mova m0, [tlq-16*4]
+ mov dyd, dxd
+ mova m1, [tlq-16*3]
+ neg dxq
+ mova m2, [tlq-16*2]
+ and dyd, ~1
+ mova m3, [tlq-16*1]
+ and dxq, ~1
+ movd m4, [tlq]
+ movu m5, [tlq+16*0+1]
+ movu m6, [tlq+16*1+1]
+ movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
+ movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
+ mova [rsp+16*2], m0
+ pxor m7, m7
+ mova [rsp+16*3], m1
+ pshufb m4, m7
+ mova [rsp+16*4], m2
+ lea wq, [base+ipred_z2_ssse3_table+wq]
+ mova [rsp+16*5], m3
+ neg dxd
+ mova [rsp+16*6], m4
+ or dyd, 4<<16
+ mova [rsp+16*7], m4
+ mova [rsp+16*8], m5
+ mova [rsp+16*9], m6
+ movq m0, [base+z_base_inc+2]
+ movsldup m1, [base+z2_dy_offset]
+ movq m2, [base+pw_256] ; 4<<6
+ movq [rsp+16*14+8*0], m0
+ movq [rsp+16*15+8*0], m1
+ movq [rsp+16*15+8*1], m2
+%if ARCH_X86_64
+ lea r10d, [dxq+(128<<6)] ; xpos
+%else
+ mov [rsp+16*7+4*1], dyd
+ lea r4d, [dxq+(128<<6)]
+ mov r10d, r4d
+ movzx hd, r9b
+%endif
+ mov r11d, (128-4)<<6
+ jmp wq
+.w4:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+4]
+ lea r3d, [hq+2]
+ add angled, 1022
+ pshufb m5, m7
+ shl r3d, 6
+ movd [rsp+16*8+4], m5
+ test r3d, angled
+ jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
+ call .upsample_above
+ sub angled, 1075 ; angle - 53
+ lea r3d, [hq+3]
+ xor angled, 0x7f ; 180 - angle
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, [base+z_filter_wh4]
+ pand m6, m0
+ pcmpgtb m6, [base+z_filter_t_w48+angleq*8]
+ jmp .w8_filter_left
+.upsample_above: ; w4/w8
+ movq m3, [rsp+gprsize+16*8-2]
+ movq m1, [rsp+gprsize+16*8-1]
+ movq m0, [rsp+gprsize+16*8+0]
+ movq m4, [rsp+gprsize+16*8+1]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m3
+ punpcklbw m2, m0, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+%if ARCH_X86_64
+ mova m11, [base+pb_0to15]
+ lea r10d, [r10+dxq+(1<<6)]
+ mov r11d, (128-7)<<6
+%else
+ mova m3, [base+pb_0to15]
+ mov r3d, [rsp+gprsize+16*18+4*1]
+ mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6
+ lea r3d, [r3+dxq+(1<<6)]
+ mov [rsp+gprsize+16*18+4*1], r3d
+ mova [rsp+gprsize+16*16], m3
+%endif
+ add dxd, dxd
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq m2, [rsp+gprsize+16*14]
+ paddw m2, m2
+ movq [rsp+gprsize+16*14], m2
+ packuswb m1, m1
+ punpcklbw m1, m0
+ mova [rsp+gprsize+16*8], m1
+ ret
+.w4_no_upsample_above:
+ lea r3d, [hq+3]
+ mov [rsp], angled
+ sub angled, 1112 ; angle - 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh4]
+ mova m4, [base+z_filter_t_w48+angleq*8]
+ call .w8_filter_top
+ mov angled, [rsp]
+ lea r3d, [hq+2]
+ sub angled, 139
+ shl r3d, 6
+ test r3d, angled
+ jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
+.upsample_left: ; w4/w8
+ neg hq
+ movd m0, [tlq+hq]
+ pshufb m0, m7
+ movd [rsp+16*6+hq-4], m0
+ movq m3, [rsp+16*5+7]
+ movq m0, [rsp+16*5+8]
+ movq m2, [rsp+16*5+9]
+ movq m4, [rsp+16*5+10]
+ movddup m5, [base+pb_36_m4]
+ punpcklbw m1, m0, m3
+ punpcklbw m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ movshdup m3, [base+z2_dy_offset]
+%if ARCH_X86_64
+ mova m12, [base+z2_upsample]
+ add dyd, dyd
+%else
+ mova m4, [base+z2_upsample]
+ shl dword [rsp+16*7+4*1], 1
+ mova m12, m4
+%endif
+ paddw m1, m2
+ pmulhrsw m1, m10
+ movq [rsp+16*15], m3
+ packuswb m1, m1
+ punpcklbw m0, m1
+ mova [rsp+16*5], m0
+.w4_main:
+ movd m6, dxd
+%if ARCH_X86_64
+ movd m3, dyd
+%else
+ movd m3, [rsp+16*7+4*1]
+%endif
+ movddup m0, [rsp+16*14+8*0]
+ pshufb m6, [base+pw_256]
+ paddw m7, m6, m6
+ movq m5, [base+pw_m1to4]
+ pshuflw m4, m3, q0000
+ punpcklqdq m6, m7
+ pmullw m4, m5
+ pshuflw m3, m3, q1111
+ paddw m6, m0
+ pshuflw m0, m4, q3333
+ psubw m4, [rsp+16*15]
+ movq [rsp+16*6+8*1], m3
+ movq [rsp+8*1], m0 ; dy*4
+%if ARCH_X86_64
+ mov r8, dstq
+%endif
+.w4_loop0:
+%if ARCH_X86_32
+ mov r8, dstq
+%endif
+ mova [rsp+16*12], m6
+ mov r2d, r10d
+ movq [rsp+8*0], m4
+ pand m0, m4, m8
+ psraw m4, 6
+ psubw m1, m9, m0
+ psllw m0, 8
+ por m0, m1 ; 64-frac_y, frac_y
+ movq [rsp+8*3], m0
+ pabsw m4, m4
+ movq [rsp+8*2], m4
+ movzx hd, r9b
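+; Each iteration below produces four rows of a 4-wide strip: pixels whose x
+; position (per-lane xpos in m6/m5, stepped by dx) still lies inside the top
+; edge are interpolated from the top samples at base_x0..base_x3, while pixels
+; past the top-left corner take the left column at base_y0..base_y3; the
+; psraw/pand/pandn sequence merges the two results.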
+.w4_loop:
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x0
+ movq m0, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x1
+ movhps m0, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base_x2
+ movq m1, [rsp+r2]
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base_x3
+ movhps m1, [rsp+r3]
+ pand m2, m8, m6
+ paddsw m5, m6, m7
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m0, m11
+ por m2, m3
+ pmaddubsw m0, m2
+ pand m2, m8, m5
+ psubw m3, m9, m2
+ psllw m2, 8
+ pshufb m1, m11
+ por m2, m3
+ pmaddubsw m1, m2
+ cmp r3d, 127 ; topleft
+ jge .w4_toponly
+ movzx r3d, byte [rsp+8*2+0] ; base_y0
+ movq m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+2] ; base_y1
+ movhps m3, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+4] ; base_y2
+ movq m4, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+6] ; base_y3
+ movhps m4, [rsp+r3]
+ pshufb m3, m12
+ pshufb m4, m12
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ movddup m4, [rsp+8*3]
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ psraw m6, 15 ; base_x < topleft
+ pand m2, m6
+ pandn m6, m0
+ por m0, m2, m6
+ psraw m6, m5, 15
+ pand m3, m6
+ pandn m6, m1
+ por m1, m3, m6
+.w4_toponly:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ movifnidn strideq, stridemp
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ sub hd, 4
+ jz .w4_end
+ movq m4, [rsp+8*2]
+ movq m3, [rsp+16*6+8*1]
+ paddw m6, m5, m7 ; xpos += dx
+ psubw m4, m3
+ movq [rsp+8*2], m4
+ lea dstq, [dstq+strideq*2]
+ cmp r2d, r11d
+ jge .w4_loop
+ movddup m5, [rsp+8*3]
+.w4_leftonly_loop:
+ movzx r3d, byte [rsp+8*2+0] ; base_y0
+ movq m1, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+2] ; base_y1
+ movhps m1, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+4] ; base_y2
+ movq m2, [rsp+r3]
+ movzx r3d, byte [rsp+8*2+6] ; base_y3
+ movhps m2, [rsp+r3]
+ psubw m4, m3
+ pshufb m1, m12
+ pshufb m2, m12
+ movq [rsp+8*2], m4
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ movifnidn strideq, stridemp
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*0], m0
+ psrlq m0, 32
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 4
+ jg .w4_leftonly_loop
+.w4_end:
+ sub r9d, 1<<8
+ jl .w4_ret
+ movq m4, [rsp+8*1]
+%if ARCH_X86_64
+ add r8, 4
+ mov dstq, r8
+%else
+ mov dstq, r8
+ add dstq, 4
+%endif
+ paddw m4, [rsp+8*0] ; base_y += 4*dy
+ movzx r3d, word [rsp+16*15+8*1]
+ add r10d, r3d
+ movddup m6, [rsp+16*15+8*1]
+ paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above)
+ jmp .w4_loop0
+.w4_ret:
+ RET
+.w8:
+ test angled, 0x400
+ jnz .w4_main
+ movd m5, [tlq+8]
+ lea r3d, [angleq+126]
+ pshufb m5, m7
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ movd [rsp+16*8+8], m5
+ cmp r3d, 8
+ ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
+ call .upsample_above
+ sub angled, 53
+ lea r3d, [hq+7]
+ xor angled, 0x7f ; 180 - angle
+ movu m1, [base+z_filter_wh8]
+ movd m0, r3d
+ movd m6, angled
+ shr angled, 8 ; is_sm << 1
+ psrldq m2, [base+z_filter_t_w48+angleq*8], 4
+ pshufb m0, m7
+ pshufb m6, m7
+ pcmpeqb m0, m1
+ pand m6, m0
+ pcmpgtb m6, m2
+%if ARCH_X86_64
+ movq [rsp+16*15+8*1], m10 ; 8<<6
+%else
+ movq m0, m10
+ movq [rsp+16*15+8*1], m0
+%endif
+ jmp .w8_filter_left
+.w8_no_upsample_above:
+ lea r3d, [hq+7]
+ mov [rsp], angled
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movu m3, [base+z_filter_wh8]
+ psrldq m4, [base+z_filter_t_w48+angleq*8], 4
+ call .w8_filter_top
+ mov r3d, [rsp]
+ sub r3d, 141
+%if ARCH_X86_64
+ mov r3b, hb
+%else
+ xor r3b, r3b
+ or r3d, hd
+%endif
+ cmp r3d, 8
+ jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
+.w8_filter_left:
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x55555555
+ mov r3, tlq
+ shr r5d, 30
+ sub r5, 3 ; filter_strength-3
+ jmp .filter_left
+.w8_filter_top:
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ pcmpeqb m0, m3
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m4
+ pcmpgtb m6, m4
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_filter_top_end ; filter_strength == 0
+ imul r5d, 0x55555555
+ movq m0, [rsp+gprsize+16*8-2]
+ shr r5d, 30
+ movq m1, [rsp+gprsize+16*8-1]
+ sub r5, 3 ; filter_strength-3
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ punpcklbw m0, m1
+ pmaddubsw m0, m7
+ movq m1, [rsp+gprsize+16*8+0]
+ movq m2, [rsp+gprsize+16*8+1]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ punpcklbw m1, m2
+ pmaddubsw m1, m7
+ movq m2, [rsp+gprsize+16*8+2]
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*2]
+ punpcklbw m2, m2
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m0, m2
+%if ARCH_X86_64
+ mov r3d, r7m ; maxw, offset due to call
+%else
+ mov r3d, [rsp+gprsize+16*18+4*3]
+%endif
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [rsp+gprsize+16*8], m0
+ cmp r3d, 8
+ jge .w8_filter_top_end
+ movq m0, [tlq+r3+1]
+ movq [rsp+gprsize+r3+16*8], m0
+.w8_filter_top_end:
+ ret
+.w16:
+ test angled, 0x400
+ jnz .w4_main
+ lea r3d, [hq+15]
+ sub angled, 90
+ movd m0, r3d
+ mov r3d, 90
+ movd m1, angled
+ sub r3d, angled ; 180 - angle
+ shr angled, 8 ; is_sm << 1
+ movd m6, r3d
+ REPX {pshufb x, m7}, m0, m1, m6
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m0, [base+z_filter_wh16]
+ pand m1, m0
+ pand m6, m0
+ pcmpgtb m1, m3
+ pcmpgtb m6, m3
+ pmovmskb r5d, m1
+ mov r3, tlq
+ test r5d, r5d
+ jz .w16_filter_left ; filter_strength == 0
+ imul r5d, 0x24924924
+ pshufb m5, [base+z_filter_t_w16] ; tlq[16]
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ movd [rsp+16*9], m5
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
+ movu m1, [rsp+16*8-2]
+ movu m2, [rsp+16*8-1]
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m7
+ punpckhbw m1, m2
+ pmaddubsw m1, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ mova m3, [rsp+16*8+0]
+ movu m4, [rsp+16*8+1]
+ punpcklbw m2, m3, m4
+ pmaddubsw m2, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ test r5d, r5d
+ jnz .w16_filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8]
+ movu m3, [rsp+16*8+2]
+ punpcklbw m2, m3, m3
+ pmaddubsw m2, m7
+ punpckhbw m3, m3
+ pmaddubsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+.w16_filter_end:
+ mov r2d, maxwm
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*8], m0
+ cmp r2d, 16
+ jge .w16_filter_left
+ movu m0, [r3+r2+1]
+ movu [rsp+r2+16*8], m0
+.w16_filter_left:
+ pmovmskb r5d, m6
+ test r5d, r5d
+ jz .w4_main
+ imul r5d, 0x24924924
+ shr r5d, 30
+ adc r5, -4 ; filter_strength-3
+ jmp .filter_left
+.w32:
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m6, [base+z_filter_t_w16] ; tlq[32]
+ mov r3, tlq
+ lea tlq, [rsp+16*9]
+ movd [tlq+16*1], m6
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mov r2d, maxwm
+ mova [rsp+16*8], m0
+ mova [rsp+16*9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16*8], m0
+ movu [rsp+r2+16*9], m1
+ jmp .filter_left
+.w64:
+ movu m0, [tlq+16*2+1]
+ movu m1, [tlq+16*3+1]
+ mova [rsp+16*10], m0
+ mova [rsp+16*11], m1
+ test angled, 0x400
+ jnz .w4_main
+ pshufb m1, [base+z_filter_t_w16] ; tlq[64]
+ mov r3, tlq
+ lea tlq, [rsp+16*11]
+ movd [tlq+16*1], m1
+ xor r5d, r5d ; filter_strength = 3
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova m2, [tlq+16*2]
+ mova m3, [tlq+16*3]
+ mov r2d, maxwm
+ mova [rsp+16* 8], m0
+ mova [rsp+16* 9], m1
+ mova [rsp+16*10], m2
+ mova [rsp+16*11], m3
+ cmp r2d, 64
+ jge .filter_left
+ movu m0, [r3+r2+16*0+1]
+ movu m1, [r3+r2+16*1+1]
+ movu [rsp+r2+16* 8], m0
+ movu [rsp+r2+16* 9], m1
+ cmp r2d, 32
+ jge .filter_left
+ movu m0, [r3+r2+16*2+1]
+ movu m1, [r3+r2+16*3+1]
+ movu [rsp+r2+16*10], m0
+ movu [rsp+r2+16*11], m1
+.filter_left:
+ neg hq
+ movd m0, [r3+hq]
+ pxor m1, m1
+ pshufb m0, m1
+ movd [rsp+16*6+hq-4], m0
+ lea tlq, [rsp+16*5]
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ cmp hd, -32
+ jge .filter_left_end
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ mova m0, [tlq+16*0]
+ mova m1, [tlq+16*1]
+ mova [rsp+16*2], m0
+ mova [rsp+16*3], m1
+.filter_left_end:
+ mov r2d, maxhm
+ mova m0, [rsp+16*5]
+ mova m1, [rsp+16*6]
+ mova m2, [rsp+16*7]
+ neg r2
+ mova [rsp+16*4], m0
+ mova [rsp+16*5], m1
+ mova [rsp+16*6], m2
+ cmp r2d, hd
+ jle .w4_main
+ movu m0, [r3+r2-16*2]
+ movu m1, [r3+r2-16*1]
+ movu [rsp+r2+16*4], m0
+ movu [rsp+r2+16*5], m1
+ cmp r2d, -32
+ jle .w4_main
+ movu m0, [r3+r2-16*4]
+ movu m1, [r3+r2-16*3]
+ movu [rsp+r2+16*2], m0
+ movu [rsp+r2+16*3], m1
+ jmp .w4_main
+
+%if ARCH_X86_64
+cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
+ %define base r7-$$
+ lea r7, [$$]
+ mova m8, [base+pw_62]
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+ mov org_wd, wd
+%else
+cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
+ %define base r1-$$
+ %define m8 [base+pw_62]
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define org_wd r5
+ %define org_wq r5
+ mov [dstq+strideq*0], strideq
+ mov [dstq+strideq*1], wd
+ LEA r1, $$
+%endif
+ tzcnt hd, hm
+ movifnidn angled, anglem
+ dec tlq
+ movsxd hq, [base+ipred_z3_ssse3_table+hq*4]
+ sub angled, 180
+ mov dyd, angled
+ neg dyd
+ xor angled, 0x400
+ or dyq, ~0x7e
+ lea hq, [base+ipred_z3_ssse3_table+hq]
+ movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
+ jmp hq
+.h4:
+ lea r4d, [angleq+88]
+ test r4d, 0x480
+ jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
+ sar r4d, 9
+ add r4d, wd
+ cmp r4d, 8
+ jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
+ movu m3, [tlq-7]
+ movu m1, [base+z_upsample1-4]
+ movu m4, [base+z_filter_s+2]
+ pshufb m0, m3, m1
+ pxor m1, m1
+ pshufb m2, m3, m1
+ pshufb m1, m3, m4
+ mova [rsp+16], m2 ; top[max_base_y]
+ movddup m2, [base+pb_36_m4]
+ add dyd, dyd
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ movd m5, dyd
+ mov r5d, dyd
+ pshufb m5, [base+pw_256]
+ paddw m0, m1
+ pmulhrsw m0, m10
+ shl wd, 2
+ mov tlq, rsp
+ sub rsp, wq
+ packuswb m0, m0
+ punpcklbw m0, m3
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+ pshufb m0, [base+pb_15to0]
+ mova [tlq], m0
+.h4_upsample_loop:
+ lea r4d, [r5+dyq]
+ shr r5d, 6
+ movq m0, [tlq+r5]
+ lea r5d, [r4+dyq]
+ shr r4d, 6
+ movhps m0, [tlq+r4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ por m1, m2
+ pmaddubsw m0, m1
+ paddw m5, m6
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jg .h4_upsample_loop
+ jmp .h4_transpose
+.h4_no_upsample:
+ mov r4d, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h4_main
+ lea r4d, [wq+3]
+ movd m0, r4d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r4d, 7
+ test r5d, r5d
+ jz .h4_main ; filter_strength == 0
+ movu m2, [tlq-7]
+ imul r5d, 0x55555555
+ movu m3, [base+z_filter_s-2]
+ shr r5d, 30 ; filter_strength
+ mova m4, [base+z_upsample2]
+ movddup m5, [base+z_filter_k-8+r5*8+24*0]
+ movddup m6, [base+z_filter_k-8+r5*8+24*1]
+ movddup m7, [base+z_filter_k-8+r5*8+24*2]
+ pshufb m0, m2, m3
+ shufps m3, m4, q2121
+ pmaddubsw m1, m0, m5
+ pmaddubsw m0, m6
+ pshufb m5, m2, m3
+ pmaddubsw m3, m5, m6
+ pmaddubsw m5, m7
+ pshufb m2, m4
+ pmaddubsw m2, m7
+ paddw m0, m1
+ paddw m1, m3
+ paddw m0, m5
+ paddw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ lea r2d, [r4+2]
+ cmp wd, 4
+ cmovne r4d, r2d
+ pshufd m0, m0, q0000
+ lea tlq, [rsp+15]
+ packuswb m0, m1
+ mova [rsp], m0
+.h4_main:
+ movd m5, dyd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf_h4]
+ lea r5, [dyq+r4+63] ; ypos
+ pshufb m4, [base+pw_256]
+ psubw m4, m0 ; max_base_y
+ shl wd, 2
+ paddw m6, m5, m5
+ sub rsp, wq
+ punpcklqdq m5, m6
+.h4_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movq m0, [tlq+r5-4]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movhps m0, [tlq+r4-4]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h4_transpose
+ test r5d, r5d
+ jg .h4_loop
+ packuswb m7, m7
+.h4_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h4_end_loop
+.h4_transpose:
+ mova m1, [base+z_transpose4]
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ lea r2, [strideq*3]
+ lea dstq, [dstq+org_wq-4]
+.h4_transpose_loop:
+ mova m0, [rsp]
+ add rsp, 16
+ pshufb m0, m1
+ movd [dstq+strideq*0], m0
+ pshuflw m2, m0, q1032
+ movd [dstq+strideq*1], m2
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ sub dstq, 4
+ sub org_wd, 4
+ jg .h4_transpose_loop
+ RET
+.h8:
+ lea r4d, [angleq+88]
+ and r4d, ~0x7f
+ or r4d, wd
+ cmp r4d, 8
+ ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+ mova m4, [tlq-15]
+ and r4d, 4
+ movu m3, [tlq- 9]
+ movd m1, r4d
+ movu m2, [base+z_filter_s+2]
+ pxor m0, m0
+ movu m5, [base+z_filter_s+6]
+ movddup m7, [base+pb_36_m4]
+ pshufb m1, m0 ; w & 4
+ movu m0, [base+z_upsample1-4]
+ pmaxub m1, m0 ; clip 4x8
+ add dyd, dyd
+ pshufb m0, m4, m1
+ pmaddubsw m0, m7
+ pshufb m1, m4, m2
+ pmaddubsw m1, m7
+ pshufb m2, m3, [base+z_upsample1]
+ pmaddubsw m2, m7
+ pshufb m3, m5
+ pmaddubsw m3, m7
+ movd m5, dyd
+ neg dyq
+ paddw m1, m0
+ paddw m2, m3
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ shl wd, 3
+ lea tlq, [rsp+16]
+ pshufb m5, [base+pw_256]
+ sub rsp, wq
+ packuswb m1, m2
+ lea r5, [dyq+63]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ mova [tlq-16*1], m0
+ mova [tlq-16*0], m1
+ paddw m6, m5, m5
+ punpcklqdq m5, m6
+.h8_upsample_loop:
+ lea r4, [r5+dyq]
+ sar r5, 6
+ movu m0, [tlq+r5]
+ lea r5, [r4+dyq]
+ sar r4, 6
+ movu m1, [tlq+r4]
+ pand m3, m8, m5
+ psubw m2, m9, m3
+ psllw m2, 8
+ por m3, m2
+ pshufd m2, m3, q1010
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3
+ pmaddubsw m1, m3
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m1, m0
+ mova [rsp+wq-16], m1
+ sub wd, 16
+ jg .h8_upsample_loop
+ jmp .h8_transpose
+.h8_no_upsample:
+ lea r4d, [wq+7]
+ movd m0, r4d
+ and r4d, 7
+ or r4d, 8 ; imin(w+7, 15)
+ test angled, 0x400
+ jnz .h8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h8_main ; filter_strength == 0
+ mova m0, [tlq-15]
+ imul r5d, 0x55555555
+ movd m1, [tlq+1]
+ neg r4
+ movd m2, [tlq+r4]
+ shr r5d, 30
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ sub r5, 3 ; filter_strength-3
+ mova [tlq+16*0], m0
+ pshufb m1, m7
+ mova [tlq+16*1], m1
+ pshufb m2, m7
+ movq [tlq+r4+8], m2
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sar r5d, 1
+ add tlq, 31
+ add r5d, 17
+ cmp wd, 8
+ cmova r4d, r5d
+.h8_main:
+ movd m5, dyd
+ sub tlq, r4
+ shl r4d, 6
+ movd m7, [tlq]
+ movd m4, r4d
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, [base+pw_m256]
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z3_base_inc]
+ shl wd, 3
+ mova m6, m5
+ sub rsp, wq
+.h8_loop:
+ mov r4, r5
+ sar r4, 6
+ movu m0, [tlq+r4-8]
+ pand m2, m8, m5
+ psubw m1, m9, m2
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2
+ pmaddubsw m0, m1
+ pcmpgtw m1, m4, m5
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ packuswb m0, m0
+ movq [rsp+wq-8], m0
+ sub wd, 8
+ jz .h8_transpose
+ add r5, dyq
+ jg .h8_loop
+ packuswb m7, m7
+.h8_end_loop:
+ movq [rsp+wq-8], m7
+ sub wd, 8
+ jg .h8_end_loop
+.h8_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 8
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+ mova m1, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*2
+ punpcklbw m2, m1, m0
+ punpckhbw m1, m0
+ punpckhbw m0, m1, m2
+ punpcklbw m1, m2
+.write_4x8_end:
+ call .write_4x8
+ RET
+.write_4x8:
+ movd [dstq+r2 ], m0
+ pshuflw m4, m0, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m0, m0
+ movd [dstq+strideq*1], m0
+ psrlq m0, 32
+ movd [dstq+strideq*0], m0
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+r2 ], m1
+ pshuflw m4, m1, q1032
+ movd [dstq+strideq*2], m4
+ punpckhqdq m1, m1
+ movd [dstq+strideq*1], m1
+ psrlq m1, 32
+ movd [dstq+strideq*0], m1
+ ret
+.h16:
+ lea r4d, [wq+15]
+ movd m0, r4d
+ and r4d, 15
+ or r4d, 16 ; imin(w+15, 31)
+ test angled, 0x400
+ jnz .h16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1
+ pshufb m2, m1
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .h16_main ; filter_strength == 0
+ mova m0, [tlq-16*2+1]
+ imul r5d, 0x24924924
+ mova m1, [tlq-16*1+1]
+ neg r4
+ movd m2, [tlq-16*0+1]
+ shr r5d, 30
+ movd m3, [tlq+r4]
+ adc r5, -4 ; filter_strength-3
+ pxor m7, m7
+ lea tlq, [rsp+16*2]
+ mova [tlq-16*1], m0
+ pshufb m2, m7
+ mova [tlq+16*0], m1
+ pshufb m3, m7
+ mova [tlq+16*1], m2
+ movq [tlq+r4+8], m3
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 31
+ cmp wd, 16
+ jle .h16_main
+ pshuflw m0, [tlq-47], q0000
+ sar r5, 1
+ movq m1, [base+z3_filter_k_tail+r5*4]
+ lea r4d, [r5+33]
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-35], m0
+.h16_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ shl wd, 4
+ mova m6, m5
+ sub rsp, wq
+.h16_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*2]
+ por m2, m1
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ psrlw m2, m5, 6
+ paddw m5, m6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+wq-16], m0
+ sub wd, 16
+ jz .h16_transpose
+ add r5, dyq
+ jg .h16_loop
+.h16_end_loop:
+ mova [rsp+wq-16], m7
+ sub wd, 16
+ jg .h16_end_loop
+.h16_transpose:
+%if ARCH_X86_32
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+ or r3d, 16
+ cmp org_wd, 4
+%if ARCH_X86_64
+ jne .end_transpose_main
+%else
+ jne .end_transpose_loop
+%endif
+.h16_transpose_w4:
+ mova m2, [rsp+16*3]
+ mova m4, [rsp+16*2]
+ mova m3, [rsp+16*1]
+ mova m0, [rsp+16*0]
+ lea r2, [strideq*3]
+ add rsp, 16*4
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
+ punpckhwd m0, m1, m4
+ punpcklwd m1, m4
+ call .write_4x8
+ lea dstq, [dstq+strideq*4]
+ punpckhwd m0, m2, m3
+ punpcklwd m1, m2, m3
+ jmp .write_4x8_end
+.h32:
+ lea r4d, [wq+31]
+ and r4d, 31
+ or r4d, 32 ; imin(w+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h32_main
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*4]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ add tlq, 63
+ cmp wd, 32
+ jle .h32_main
+ pshuflw m0, [tlq-79], q0000
+ movq m1, [base+z3_filter_k_tail]
+ add r4d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4
+%endif
+ packuswb m0, m0
+ movd [tlq-67], m0
+.h32_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h32_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*4]
+ por m2, m1
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 32
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h32_transpose
+ add r5, dyq
+ jg .h32_loop
+.h32_end_loop:
+ sub rsp, 32
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h32_end_loop
+.h32_transpose:
+ or r3d, 32
+ jmp .end_transpose_main
+.h64:
+ lea r4d, [wq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .h64_main
+ mova m0, [tlq-16*8+1]
+ mova m1, [tlq-16*7+1]
+ mova m2, [tlq-16*6+1]
+ mova m3, [tlq-16*5+1]
+ mova [rsp+16*1], m0
+ mova [rsp+16*2], m1
+ mova [rsp+16*3], m2
+ mova [rsp+16*4], m3
+ mova m0, [tlq-16*4+1]
+ mova m1, [tlq-16*3+1]
+ mova m2, [tlq-16*2+1]
+ mova m3, [tlq-16*1+1]
+ movd m4, [tlq-16*0+1]
+ neg r4
+ movd m5, [tlq+r4]
+ pxor m7, m7
+ lea tlq, [rsp+16*8]
+ mova [tlq-16*3], m0
+ mova [tlq-16*2], m1
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ pshufb m4, m7
+ mova [tlq+16*0], m3
+ pshufb m5, m7
+ mova [tlq+16*1], m4
+ movq [tlq+r4+8], m5
+ neg r4d
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+ sub tlq, 16*2
+ cmp wd, 64
+ jl .h64_filter96 ; skip one call if the last 32 bytes aren't used
+ call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
+.h64_filter96:
+ add tlq, 127
+.h64_main:
+ movd m5, dyd
+ sub tlq, r4
+ movd m4, r4d
+ shl r4d, 6
+ movd m7, [tlq]
+ pxor m6, m6
+ pshufb m5, [base+pw_256]
+ neg dyq
+ pshufb m7, m6
+ mova m3, [base+z3_shuf]
+ lea r5, [dyq+r4+63]
+ pshufb m4, m6
+ psubb m4, [base+pb_15to0]
+ mova m6, m5
+.h64_loop:
+ mov r4, r5
+ pand m2, m8, m5
+ sar r4, 6
+ psubw m1, m9, m2
+ psllw m2, 8
+ movu m0, [tlq+r4-8*8]
+ por m2, m1
+ movu m1, [tlq+r4-8*7]
+ pshufb m0, m3
+ pmaddubsw m0, m2
+ pshufb m1, m3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ sub rsp, 64
+ packuswb m0, m1
+ mova [rsp+16*0], m0
+ movu m0, [tlq+r4-8*6]
+ movu m1, [tlq+r4-8*5]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*1], m0
+ movu m0, [tlq+r4-8*4]
+ movu m1, [tlq+r4-8*3]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ mova [rsp+16*2], m0
+ movu m0, [tlq+r4-8*2]
+ movu m1, [tlq+r4-8*1]
+ pshufb m0, m3
+ pshufb m1, m3
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ psrlw m2, m5, 6
+ paddw m5, m6
+ packsswb m2, m2
+ packuswb m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ pand m0, m1
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*3], m0
+ pand m0, m1, [rsp+16*2]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ paddsb m2, [base+pb_16]
+ mova [rsp+16*2], m0
+ pand m0, m1, [rsp+16*1]
+ pandn m1, m7
+ por m0, m1
+ pcmpgtb m1, m4, m2
+ mova [rsp+16*1], m0
+ pand m0, m1, [rsp+16*0]
+ pandn m1, m7
+ por m0, m1
+ mova [rsp+16*0], m0
+ dec wd
+ jz .h64_transpose
+ add r5, dyq
+ jg .h64_loop
+.h64_end_loop:
+ sub rsp, 64
+ mova [rsp+16*3], m7
+ mova [rsp+16*2], m7
+ mova [rsp+16*1], m7
+ mova [rsp+16*0], m7
+ dec wd
+ jg .h64_end_loop
+.h64_transpose:
+ or r3d, 64
+.end_transpose_main:
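+; The prediction was written to a temporary stack buffer with r3d (the block
+; height) as its row stride; the loops below transpose it into dst in 8x8 tiles.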
+%if ARCH_X86_64
+ lea r5, [r3*3]
+ lea r7, [strideq*3]
+%else
+ mov strideq, [dstq]
+ mov org_wd, [dstq+strideq]
+%endif
+.end_transpose_loop:
+ lea r4, [rsp+r3-8]
+ lea r6, [dstq+org_wq-8]
+.end_transpose_loop_y:
+ movq m0, [r4+r3*1]
+ movq m4, [r4+r3*0]
+%if ARCH_X86_64
+ movq m1, [r4+r5 ]
+ movq m5, [r4+r3*2]
+ lea r2, [r4+r3*4]
+%else
+ lea r2, [r4+r3*2]
+ movq m1, [r2+r3*1]
+ movq m5, [r2+r3*0]
+ lea r2, [r2+r3*2]
+%endif
+ movq m2, [r2+r3*1]
+ movq m6, [r2+r3*0]
+%if ARCH_X86_64
+ movq m3, [r2+r5 ]
+ movq m7, [r2+r3*2]
+%else
+ lea r2, [r2+r3*2]
+ movq m3, [r2+r3*1]
+ movq m7, [r2+r3*0]
+%endif
+ sub r4, 8
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ punpckhwd m4, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m3, m2
+ punpcklwd m3, m2
+ punpckhdq m2, m3, m1
+ punpckldq m3, m1
+ punpckldq m1, m0, m4
+ punpckhdq m0, m4
+ movhps [r6+strideq*0], m0
+ movq [r6+strideq*1], m0
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m1
+ movq [r6+r7 ], m1
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m1
+ movq [r6+strideq*1], m1
+ lea r6, [r6+strideq*2]
+%endif
+ movhps [r6+strideq*0], m2
+ movq [r6+strideq*1], m2
+%if ARCH_X86_64
+ movhps [r6+strideq*2], m3
+ movq [r6+r7 ], m3
+ lea r6, [r6+strideq*4]
+%else
+ lea r6, [r6+strideq*2]
+ movhps [r6+strideq*0], m3
+ movq [r6+strideq*1], m3
+ lea r6, [r6+strideq*2]
+%endif
+ cmp r4, rsp
+ jae .end_transpose_loop_y
+ lea rsp, [rsp+r3*8]
+ sub org_wd, 8
+ jg .end_transpose_loop
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+; const uint8_t *idx, const int w, const int h);
+;---------------------------------------------------------------------------------------
+cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
+ mova m4, [palq]
+ LEA r2, pal_pred_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r2+wq*4]
+ packuswb m4, m4
+ add wq, r2
+ lea r2, [strideq*3]
+ jmp wq
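+; m4 holds the 8 palette entries packed down to bytes (duplicated in both
+; halves by packuswb), so each pshufb below maps 16 index bytes (values 0-7)
+; directly to 16 output pixels.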
+.w4:
+ pshufb m0, m4, [idxq]
+ add idxq, 16
+ movd [dstq ], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq ], m1
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0
+ psrlq m0, 32
+ movd [dstq+r2 ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ add idxq, 32
+ movq [dstq ], m0
+ movhps [dstq+strideq ], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+strideq ], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r2 ], m3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32
+ RET
+ALIGN function_align
+.w64:
+ pshufb m0, m4, [idxq]
+ pshufb m1, m4, [idxq+16]
+ pshufb m2, m4, [idxq+32]
+ pshufb m3, m4, [idxq+48]
+ add idxq, 64
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, strideq
+ sub hd, 1
+ jg .w64
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+%macro IPRED_CFL 1 ; ac in, unpacked pixels out
+ psignw m3, m%1, m1
+ pabsw m%1, m%1
+ pmulhrsw m%1, m2
+ psignw m%1, m3
+ paddw m%1, m0
+%endmacro
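+; m0 = dc, m1 = alpha and m2 = |alpha| << 9 are set up by the callers, so the
+; pmulhrsw above yields (|ac| * |alpha| + 32) >> 6 and the psignw pair restores
+; the sign of alpha * ac, i.e. dc + apply_sign((|alpha * ac| + 32) >> 6, alpha * ac).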
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ movifnidn wd, wm
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea t0d, [wq+hq]
+ movd m4, t0d
+ tzcnt t0d, t0d
+ movd m5, t0d
+ LEA t0, ipred_cfl_ssse3_table
+ tzcnt wd, wd
+ movsxd r6, [t0+r6*4]
+ movsxd wq, [t0+wq*4+16]
+ pcmpeqd m3, m3
+ psrlw m4, 1
+ add r6, t0
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
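+; m3 is all ones (-1 per byte/word): pmaddubsw against it produces negated
+; pixel-pair sums, and the later pmaddwd by -1 flips the sign back while doing
+; the final horizontal add, leaving sum(top) + sum(left) + ((w+h) >> 1) for the
+; dc shift/multiply below.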
+.h4:
+ movd m0, [tlq-4]
+ pmaddubsw m0, m3
+ jmp wq
+.w4:
+ movd m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m0, m4
+ paddw m0, m1
+ pmaddwd m0, m3
+ cmp hd, 4
+ jg .w4_mul
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq m1, m0, m0
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ psrlw m0, 2
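+; 0x5556 and 0x3334 are ceil(2^16/3) and ceil(2^16/5): together with the
+; power-of-two shift above they divide the dc sum by w+h (12 or 20).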
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8
+ cmovz r6d, r2d
+ movd m5, r6d
+ pmulhuw m0, m5
+.w4_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s4:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s4_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movd [dstq+strideq*0], m4
+ pshuflw m4, m4, q1032
+ movd [dstq+strideq*1], m4
+ punpckhqdq m4, m4
+ movd [dstq+strideq*2], m4
+ psrlq m4, 32
+ movd [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .s4_loop
+ RET
+ALIGN function_align
+.h8:
+ movq m0, [tlq-8]
+ pmaddubsw m0, m3
+ jmp wq
+.w8:
+ movq m1, [tlq+1]
+ pmaddubsw m1, m3
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ paddw m0, m1
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w8_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s8:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ lea r6, [strideq*3]
+ pabsw m2, m1
+ psllw m2, 9
+.s8_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq ], m4
+ movhps [dstq+strideq ], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ movq [dstq+strideq*2], m4
+ movhps [dstq+r6 ], m4
+ lea dstq, [dstq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .s8_loop
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-16]
+ pmaddubsw m0, m3
+ jmp wq
+.w16:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 8|32
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w16_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s16:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s16_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+strideq], m4
+ lea dstq, [dstq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .s16_loop
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-32]
+ pmaddubsw m0, m3
+ mova m2, [tlq-16]
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ movu m1, [tlq+1]
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1
+ psubw m4, m0
+ punpckhqdq m0, m0
+ psubw m0, m4
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+ pmaddwd m0, m3
+ psrlw m0, m5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x5556
+ mov r2d, 0x3334
+ test hd, 64|16
+ cmovz r6d, r2d
+ movd m1, r6d
+ pmulhuw m0, m1
+.w32_end:
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+.s32:
+ movd m1, alpham
+ pshuflw m1, m1, q0000
+ punpcklqdq m1, m1
+ pabsw m2, m1
+ psllw m2, 9
+.s32_loop:
+ mova m4, [acq]
+ mova m5, [acq+16]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq], m4
+ mova m4, [acq+32]
+ mova m5, [acq+48]
+ IPRED_CFL 4
+ IPRED_CFL 5
+ packuswb m4, m5
+ mova [dstq+16], m4
+ add dstq, strideq
+ add acq, 64
+ dec hd
+ jg .s32_loop
+ RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd
+ sub tlq, hq
+ tzcnt wd, wm
+ movu m0, [tlq]
+ mov t0d, 0x8000
+ movd m3, t0d
+ movd m2, r6d
+ psrld m3, m2
+ LEA t0, ipred_cfl_left_ssse3_table
+ movsxd r6, [t0+r6*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1
+.h16:
+ pshufd m1, m0, q3232 ; psrlq m1, m0, 16
+ paddw m0, m1
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
+ paddw m0, m1
+.h4:
+ pmaddwd m0, m2
+ pmulhrsw m0, m3
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ LEA t0, ipred_cfl_left_ssse3_table
+ tzcnt wd, wm
+ inc tlq
+ movu m0, [tlq]
+ movifnidn hd, hm
+ mov r6d, 0x8000
+ movd m3, r6d
+ movd m2, wd
+ psrld m3, m2
+ movsxd r6, [t0+wq*4]
+ pcmpeqd m2, m2
+ pmaddubsw m0, m2
+ add r6, t0
+ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
+ movsxd wq, [t0+wq*4]
+ add wq, t0
+ movifnidn acq, acmp
+ jmp r6
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int16_t *ac, const int alpha);
+;---------------------------------------------------------------------------------------
+cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
+ tzcnt wd, wm
+ movifnidn hd, hm
+ LEA r6, ipred_cfl_splat_ssse3_table
+ movsxd wq, [r6+wq*4]
+ movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]
+ add wq, r6
+ movifnidn acq, acmp
+ jmp wq
+
+%macro RELOAD_ACQ_32 1
+ mov acq, ac_bakq ; restore acq
+%endmacro
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+DECLARE_REG_TMP 7
+ movddup m2, [pb_2]
+%else
+cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+DECLARE_REG_TMP 4
+%define ac_bakq acmp
+ mov t0d, 0x02020202
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m5, t0d
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
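+; 4:2:0: pmaddubsw with the pb_2 constant doubles and pair-sums each luma row,
+; and adding two rows gives 2 * (sum of each 2x2 block); m4 accumulates the
+; grand total for the mean that is subtracted at the end.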
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq m0, [yq]
+ movq m1, [yq+strideq]
+ movhps m0, [yq+strideq*2]
+ movhps m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 16
+ sub hd, 2
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4_8
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m0, [yq+strideq*2]
+ mova m1, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 2
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+ jmp .w8_hpad
+.w8_wpad: ; wpadd=1
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333
+ mova [acq], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 16
+ sub hd, 1
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_4_8
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 1
+ jg .w8_hpad
+ jmp .calc_avg_4_8
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ mova m6, [yq+16]
+ mova m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movddup m0, [yq]
+ movddup m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ pshufhw m0, m0, q3333
+ mova [acq], m0
+ paddw m4, m0
+ mova m6, m0
+ punpckhqdq m6, m0, m0
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ pshufhw m6, m0, q3333
+ punpckhqdq m6, m6
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ paddw m0, m1
+ mova [acq], m0
+ paddw m4, m0
+ movddup m6, [yq+16]
+ movddup m1, [yq+strideq+16]
+ pmaddubsw m6, m2
+ pmaddubsw m1, m2
+ paddw m6, m1
+ pshufhw m6, m6, q3333
+ mova [acq+16], m6
+ paddw m4, m6
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ dec hd
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg16
+.w16_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m6
+ paddw m4, m6
+ add acq, 32
+ dec hpadd
+ jg .w16_hpad_loop
+ jmp .calc_avg16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
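+; The averaging code below reduces the accumulated word sums to one dword
+; (pmaddwd by 1 for w4/w8, unpack-and-add for w16), adds w*h/2 (from m5),
+; shifts by log2(w*h) and subtracts the resulting mean from every ac entry,
+; leaving a zero-mean block.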
+.calc_avg_4_8:
+ psrlw m2, 9
+ pmaddwd m4, m2
+ jmp .calc_avg
+.calc_avg16:
+ psrld m0, m4, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m4, m0
+.calc_avg:
+ movd szd, m5
+ psrad m5, 1
+ tzcnt r1d, szd
+ paddd m4, m5
+ movd m1, r1d
+ pshufd m0, m4, q2301
+ paddd m0, m4
+ pshufd m4, m0, q1032
+ paddd m0, m4
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0 ; ac[x] -= sum;
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
+ mov t0d, 0x04040404
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m6, t0d
+ movifnidn hpadd, hpadm
+%if ARCH_X86_64
+ mov ac_bakq, acq
+%endif
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m4, m4
+ pxor m5, m5
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
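+; 4:2:2: each output is 4 * (sum of a horizontal luma pair), via pmaddubsw with
+; the pb_4 constant; the running totals are split between m4 and m5 and merged
+; again in the averaging code.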
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movq m1, [yq]
+ movhps m1, [yq+strideq]
+ movq m0, [yq+strideq*2]
+ movhps m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m4, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+ jmp .calc_avg_4
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ mova m1, [yq]
+ mova m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m0
+ paddw m5, m1
+ mova m1, [yq+strideq*2]
+ mova m0, [yq+stride3q]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+strideq]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:
+ mova [acq], m0
+ paddw m4, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m1, [yq]
+ mova m0, [yq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ mova m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m0
+ paddw m4, m1
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movddup m1, [yq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movddup m1, [yq+strideq]
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m1, [yq]
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movddup m0, [yq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m1, [yq+strideq]
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movddup m0, [yq+strideq+16]
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+ jmp .calc_avg_8_16
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+.calc_avg_4:
+ psrlw m2, 10
+ pmaddwd m5, m2
+ pmaddwd m0, m4, m2
+ jmp .calc_avg
+.calc_avg_8_16:
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+.calc_avg:
+ paddd m5, m0
+ movd szd, m6
+ psrad m6, 1
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+%if ARCH_X86_64
+cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak
+ movddup m2, [pb_4]
+%else
+cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h
+%define ac_bakq [rsp+16*4]
+ mov t0d, 0x04040404
+ movd m2, t0d
+ pshufd m2, m2, q0000
+%endif
+ movifnidn wd, wm
+ movifnidn hpadd, hpadm
+ movd m0, hpadd
+ mov t0d, hm
+ mov hd, t0d
+ imul t0d, wd
+ movd m6, t0d
+ movd hpadd, m0
+ mov ac_bakq, acq
+ shl hpadd, 2
+ sub hd, hpadd
+ pxor m5, m5
+ pxor m4, m4
+ cmp wd, 16
+ jg .w32
+ cmp wd, 8
+ jg .w16
+ je .w8
+ ; fall-through
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
+%endif
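+; 4:4:4: each luma pixel is duplicated (punpcklbw x, x) and multiplied by 4,
+; i.e. ac[x] = 8 * y[x]; totals again accumulate in m4/m5 (and on the stack for
+; the w32 case) for the final mean subtraction.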
+.w4:
+ lea stride3q, [strideq*3]
+.w4_loop:
+ movd m1, [yq]
+ movd m3, [yq+strideq]
+ punpckldq m1, m3
+ punpcklbw m1, m1
+ movd m0, [yq+strideq*2]
+ movd m3, [yq+stride3q]
+ punpckldq m0, m3
+ punpcklbw m0, m0
+ pmaddubsw m1, m2
+ pmaddubsw m0, m2
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m0
+ paddw m5, m1
+ lea yq, [yq+strideq*4]
+ add acq, 32
+ sub hd, 4
+ jg .w4_loop
+ test hpadd, hpadd
+ jz .calc_avg_4
+ punpckhqdq m0, m0
+.w4_hpad_loop:
+ mova [acq], m0
+ paddw m5, m0
+ add acq, 16
+ sub hpadd, 2
+ jg .w4_hpad_loop
+.calc_avg_4:
+ psrlw m2, 10
+ pmaddwd m5, m2
+ jmp .calc_avg
+
+.w8:
+ lea stride3q, [strideq*3]
+ test wpadd, wpadd
+ jnz .w8_wpad
+.w8_loop:
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ movq m0, [yq+strideq]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ movq m1, [yq+strideq*2]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ movq m0, [yq+stride3q]
+ punpcklbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*4]
+ add acq, 64
+ sub hd, 4
+ jg .w8_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w8_hpad
+.w8_wpad:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pmaddubsw m1, m2
+ pshufhw m1, m1, q3333
+ mova [acq], m1
+ paddw m5, m1
+ movd m0, [yq+strideq]
+ punpcklbw m0, m0
+ punpcklqdq m0, m0
+ pmaddubsw m0, m2
+ pshufhw m0, m0, q3333
+ mova [acq+16], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 32
+ sub hd, 2
+ jg .w8_wpad
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w8_hpad:
+ mova [acq], m0
+ paddw m5, m0
+ mova [acq+16], m0
+ paddw m4, m0
+ add acq, 32
+ sub hpadd, 2
+ jg .w8_hpad
+ jmp .calc_avg_8_16
+
+.w16:
+ test wpadd, wpadd
+ jnz .w16_wpad
+.w16_loop:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_loop
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+ jmp .w16_hpad_loop
+.w16_wpad:
+ cmp wpadd, 2
+ jl .w16_pad1
+ je .w16_pad2
+.w16_pad3:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movd m1, [yq+strideq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhqdq m0, m1, m1
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad3
+ jmp .w16_wpad_done
+.w16_pad2:
+ movq m1, [yq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ pshufhw m1, m1, q3333
+ punpckhqdq m1, m1
+ mova [acq+16], m1
+ paddw m5, m1
+ movq m1, [yq+strideq]
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ mova m0, m1
+ pshufhw m0, m0, q3333
+ punpckhqdq m0, m0
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad2
+ jmp .w16_wpad_done
+.w16_pad1:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0
+ mova m0, [yq+strideq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq+32], m1
+ paddw m4, m1
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+48], m0
+ paddw m4, m0
+ lea yq, [yq+strideq*2]
+ add acq, 64
+ sub hd, 2
+ jg .w16_pad1
+.w16_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_8_16
+.w16_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m4, m1
+ paddw m5, m0
+ mova [acq+32], m1
+ mova [acq+48], m0
+ paddw m4, m1
+ paddw m5, m0
+ add acq, 64
+ sub hpadd, 2
+ jg .w16_hpad_loop
+.calc_avg_8_16:
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, m4
+ psrld m0, 16
+ pslld m4, 16
+ psrld m4, 16
+ paddd m0, m4
+ paddd m5, m0
+ jmp .calc_avg
+
+.w32:
+ pxor m0, m0
+ mova [rsp ], m0
+ mova [rsp+16], m0
+ mova [rsp+32], m0
+ mova [rsp+48], m0
+ test wpadd, wpadd
+ jnz .w32_wpad
+.w32_loop:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_loop
+ test hpadd, hpadd
+ jz .calc_avg_32
+ jmp .w32_hpad_loop
+.w32_wpad:
+ cmp wpadd, 2
+ jl .w32_pad1
+ je .w32_pad2
+ cmp wpadd, 4
+ jl .w32_pad3
+ je .w32_pad4
+ cmp wpadd, 6
+ jl .w32_pad5
+ je .w32_pad6
+.w32_pad7:
+ movd m1, [yq]
+ punpcklbw m1, m1
+ punpcklqdq m1, m1
+ pshufhw m1, m1, q3333
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ mova m0, m1
+ punpckhqdq m0, m0
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad7
+ jmp .w32_wpad_done
+.w32_pad6:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ pshufhw m0, m1, q3333
+ punpckhqdq m0, m0
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad6
+ jmp .w32_wpad_done
+.w32_pad5:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ mova m5, [rsp]
+ paddw m5, m1
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ punpcklqdq m0, m0
+ pshufhw m0, m0, q3333
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad5
+ jmp .w32_wpad_done
+.w32_pad4:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, m0
+ pshufhw m3, m3, q3333
+ punpckhqdq m3, m3
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad4
+ jmp .w32_wpad_done
+.w32_pad3:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ movd m3, [yq+16]
+ punpcklbw m3, m3
+ punpcklqdq m3, m3
+ pshufhw m3, m3, q3333
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ mova m4, m3
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad3
+ jmp .w32_wpad_done
+.w32_pad2:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m3, [yq+16]
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ pshufhw m4, m3, q3333
+ punpckhqdq m4, m4
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad2
+ jmp .w32_wpad_done
+.w32_pad1:
+ mova m0, [yq]
+ mova m1, m0
+ punpcklbw m1, m1
+ pmaddubsw m1, m2
+ mova [acq], m1
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ punpckhbw m0, m0
+ pmaddubsw m0, m2
+ mova [acq+16], m0
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova m4, [yq+16]
+ mova m3, m4
+ punpcklbw m3, m3
+ pmaddubsw m3, m2
+ mova [acq+32], m3
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ punpckhbw m4, m4
+ punpcklqdq m4, m4
+ pshufhw m4, m4, q3333
+ pmaddubsw m4, m2
+ mova [acq+48], m4
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ lea yq, [yq+strideq]
+ add acq, 64
+ sub hd, 1
+ jg .w32_pad1
+.w32_wpad_done:
+ test hpadd, hpadd
+ jz .calc_avg_32
+.w32_hpad_loop:
+ mova [acq], m1
+ mova [acq+16], m0
+ paddw m5, m1, [rsp]
+ mova [rsp ], m5
+ paddw m5, m0, [rsp+16]
+ mova [rsp+16], m5
+ mova [acq+32], m3
+ mova [acq+48], m4
+ paddw m5, m3, [rsp+32]
+ mova [rsp+32], m5
+ paddw m5, m4, [rsp+48]
+ mova [rsp+48], m5
+ add acq, 64
+ sub hpadd, 1
+ jg .w32_hpad_loop
+
+%if ARCH_X86_64
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak
+%else
+ DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h
+%endif
+
+.calc_avg_32:
+ mova m5, [rsp]
+ mova m0, m5
+ psrld m5, 16
+ pslld m0, 16
+ psrld m0, 16
+ paddd m5, m0
+ mova m0, [rsp+16]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ paddd m5, m0
+ mova m0, [rsp+32]
+ mova m3, m0
+ psrld m0, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m0, m3
+ mova m1, [rsp+48]
+ mova m3, m1
+ psrld m1, 16
+ pslld m3, 16
+ psrld m3, 16
+ paddd m1, m3
+ paddd m1, m0
+ paddd m5, m1
+.calc_avg:
+ movd szd, m6
+ psrad m6, 1
+ tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height);
+ paddd m5, m6
+ movd m1, r1d
+ pshufd m0, m5, q2301
+ paddd m0, m5
+ pshufd m5, m0, q1032
+ paddd m0, m5
+ psrad m0, m1 ; sum >>= log2sz;
+ packssdw m0, m0
+ RELOAD_ACQ_32 acq ; ac = ac_orig
+.sub_loop:
+ mova m1, [acq]
+ psubw m1, m0
+ mova [acq], m1
+ add acq, 16
+ sub szd, 8
+ jg .sub_loop
+ RET
+
+; %1 simd register that holds the mask and will hold the result
+; %2 simd register that holds the "true" values
+; %3 location of the "false" values (simd register/memory)
+%macro BLEND 3 ; mask, true, false
+ pand %2, %1
+ pandn %1, %3
+ por %1, %2
+%endmacro
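+; i.e. the result is (true & mask) | (false & ~mask); note that %2 is
+; clobbered in the process.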
+
+%macro PAETH 2 ; top, ldiff
+ pavgb m1, m%1, m3
+ pxor m0, m%1, m3
+ pand m0, m4
+ psubusb m2, m5, m1
+ psubb m1, m0
+ psubusb m1, m5
+ por m1, m2
+ paddusb m1, m1
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3
+ psubusb m0, m3, m5
+ por m2, m0 ; tdiff
+%ifnum %2
+ pminub m2, m%2
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+%else
+ mova m0, %2
+ pminub m2, m0
+ pcmpeqb m0, m2
+%endif
+ pminub m1, m2
+ pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff
+ mova m2, m3
+ BLEND m0, m2, m%1
+ BLEND m1, m0, m5
+%endmacro
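+; Paeth prediction: with base = left + top - topleft, each output byte is
+; whichever of left (m3), top (m%1) or topleft (m5) lies closest to base,
+; with ties going to left, then top.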
+
+cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h
+%define base r5-ipred_paeth_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ pxor m0, m0
+ movd m5, [tlq]
+ pshufb m5, m0
+ LEA r5, ipred_paeth_ssse3_table
+ movsxd wq, [r5+wq*4]
+ movddup m4, [base+ipred_paeth_shuf]
+ add wq, r5
+ jmp wq
+.w4:
+ movd m6, [tlq+1] ; top
+ pshufd m6, m6, q0000
+ lea r3, [strideq*3]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0 ; ldiff
+.w4_loop:
+ sub tlq, 4
+ movd m3, [tlq]
+ mova m1, [base+ipred_h_shuf]
+ pshufb m3, m1 ; left
+ PAETH 6, 7
+ movd [dstq ], m1
+ pshuflw m0, m1, q1032
+ movd [dstq+strideq ], m0
+ punpckhqdq m1, m1
+ movd [dstq+strideq*2], m1
+ psrlq m1, 32
+ movd [dstq+r3 ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ movddup m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w8_loop:
+ sub tlq, 2
+ movd m3, [tlq]
+ pshufb m3, [base+ipred_paeth_shuf]
+ PAETH 6, 7
+ movq [dstq ], m1
+ movhps [dstq+strideq], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+.w16_loop:
+ sub tlq, 1
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ PAETH 6, 7
+ mova [dstq], m1
+ add dstq, strideq
+ sub hd, 1
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+.w32_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, 7
+ mova [dstq+16], m1
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64:
+ movu m6, [tlq+1]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp ], m6
+ mova [rsp+16], m7
+ movu m6, [tlq+17]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+32], m6
+ mova [rsp+48], m7
+ movu m6, [tlq+33]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+64], m6
+ mova [rsp+80], m7
+ movu m6, [tlq+49]
+ psubusb m7, m5, m6
+ psubusb m0, m6, m5
+ por m7, m0
+ mova [rsp+96], m6
+.w64_loop:
+ dec tlq
+ movd m3, [tlq]
+ pxor m1, m1
+ pshufb m3, m1
+ mova m6, [rsp]
+ PAETH 6, [rsp+16]
+ mova [dstq ], m1
+ mova m6, [rsp+32]
+ PAETH 6, [rsp+48]
+ mova [dstq+16], m1
+ mova m6, [rsp+64]
+ PAETH 6, [rsp+80]
+ mova [dstq+32], m1
+ mova m6, [rsp+96]
+ PAETH 6, 7
+ mova [dstq+48], m1
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+
+
+%macro FILTER 4 ;dst, src, tmp, shuf
+%ifnum %4
+ pshufb m%2, m%4
+%else
+ pshufb m%2, %4
+%endif
+ pshufd m%1, m%2, q0000 ;p0 p1
+ pmaddubsw m%1, m2
+ pshufd m%3, m%2, q1111 ;p2 p3
+ pmaddubsw m%3, m3
+ paddw m%1, [base+pw_8]
+ paddw m%1, m%3
+ pshufd m%3, m%2, q2222 ;p4 p5
+ pmaddubsw m%3, m4
+ paddw m%1, m%3
+ pshufd m%3, m%2, q3333 ;p6 __
+ pmaddubsw m%3, m5
+ paddw m%1, m%3
+ psraw m%1, 4
+ packuswb m%1, m%1
+%endmacro
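+; i.e. each output byte is clip_u8((t0*p0 + t1*p1 + ... + t6*p6 + 8) >> 4),
+; with the seven taps t0-t6 preloaded from filter_intra_taps into m2-m5.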
+
+cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter
+%define base r6-$$
+ LEA r6, $$
+ tzcnt wd, wm
+%ifidn filterd, filterm
+ movzx filterd, filterb
+%else
+ movzx filterd, byte filterm
+%endif
+ shl filterd, 6
+ lea filterq, [base+filter_intra_taps+filterq]
+ movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4
+ movsxd wq, [base+ipred_filter_ssse3_table+wq*4]
+ mova m2, [filterq+16*0]
+ mova m3, [filterq+16*1]
+ mova m4, [filterq+16*2]
+ mova m5, [filterq+16*3]
+ lea wq, [base+ipred_filter_ssse3_table+wq]
+ mov hd, hm
+ jmp wq
+.w4:
+ mova m1, [base+filter_shuf1]
+ sub tlq, 3
+ sub tlq, hq
+ jmp .w4_loop_start
+.w4_loop:
+ movd m0, [tlq+hq]
+ punpckldq m0, m6
+ lea dstq, [dstq+strideq*2]
+.w4_loop_start:
+ FILTER 6, 0, 7, 1
+ movd [dstq+strideq*0], m6
+ pshuflw m6, m6, q1032
+ movd [dstq+strideq*1], m6
+ sub hd, 2
+ jg .w4_loop
+ RET
+
+ALIGN function_align
+.w8:
+ movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4
+ sub tlq, 5
+ sub tlq, hq
+
+.w8_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ FILTER 0, 6, 1, [base+filter_shuf2]
+
+ punpckldq m6, m7, m0
+ movq [dstq+strideq*0], m6
+ punpckhqdq m6, m6
+ movq [dstq+strideq*1], m6
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+
+ALIGN function_align
+.w16:
+ movu m6, [tlq+1] ;top row
+ sub tlq, 5
+ sub tlq, hq
+
+.w16_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6
+
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+
+ALIGN function_align
+.w32:
+ movu m6, [tlq+1] ;top row
+ lea filterq, [tlq+17]
+ sub tlq, 5
+ sub tlq, hq
+
+.w32_loop:
+ FILTER 7, 0, 1, [base+filter_shuf1]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+4+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+8+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movu m1, [filterq]
+ punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
+ punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+12+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+strideq*1], m6
+
+ mova m6, m1
+
+ FILTER 7, 0, 6, [base+filter_shuf2]
+ punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+16+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m1, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+20+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+
+ FILTER 7, 0, 1, [base+filter_shuf2]
+ punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+ movd [dstq+24+strideq*0], m7
+ psrlq m7, 32
+ palignr m7, m6, 4
+
+ FILTER 6, 0, 1, [base+filter_shuf2]
+ movd [dstq+28+strideq*0], m6
+ psrlq m6, 32
+ palignr m6, m7, 4
+ mova [dstq+16+strideq*1], m6
+
+ mova m6, [dstq+strideq*1]
+ movd m0, [tlq+hq] ;_ 6 5 0
+ punpckldq m0, m6 ;_ 6 5 0 1 2 3 4
+ lea filterq, [dstq+16+strideq*1]
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h
new file mode 100644
index 0000000000..478eb6c6b6
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx.h
@@ -0,0 +1,363 @@
+/*
+ * Copyright © 2018-2023, VideoLAN and dav1d authors
+ * Copyright © 2018-2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
+decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx2_fns ( 8, 32, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
+decl_itx12_fns(16, 16, ext); \
+decl_itx2_fns (16, 32, ext); \
+decl_itx2_fns (32, 8, ext); \
+decl_itx2_fns (32, 16, ext); \
+decl_itx2_fns (32, 32, ext); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
+
+
+#define decl_itx2_bpc_fns(w, h, bpc, opt) \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt))
+
+#define decl_itx12_bpc_fns(w, h, bpc, opt) \
+decl_itx2_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt))
+
+#define decl_itx16_bpc_fns(w, h, bpc, opt) \
+decl_itx12_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt))
+
+#define decl_itx_bpc_fns(bpc, ext) \
+decl_itx16_bpc_fns( 4, 4, bpc, ext); \
+decl_itx16_bpc_fns( 4, 8, bpc, ext); \
+decl_itx16_bpc_fns( 4, 16, bpc, ext); \
+decl_itx16_bpc_fns( 8, 4, bpc, ext); \
+decl_itx16_bpc_fns( 8, 8, bpc, ext); \
+decl_itx16_bpc_fns( 8, 16, bpc, ext); \
+decl_itx2_bpc_fns ( 8, 32, bpc, ext); \
+decl_itx16_bpc_fns(16, 4, bpc, ext); \
+decl_itx16_bpc_fns(16, 8, bpc, ext); \
+decl_itx12_bpc_fns(16, 16, bpc, ext); \
+decl_itx2_bpc_fns (16, 32, bpc, ext); \
+decl_itx2_bpc_fns (32, 8, bpc, ext); \
+decl_itx2_bpc_fns (32, 16, bpc, ext); \
+decl_itx2_bpc_fns (32, 32, bpc, ext); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
+
+decl_itx_fns(avx512icl);
+decl_itx_bpc_fns(10, avx512icl);
+decl_itx_fns(avx2);
+decl_itx_bpc_fns(10, avx2);
+decl_itx_bpc_fns(12, avx2);
+decl_itx_fns(sse4);
+decl_itx_fns(ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
+
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+
+#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
+
+#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext)
+
+#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext)
+
+#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext)
+
+#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext)
+
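+    // For example, in an 8bpc build assign_itx1_fn(R, 16, 64, ssse3) wires up
+    // c->itxfm_add[RTX_16X64][DCT_DCT] with
+    // dav1d_inv_txfm_add_dct_dct_16x64_8bpc_ssse3 (BF() adds the bitdepth/ISA suffix).
+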
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn(, 4, 4, ssse3);
+ assign_itx16_fn(R, 4, 8, ssse3);
+ assign_itx16_fn(R, 8, 4, ssse3);
+ assign_itx16_fn(, 8, 8, ssse3);
+ assign_itx16_fn(R, 4, 16, ssse3);
+ assign_itx16_fn(R, 16, 4, ssse3);
+ assign_itx16_fn(R, 8, 16, ssse3);
+ assign_itx16_fn(R, 16, 8, ssse3);
+ assign_itx12_fn(, 16, 16, ssse3);
+ assign_itx2_fn (R, 8, 32, ssse3);
+ assign_itx2_fn (R, 32, 8, ssse3);
+ assign_itx2_fn (R, 16, 32, ssse3);
+ assign_itx2_fn (R, 32, 16, ssse3);
+ assign_itx2_fn (, 32, 32, ssse3);
+ assign_itx1_fn (R, 16, 64, ssse3);
+ assign_itx1_fn (R, 32, 64, ssse3);
+ assign_itx1_fn (R, 64, 16, ssse3);
+ assign_itx1_fn (R, 64, 32, ssse3);
+ assign_itx1_fn ( , 64, 64, ssse3);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+#if BITDEPTH == 16
+ if (bpc == 10) {
+ assign_itx16_fn(, 4, 4, sse4);
+ assign_itx16_fn(R, 4, 8, sse4);
+ assign_itx16_fn(R, 4, 16, sse4);
+ assign_itx16_fn(R, 8, 4, sse4);
+ assign_itx16_fn(, 8, 8, sse4);
+ assign_itx16_fn(R, 8, 16, sse4);
+ assign_itx16_fn(R, 16, 4, sse4);
+ assign_itx16_fn(R, 16, 8, sse4);
+ assign_itx12_fn(, 16, 16, sse4);
+ assign_itx2_fn (R, 8, 32, sse4);
+ assign_itx2_fn (R, 32, 8, sse4);
+ assign_itx2_fn (R, 16, 32, sse4);
+ assign_itx2_fn (R, 32, 16, sse4);
+ assign_itx2_fn (, 32, 32, sse4);
+ assign_itx1_fn (R, 16, 64, sse4);
+ assign_itx1_fn (R, 32, 64, sse4);
+ assign_itx1_fn (R, 64, 16, sse4);
+ assign_itx1_fn (R, 64, 32, sse4);
+ assign_itx1_fn (, 64, 64, sse4);
+ }
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx2);
+ assign_itx16_fn(R, 4, 8, avx2);
+ assign_itx16_fn(R, 4, 16, avx2);
+ assign_itx16_fn(R, 8, 4, avx2);
+ assign_itx16_fn( , 8, 8, avx2);
+ assign_itx16_fn(R, 8, 16, avx2);
+ assign_itx2_fn (R, 8, 32, avx2);
+ assign_itx16_fn(R, 16, 4, avx2);
+ assign_itx16_fn(R, 16, 8, avx2);
+ assign_itx12_fn( , 16, 16, avx2);
+ assign_itx2_fn (R, 16, 32, avx2);
+ assign_itx1_fn (R, 16, 64, avx2);
+ assign_itx2_fn (R, 32, 8, avx2);
+ assign_itx2_fn (R, 32, 16, avx2);
+ assign_itx2_fn ( , 32, 32, avx2);
+ assign_itx1_fn (R, 32, 64, avx2);
+ assign_itx1_fn (R, 64, 16, avx2);
+ assign_itx1_fn (R, 64, 32, avx2);
+ assign_itx1_fn ( , 64, 64, avx2);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 4, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 10, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
+ assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
+ } else {
+ assign_itx16_bpc_fn( , 4, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
+ assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2);
+ assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2);
+ assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2);
+ }
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx512icl); // no wht
+ assign_itx16_fn(R, 4, 8, avx512icl);
+ assign_itx16_fn(R, 4, 16, avx512icl);
+ assign_itx16_fn(R, 8, 4, avx512icl);
+ assign_itx16_fn( , 8, 8, avx512icl);
+ assign_itx16_fn(R, 8, 16, avx512icl);
+ assign_itx2_fn (R, 8, 32, avx512icl);
+ assign_itx16_fn(R, 16, 4, avx512icl);
+ assign_itx16_fn(R, 16, 8, avx512icl);
+ assign_itx12_fn( , 16, 16, avx512icl);
+ assign_itx2_fn (R, 16, 32, avx512icl);
+ assign_itx1_fn (R, 16, 64, avx512icl);
+ assign_itx2_fn (R, 32, 8, avx512icl);
+ assign_itx2_fn (R, 32, 16, avx512icl);
+ assign_itx2_fn ( , 32, 32, avx512icl);
+ assign_itx1_fn (R, 32, 64, avx512icl);
+ assign_itx1_fn (R, 64, 16, avx512icl);
+ assign_itx1_fn (R, 64, 32, avx512icl);
+ assign_itx1_fn ( , 64, 64, avx512icl);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 8, 8, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
+ }
+#endif
+#endif
+}
diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm
new file mode 100644
index 0000000000..2315ec1e47
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@@ -0,0 +1,8599 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
+ dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
+idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7
+idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5
+iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6
+iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
+pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
+idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
+idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+
+%macro COEF_PAIR 2-3 0
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3
+dd -%2, -%2
+%define pd_%2_m%2 pd_%2
+%endif
+%endmacro
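+; e.g. "COEF_PAIR 201, 995" emits "pd_201_995: dd 201, 201, 995, 995" and makes
+; pd_201/pd_995 point at its low/high halves; the optional third argument
+; appends a negated copy so that pd_%2_m%2 can be loaded as {%2, %2, -%2, -%2}.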
+
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1931
+COEF_PAIR 799, 3406
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 1
+COEF_PAIR 2896, 3784, 1
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4076, 3612
+COEF_PAIR 4091, 3973
+
+pd_8: dd 8
+pd_m601: dd -601
+pd_m1189: dd -1189
+pd_m1380: dd -1380
+pd_m2106: dd -2106
+pd_m2598: dd -2598
+pd_m2751: dd -2751
+pd_m3344: dd -3344
+pd_1024: dd 1024
+pd_1321: dd 1321
+pd_1448: dd 1448
+pd_1697: dd 1697
+pd_2482: dd 2482
+pd_3072: dd 3072 ; 1024 + 2048
+pd_3803: dd 3803
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+pd_6144: dd 6144 ; 2048 + 4096
+pd_17408: dd 17408 ; 1024 + 16384
+
+pixel_10bpc_max: times 2 dw 0x03ff
+pixel_12bpc_max: times 2 dw 0x0fff
+dconly_10bpc: times 2 dw 0x7c00
+dconly_12bpc: times 2 dw 0x7000
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+clip_20b_min: dd -0x80000
+clip_20b_max: dd 0x7ffff
+
+idct64_mul_16bpc:
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern deint_shuf
+cextern idct64_mul
+cextern pw_1697x8
+cextern pw_1697x16
+cextern pw_1567_3784
+cextern pw_m1567_m3784
+cextern pw_m3784_1567
+cextern pw_2896_2896
+cextern pw_m2896_2896
+cextern pw_5
+cextern pw_2048
+cextern pw_4096
+cextern pw_8192
+cextern pw_16384
+cextern pw_2896x8
+cextern pd_2048
+
+cextern idct_4x8_internal_8bpc_avx2.main
+cextern idct_4x16_internal_8bpc_avx2.main
+cextern idct_8x8_internal_8bpc_avx2.main
+cextern idct_8x16_internal_8bpc_avx2.main
+cextern idct_16x4_internal_8bpc_avx2.main
+cextern idct_16x8_internal_8bpc_avx2.main
+cextern idct_16x16_internal_8bpc_avx2.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
+
+cextern iadst_4x4_internal_8bpc_avx2.main
+cextern iadst_4x8_internal_8bpc_avx2.main_pass2
+cextern iadst_4x16_internal_8bpc_avx2.main2
+cextern iadst_8x4_internal_8bpc_avx2.main
+cextern iadst_8x8_internal_8bpc_avx2.main_pass2
+cextern iadst_8x16_internal_8bpc_avx2.main
+cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
+cextern iadst_16x4_internal_8bpc_avx2.main
+cextern iadst_16x8_internal_8bpc_avx2.main
+cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
+cextern iadst_16x16_internal_8bpc_avx2.main
+cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%macro WRAP_XMM 1+
+ INIT_XMM cpuname
+ %1
+ INIT_YMM cpuname
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+ ; m0 = in0 in2, m1 = in1 in3
+ psubd m2, m0, m1 ; t2
+ paddd xm0, xm1 ; t0
+ vpermq m2, m2, q3322
+ vpermq m0, m0, q1100
+ vpermq m1, m1, q3120
+ psubd m3, m0, m2
+ psrad m3, 1
+ psubd m3, m1 ; t1 t3
+ psubd m0, m3 ; ____ out0
+ paddd m2, m3 ; out3 ____
+%endmacro
+
+INIT_YMM avx2
+cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
+ mova xm0, [cq+16*0]
+ vinserti128 m0, [cq+16*2], 1
+ mova xm1, [cq+16*1]
+ vinserti128 m1, [cq+16*3], 1
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ lea r6, [dstq+strideq*2]
+ psrad m0, 2
+ psrad m1, 2
+ IWHT4_1D_PACKED
+ punpckhdq m0, m3
+ punpckldq m3, m2
+ punpckhqdq m1, m0, m3
+ punpcklqdq m0, m3
+ IWHT4_1D_PACKED
+ vpblendd m0, m2, 0x33
+ packssdw m0, m3
+ vextracti128 xm2, m0, 1
+ punpckhdq xm1, xm0, xm2 ; out2 out1
+ punpckldq xm0, xm2 ; out3 out0
+ movq xm2, [r6 +strideq*1]
+ movhps xm2, [dstq+strideq*0]
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [dstq+strideq*1]
+%ifidn bdmaxd, bdmaxm
+ movd xm5, bdmaxd
+ vpbroadcastw xm5, xm5
+%else ; win64: load from stack
+ vpbroadcastw xm5, bdmaxm
+%endif
+ paddsw xm0, xm2
+ paddsw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movq [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm0
+ RET
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = packed, 2 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %9 & 1
+ vbroadcasti128 m%3, [pd_%8]
+%else
+ vpbroadcastd m%3, [pd_%8]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %9 & 1
+ vbroadcasti128 m%5, [pd_%7]
+%else
+ vpbroadcastd m%5, [pd_%7]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+ psubd m%1, m%3
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
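+; e.g. with m5 holding pd_2048, "ITX_MULSUB_2D 0, 1, 2, 3, 4, 5, 1567, 3784"
+; computes m0 = (in0*1567 - in1*3784 + 2048) >> 12 and
+;          m1 = (in0*3784 + in1*1567 + 2048) >> 12, using m2-m4 as temporaries.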
+
+%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
+cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_%5bpc)
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
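+    ; dc-only fast path: 181 = round(2896/16) and 2896/4096 ~= 1/sqrt(2), the
+    ; transform's dc scale; the scaled and rounded dc is broadcast and added
+    ; to every output pixel below.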
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
+ add r6d, 128
+ sar r6d, 8
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm2
+ vpbroadcastw xm0, xm0
+.dconly_loop:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ paddsw xm1, xm0
+ psubusw xm1, xm2
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ WRAP_XMM RET
+%else
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
+ ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
+ punpckhqdq m%3, m%2, m%1 ; t3 t2
+ punpcklqdq m%2, m%1 ; t0 t1
+ paddd m%1, m%2, m%3 ; out0 out1
+ psubd m%2, m%3 ; out3 out2
+%endmacro
+
+%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
+ vpbroadcastd m%5, [pw_m3784_1567]
+ punpckhwd m%3, m%2, m%1
+ vpbroadcastd m%4, [pw_1567_3784]
+ punpcklwd m%2, m%1
+ vpbroadcastd m%1, [pw_m2896_2896]
+ pmaddwd m%5, m%3
+ pmaddwd m%3, m%4
+ vpbroadcastd m%4, [pw_2896_2896]
+ pmaddwd m%1, m%2
+ pmaddwd m%2, m%4
+ REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
+ REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
+ packssdw m%3, m%5 ; t3 t2
+ packssdw m%2, m%1 ; t0 t1
+ paddsw m%1, m%2, m%3 ; out0 out1
+ psubsw m%2, m%3 ; out3 out2
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call .main
+ vbroadcasti128 m2, [idct4_shuf]
+ packssdw m0, m1
+ pshufb m0, m2
+ jmp tx2q
+.pass2:
+ vextracti128 xm1, m0, 1
+ WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
+ packssdw xm5, xm5 ; pw_2048
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movq xm3, [r6 +strideq*1]
+ movhps xm3, [r6 +strideq*0]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movhps [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm1
+ RET
+ALIGN function_align
+.main:
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m5, [pd_2048]
+.main2:
+ IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5
+ ret
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+%macro IADST4_1D 0
+ vpbroadcastd m5, [pd_1321]
+ vpbroadcastd m7, [pd_2482]
+ pmulld m4, m0, m5 ; 1321*in0
+ pmulld m6, m3, m7 ; 2482*in3
+ paddd m4, m6 ; 1321*in0 + 2482*in3
+ pmulld m6, m0, m7 ; 2482*in0
+ paddd m0, m3 ; in0 + in3
+ paddd m7, m5 ; pd_3803
+ pmulld m5, m2 ; 1321*in2
+ pmulld m3, m7 ; 3803*in3
+ pmulld m7, m2 ; 3803*in2
+ psubd m2, m0 ; in2 - in0 - in3
+ vpbroadcastd m0, [pd_m3344]
+ pmulld m1, m0 ; -t3
+ pmulld m2, m0 ; out2 (unrounded)
+ psubd m6, m5 ; 2482*in0 - 1321*in2
+ paddd m4, m7 ; t0
+ psubd m6, m3 ; t1
+ paddd m3, m4, m6
+ psubd m4, m1 ; out0 (unrounded)
+ psubd m6, m1 ; out1 (unrounded)
+ paddd m3, m1 ; out3 (unrounded)
+%endmacro
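+; 4-point inverse ADST using the sinpi constants 1321, 2482, 3344 and 3803;
+; the outputs are left unrounded so callers can fold in their own rounding/shift.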
+
+cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call .main
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_2048]
+ mova m2, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ packssdw m0, m1
+ vpermd m0, m2, m0
+ psrld m2, 4
+ pshufb m0, m2
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ jmp tx2q
+.pass2:
+ lea r6, [deint_shuf+128]
+ vextracti128 xm1, m0, 1
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ vpbroadcastd xm4, [pw_2048]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [r6 +strideq*1]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pmulhrsw xm0, xm4
+ pmulhrsw xm1, xm4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ RET
+ALIGN function_align
+.main:
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+%if WIN64
+ movaps [rsp+16], xmm6
+ movaps [rsp+32], xmm7
+%endif
+.main2:
+ WRAP_XMM IADST4_1D
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_10bpc).pass1_end
+.pass2:
+ lea r6, [deint_shuf+128]
+ vextracti128 xm1, m0, 1
+ call m(iadst_4x4_internal_8bpc).main
+ vpbroadcastd xm4, [pw_2048]
+ movq xm3, [dstq+strideq*1]
+ movhps xm3, [dstq+strideq*0]
+ lea r6, [dstq+strideq*2]
+ movq xm2, [r6 +strideq*1]
+ movhps xm2, [r6 +strideq*0]
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pmulhrsw xm0, xm4
+ pmulhrsw xm1, xm4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm4
+ pmaxsw xm1, xm4
+ pminsw xm0, xm5
+ pminsw xm1, xm5
+ movhps [dstq+strideq*0], xm1
+ movq [dstq+strideq*1], xm1
+ movhps [r6 +strideq*0], xm0
+ movq [r6 +strideq*1], xm0
+ RET
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
+ vpbroadcastd m1, [pd_5793]
+ pmulld m0, m1, [cq+32*0]
+ pmulld m1, [cq+32*1]
+ vpbroadcastd m5, [pd_2048]
+ mova m3, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ packssdw m0, m1
+ vpermd m0, m3, m0
+ psrld m3, 4
+ pshufb m0, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m1, [pw_1697x8]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ pmulhrsw m1, m0
+ paddsw m0, m1
+ movq xm3, [r6 +strideq*0]
+ movhps xm3, [r6 +strideq*1]
+ vpbroadcastd xm4, [pixel_10bpc_max]
+ packssdw m5, m5 ; pw_2048
+ pmulhrsw m0, m5
+ pxor m5, m5
+ mova [cq+32*0], m5
+ mova [cq+32*1], m5
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm2
+ paddw xm1, xm3
+ pmaxsw xm0, xm5
+ pmaxsw xm1, xm5
+ pminsw xm0, xm4
+ pminsw xm1, xm4
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [r6 +strideq*0], xm1
+ movhps [r6 +strideq*1], xm1
+ RET
+
+INV_TXFM_4X4_FN dct, dct, 12
+INV_TXFM_4X4_FN dct, identity, 12
+INV_TXFM_4X4_FN dct, adst, 12
+INV_TXFM_4X4_FN dct, flipadst, 12
+
+cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(idct_4x4_internal_10bpc).main
+ mova m3, [idct4_12_shuf]
+ mova m4, [idct4_12_shuf2]
+ vpermd m2, m4, m1
+ vpermd m1, m3, m0
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
+.pass2:
+ vpbroadcastd m5, [pd_2048]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ call m(idct_4x4_internal_10bpc).main2
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_4x4_internal_12bpc).end
+
+INV_TXFM_4X4_FN adst, dct, 12
+INV_TXFM_4X4_FN adst, adst, 12
+INV_TXFM_4X4_FN adst, flipadst, 12
+INV_TXFM_4X4_FN adst, identity, 12
+
+cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m1, m4, xm6, 1
+ vinserti128 m2, xm3, 1
+.pass1_end:
+ mova m3, [itx4_shuf]
+ vpbroadcastd m5, [pd_1024]
+ psrad m1, 1
+ psrad m2, 1
+ vpermd m1, m3, m1
+ vpermd m2, m3, m2
+ paddd m1, m5
+ paddd m2, m5
+ psrad m1, 11
+ psrad m2, 11
+.pass1_end2:
+ vpbroadcastd m3, [clip_18b_min]
+ vpbroadcastd m4, [clip_18b_max]
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ pmaxsd m0, m3
+ pmaxsd m1, m3
+ pminsd m0, m4
+ pminsd m1, m4
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass2_end:
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+.end:
+%if WIN64
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign xmm_regs_used 6
+%endif
+.end2:
+ vpbroadcastd m4, [pw_16384]
+ movq xm2, [dstq+strideq*0]
+ movq xm3, [dstq+strideq*1]
+ lea r6, [dstq+strideq*2]
+ movhps xm2, [r6 +strideq*0] ; dst0 dst2
+ movhps xm3, [r6 +strideq*1] ; dst1 dst3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ vinserti128 m2, xm3, 1
+ psrad m0, 3
+ psrad m1, 3
+ packssdw m0, m1 ; t0 t2 t1 t3
+ pmulhrsw m0, m4
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ paddw m0, m2 ; out0 out2 out1 out3
+ pmaxsw m0, m4
+ pminsw m0, m5
+ vextracti128 xm1, m0, 1 ; out1 out3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [r6 +strideq*0], xm0
+ movhps [r6 +strideq*1], xm1
+ RET
+.main_pass2:
+ vextracti128 xm3, m1, 1
+ mova xm2, xm1
+ vextracti128 xm1, m0, 1
+ jmp m(iadst_4x4_internal_10bpc).main2
+
+INV_TXFM_4X4_FN flipadst, dct, 12
+INV_TXFM_4X4_FN flipadst, adst, 12
+INV_TXFM_4X4_FN flipadst, flipadst, 12
+INV_TXFM_4X4_FN flipadst, identity, 12
+
+cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_10bpc).main
+ vinserti128 m1, m3, xm2, 1
+ vinserti128 m2, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass1_end
+.pass2:
+ call m(iadst_4x4_internal_12bpc).main_pass2
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass2_end
+
+INV_TXFM_4X4_FN identity, dct, 12
+INV_TXFM_4X4_FN identity, adst, 12
+INV_TXFM_4X4_FN identity, flipadst, 12
+INV_TXFM_4X4_FN identity, identity, 12
+
+cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ mova m2, [itx4_shuf]
+ vpbroadcastd m3, [pd_1697]
+ vpermd m0, m2, [cq+32*0]
+ vpermd m2, m2, [cq+32*1]
+ vpbroadcastd m5, [pd_2048]
+ pmulld m1, m3, m0
+ pmulld m3, m2
+ paddd m1, m5
+ paddd m3, m5
+ psrad m1, 12
+ psrad m3, 12
+ paddd m1, m0
+ paddd m2, m3
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ vpbroadcastd m3, [pd_5793]
+ vpbroadcastd m5, [pd_2048]
+ pmulld m0, m3
+ pmulld m1, m3
+ paddd m0, m5 ; 2048
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
+ jmp m(iadst_4x4_internal_12bpc).end
+
+%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x8, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
+%else
+ jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
+ vpbroadcastd m%5, [pd_2896]
+ pmulld m%1, m%5
+ pmulld m%3, m%5
+ paddd m%1, m%8
+ paddd m%5, m%1, m%3
+ psubd m%1, m%3
+ psrad m%5, 12 ; t0
+ psrad m%1, 12 ; t1
+ psubd m%3, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%5, m%4
+ psubd m%4, m%5, m%4
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m3, [pd_2896]
+ pmulld m0, m3, [cq+32*0]
+ pmulld m1, m3, [cq+32*1]
+ pmulld m2, m3, [cq+32*2]
+ pmulld m3, m3, [cq+32*3]
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ packssdw m0, m2
+ packssdw m1, m3
+ lea r6, [deint_shuf+128]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m2 ; 2 3
+ punpckldq m0, m2 ; 0 1
+ vextracti128 xm2, m0, 1 ; 4 5
+ vextracti128 xm3, m1, 1 ; 6 7
+ call m(idct_4x8_internal_8bpc).main
+ vpbroadcastd xm4, [pw_2048]
+ REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+r3 ]
+ movhps xm5, [dstq+strideq*2]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ movq xm7, [r6 +r3 ]
+ movhps xm7, [r6 +strideq*2]
+ paddw xm0, xm4 ; 0 1
+ paddw xm1, xm5 ; 3 2
+ paddw xm2, xm6 ; 4 5
+ paddw xm3, xm7 ; 7 6
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
+ REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+strideq*2], xm1
+ movq [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movhps [r6 +strideq*2], xm3
+ movq [r6 +r3 ], xm3
+ RET
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass1_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ mova xm4, [pw_2048_m2048]
+ REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
+.end:
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+strideq*2]
+ movhps xm5, [dstq+r3 ]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ movq xm7, [r6 +strideq*2]
+ movhps xm7, [r6 +r3 ]
+ paddw xm0, xm4 ; 0 1
+ paddw xm1, xm5 ; 2 3
+ paddw xm2, xm6 ; 4 5
+ paddw xm3, xm7 ; 6 7
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
+ REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ RET
+ALIGN function_align
+.pass2_main:
+ packssdw m0, m2
+ packssdw m1, m3
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpckhdq m5, m4, m0
+ punpckldq m4, m0
+ vextracti128 xm2, m4, 1 ; 4 5
+ vextracti128 xm3, m5, 1 ; 6 7
+ pshufd xm4, xm4, q1032 ; 1 0
+ pshufd xm5, xm5, q1032 ; 3 2
+ jmp m(iadst_4x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.main2:
+ vbroadcasti128 m0, [cq+16*0]
+ vbroadcasti128 m2, [cq+16*2]
+ vbroadcasti128 m3, [cq+16*5]
+ vbroadcasti128 m1, [cq+16*7]
+ vpbroadcastd m6, [pd_2896]
+ shufpd m0, m2, 0x0c ; 0 2
+ shufpd m1, m3, 0x0c ; 7 5
+ vbroadcasti128 m2, [cq+16*4]
+ vbroadcasti128 m4, [cq+16*6]
+ vbroadcasti128 m5, [cq+16*1]
+ vbroadcasti128 m3, [cq+16*3]
+ vpbroadcastd m7, [pd_2048]
+ shufpd m2, m4, 0x0c ; 4 6
+ shufpd m3, m5, 0x0c ; 3 1
+ REPX {pmulld x, m6}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+.main3:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m8}, m4, m2, m0, m1
+ REPX {pminsd x, m9}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ vpblendd m4, m2, 0xcc ; t4 t7
+ vpblendd m2, m5, 0xcc ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784
+ vpbroadcastd m5, [pd_2896]
+ vbroadcasti128 m6, [pw_2048_m2048] ; + + - -
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m8}, m1, m2
+ REPX {pminsd x, m9}, m1, m2
+ vpblendd m3, m1, m2, 0xcc
+ shufpd m1, m2, 0x05
+ pmulld m3, m5
+ pmulld m5, m1
+ psignd m0, m6 ; out0 out7
+ psignd m4, m6 ; out6 out1
+ paddd m3, m7
+ psubd m2, m3, m5
+ paddd m5, m3
+ psrad m2, 12 ; out4 -out5
+ psrad m5, 12 ; -out3 out2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m2, m5, m6
+ paddd m3, m5, m4
+ jmp m(iadst_4x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_4x8_internal_10bpc).pass2_main
+ mova xm4, [pw_2048_m2048]
+ REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*1]
+ movhps xm4, [dstq+strideq*0]
+ movq xm5, [dstq+r3 ]
+ movhps xm5, [dstq+strideq*2]
+ movq xm6, [r6 +strideq*1]
+ movhps xm6, [r6 +strideq*0]
+ movq xm7, [r6 +r3 ]
+ movhps xm7, [r6 +strideq*2]
+ paddw xm3, xm4 ; 1 0
+ paddw xm2, xm5 ; 3 2
+ paddw xm1, xm6 ; 5 4
+ paddw xm0, xm7 ; 7 6
+ vpbroadcastd xm5, [pixel_10bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
+ REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0
+ movhps [dstq+strideq*0], xm3
+ movq [dstq+strideq*1], xm3
+ movhps [dstq+strideq*2], xm2
+ movq [dstq+r3 ], xm2
+ movhps [r6 +strideq*0], xm1
+ movq [r6 +strideq*1], xm1
+ movhps [r6 +strideq*2], xm0
+ movq [r6 +r3 ], xm0
+ RET
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m3, [pd_2896]
+ pmulld m0, m3, [cq+32*0]
+ pmulld m1, m3, [cq+32*1]
+ pmulld m2, m3, [cq+32*2]
+ pmulld m3, [cq+32*3]
+ vpbroadcastd m5, [pd_2048]
+ vpbroadcastd m4, [pd_5793]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m6, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ vpbroadcastd m4, [pw_4096]
+ packssdw m0, m2
+ packssdw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m4
+ pmulhrsw m0, m4
+ punpckhdq m1, m0, m2 ; 2 3 6 7
+ punpckldq m0, m2 ; 0 1 4 5
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm2, [dstq+strideq*0]
+ movhps xm2, [dstq+strideq*1]
+ vpbroadcastq m4, [r6 +strideq*0]
+ vpbroadcastq m5, [r6 +strideq*1]
+ movq xm3, [dstq+strideq*2]
+ movhps xm3, [dstq+r3 ]
+ vpblendd m2, m4, 0x30
+ vpblendd m2, m5, 0xc0
+ vpbroadcastq m4, [r6 +strideq*2]
+ vpbroadcastq m5, [r6 +r3 ]
+ vpblendd m3, m4, 0x30
+ vpblendd m3, m5, 0xc0
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2 ; out0 out1 out4 out5
+ paddw m1, m3 ; out2 out3 out6 out7
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m6
+ pminsw m1, m6
+ vextracti128 xm2, m0, 1 ; out4 out5
+ vextracti128 xm3, m1, 1 ; out6 out7
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ ret
+
+INV_TXFM_4X8_FN dct, dct, 12
+INV_TXFM_4X8_FN dct, identity, 12
+INV_TXFM_4X8_FN dct, adst, 12
+INV_TXFM_4X8_FN dct, flipadst, 12
+
+cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(idct_4x8_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ ; transpose & interleave
+ pshufd m0, m0, q1320
+ pshufd m1, m1, q1320
+ pshufd m2, m2, q1320
+ pshufd m3, m3, q1320
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ vpermq m0, m0, q3102
+ vpermq m2, m2, q3102
+ vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved)
+ vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved)
+ vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
+ vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
+ vpbroadcastd m7, [pd_2048]
+ call m(idct_8x4_internal_10bpc).main
+ psubd m3, m0, m4 ; out7 out6
+ paddd m0, m4 ; out0 out1
+ paddd m1, m2, m5 ; out3 out2
+ psubd m2, m5 ; out4 out5
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp m(iadst_4x8_internal_12bpc).end
+
+INV_TXFM_4X8_FN adst, dct, 12
+INV_TXFM_4X8_FN adst, adst, 12
+INV_TXFM_4X8_FN adst, flipadst, 12
+INV_TXFM_4X8_FN adst, identity, 12
+
+cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m4, 1
+ psrad m1, m6, 1
+ psrad m2, 1
+ psrad m3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_1024]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 11}, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call .pass2_main
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+.end:
+ vpbroadcastd m4, [pw_16384]
+ REPX {psrad x, 3}, m0, m1, m2, m3
+ packssdw m0, m2 ; 0 1 4 5 (interleaved)
+ packssdw m1, m3 ; 2 3 6 7 (interleaved)
+ mova m2, [iadst8_12_shuf]
+ vpermd m0, m2, m0 ; 0 1 4 5
+ vpermd m1, m2, m1 ; 2 3 6 7
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ lea r3, [strideq*3]
+ lea r6, [dstq+strideq*4]
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ movq xm5, [dstq+strideq*2]
+ movhps xm5, [dstq+r3 ]
+ movq xm6, [r6 +strideq*0]
+ movhps xm6, [r6 +strideq*1]
+ vinserti128 m4, xm6, 1
+ movq xm7, [r6 +strideq*2]
+ movhps xm7, [r6 +r3 ]
+ vinserti128 m5, xm7, 1
+ paddw m0, m4 ; 0 1 4 5
+ paddw m1, m5 ; 2 3 6 7
+ vpbroadcastd m5, [pixel_12bpc_max]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ REPX {pmaxsw x, m4}, m0, m1
+ REPX {pminsw x, m5}, m0, m1
+ vextracti128 xm2, m0, 1 ; out4 out5
+ vextracti128 xm3, m1, 1 ; out6 out7
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+r3 ], xm1
+ movq [r6 +strideq*0], xm2
+ movhps [r6 +strideq*1], xm2
+ movq [r6 +strideq*2], xm3
+ movhps [r6 +r3 ], xm3
+ RET
+ALIGN function_align
+.pass2_main:
+ ; transpose & interleave
+ pshufd m0, m0, q1320
+ pshufd m1, m1, q1320
+ pshufd m2, m2, q1320
+ pshufd m3, m3, q1320
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved)
+ vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved)
+ vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
+ vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
+ vpbroadcastd m7, [pd_2048]
+ jmp m(iadst_4x8_internal_10bpc).main3
+
+INV_TXFM_4X8_FN flipadst, dct, 12
+INV_TXFM_4X8_FN flipadst, adst, 12
+INV_TXFM_4X8_FN flipadst, flipadst, 12
+INV_TXFM_4X8_FN flipadst, identity, 12
+
+cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m3, 1
+ psrad m1, m2, 1
+ psrad m2, m6, 1
+ psrad m3, m4, 1
+ jmp m(iadst_4x8_internal_12bpc).pass1_end
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_4x8_internal_12bpc).pass2_main
+ shufpd m3, m4, m0, 0x05 ; out1 out0
+ shufpd m0, m4, 0x05 ; out7 out6
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032 ; out5 out4
+ psignd m2, m5, m6 ; out3 out2
+ jmp m(iadst_4x8_internal_12bpc).end
+
+INV_TXFM_4X8_FN identity, dct, 12
+INV_TXFM_4X8_FN identity, adst, 12
+INV_TXFM_4X8_FN identity, flipadst, 12
+INV_TXFM_4X8_FN identity, identity, 12
+
+cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(iidentity_4x8_internal_10bpc).pass1
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m2 = in4 in5
+ ; m3 = in6 in7
+ vpbroadcastd m6, [pixel_12bpc_max]
+ call m(iidentity_4x8_internal_10bpc).pass2_end
+ RET
+
+%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x16, %3
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd xm2, [dconly_%3bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m10, [pd_3072]
+ mova m1, [cq+32*2]
+ mova m3, [cq+32*6]
+ mova m5, [cq+32*3]
+ mova m7, [cq+32*7]
+ call .pass1_main
+ pmulld m0, m6, [cq+32*0]
+ pmulld m2, m6, [cq+32*4]
+ pmulld m4, m6, [cq+32*1]
+ pmulld m6, [cq+32*5]
+ call .pass1_main2
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m4 ; 2 3
+ punpckldq m0, m4 ; 0 1
+ punpckldq m4, m5, m2 ; 8 9
+ punpckhdq m5, m2 ; a b
+ vextracti128 xm2, m0, 1 ; 4 5
+ vextracti128 xm3, m1, 1 ; 6 7
+ vextracti128 xm6, m4, 1 ; c d
+ vextracti128 xm7, m5, 1 ; e f
+ call m(idct_4x16_internal_8bpc).main
+ vpbroadcastd m9, [pw_2048]
+ vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
+ vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
+ vinserti128 m2, m4, xm5, 1 ; 8 9 b a
+ vinserti128 m3, m6, xm7, 1 ; c d f e
+ vpbroadcastd m8, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass1_main:
+ vpbroadcastd m4, [pd_3784]
+ vpbroadcastd m8, [pd_1567]
+ vpbroadcastd m9, [pd_2048]
+ vpbroadcastd m6, [pd_1448]
+ ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
+ ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
+ ret
+ALIGN function_align
+.pass1_main2:
+ paddd m0, m10
+ paddd m4, m10
+ paddd m8, m0, m2
+ psubd m0, m2
+ paddd m9, m4, m6
+ psubd m4, m6
+ REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
+ psubd m2, m0, m1
+ paddd m1, m0
+ psubd m6, m4, m5
+ paddd m5, m4
+ paddd m0, m8, m3
+ psubd m3, m8, m3
+ paddd m4, m9, m7
+ psubd m7, m9, m7
+ ret
+ALIGN function_align
+.pass2_end:
+ lea r6, [strideq*3]
+ pxor m7, m7
+ pmulhrsw m0, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ ret
+ALIGN function_align
+.write_4x4:
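+ ; add one 4x4 block of residuals (m0) to dst: gather four 4-pixel rows
+ ; into a single register, add, clamp to [0, m8], store them back and
+ ; clear two 32-byte coefficient rows (m7 is zero) while advancing cq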
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+strideq*1]
+ vpbroadcastq m5, [dstq+strideq*2]
+ vpbroadcastq m6, [dstq+r6 ]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0xc0
+ vpblendd m4, m6, 0x30
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+strideq*2], xm5
+ movq [dstq+r6 ], xm5
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_6144]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m4, 13
+ psrad m1, m5, 13
+ psrad m2, 13
+ psrad m3, 13
+ psrad m4, m8, 13
+ psrad m5, m9, 13
+ psrad m6, 13
+ psrad m7, 13
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd m8, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1
+ pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
+ vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13
+ pxor m7, m7
+ psubw m9, m7, m5
+ vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
+ pmulhrsw m0, m4, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ RET
+ALIGN function_align
+.write_4x4:
+ movq xm4, [dstq+r6 ]
+ movhps xm4, [dstq+strideq*0]
+ vpbroadcastq m5, [dstq+strideq*1]
+ vpbroadcastq m6, [dstq+strideq*2]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0xc0
+ vpblendd m4, m6, 0x30
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movhps [dstq+strideq*0], xm4
+ movhps [dstq+strideq*1], xm5
+ movq [dstq+strideq*2], xm5
+ movq [dstq+r6 ], xm4
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.pass2_main:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m4
+ punpckldq m0, m4
+ punpckldq m4, m5, m2
+ punpckhdq m5, m2
+ vpblendd m3, m0, m1, 0x33
+ vpblendd m0, m1, 0xcc
+ shufpd m2, m5, m4, 0x05
+ shufpd m4, m5, 0x05
+ vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5
+ vinserti128 m0, xm3, 1 ; 0 3 2 1
+ vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ????
+ vinserti128 m2, xm4, 1 ; b 8 9 a
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m5, [pw_2896x8]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ ret
+ALIGN function_align
+.main:
+ vbroadcasti128 m0, [cq+16* 0]
+ vbroadcasti128 m4, [cq+16* 2]
+ vbroadcasti128 m1, [cq+16*15]
+ vbroadcasti128 m5, [cq+16*13]
+ vbroadcasti128 m2, [cq+16* 4]
+ vbroadcasti128 m6, [cq+16* 6]
+ vbroadcasti128 m3, [cq+16*11]
+ vbroadcasti128 m7, [cq+16* 9]
+ shufpd m0, m4, 0x0c ; 0 2
+ shufpd m1, m5, 0x0c ; 15 13
+ shufpd m2, m6, 0x0c ; 4 6
+ shufpd m3, m7, 0x0c ; 11 9
+ vbroadcasti128 m4, [cq+16* 8]
+ vbroadcasti128 m6, [cq+16*10]
+ vbroadcasti128 m5, [cq+16* 7]
+ vbroadcasti128 m7, [cq+16* 5]
+ shufpd m4, m6, 0x0c ; 8 10
+ shufpd m5, m7, 0x0c ; 7 5
+ vbroadcasti128 m6, [cq+16*12]
+ vbroadcasti128 m7, [cq+16*14]
+ shufpd m6, m7, 0x0c ; 12 14
+ vbroadcasti128 m7, [cq+16* 3]
+ vbroadcasti128 m8, [cq+16* 1]
+ shufpd m7, m8, 0x0c ; 3 1
+.main2:
+ ; expects: m12 = clip_min m13 = clip_max
+ vpbroadcastd m11, [pd_2048]
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
+ ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1
+ ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10
+ vpbroadcastd m10, [pd_2896]
+ vbroadcasti128 m9, [pw_2048_m2048] ; + + - -
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m12}, m6, m5, m3, m4
+ REPX {pminsd x, m13}, m6, m5, m3, m4
+ REPX {pmulld x, m10}, m6, m5, m3, m4
+ paddd m6, m11
+ paddd m4, m11
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {psignd x, m9}, m1, m8, m3, m6
+ pshufd m9, m9, q1032
+ REPX {psignd x, m9}, m0, m7, m2, m5
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+.pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_6144]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m3, 13
+ psrad m1, m2, 13
+ psrad m2, m5, 13
+ psrad m3, m4, 13
+ psrad m4, m7, 13
+ psrad m5, m6, 13
+ psrad m6, m9, 13
+ psrad m7, m8, 13
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_10bpc).pass2_main
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd m8, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2
+ pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
+ vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14
+ pxor m7, m7
+ psubw m9, m7, m5
+ vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
+ pmulhrsw m0, m4, m9
+ call .write_4x4
+ pmulhrsw m0, m2, m9
+ call .write_4x4
+ pmulhrsw m0, m1, m9
+ call .write_4x4
+ pmulhrsw m0, m3, m9
+ call .write_4x4
+ RET
+ALIGN function_align
+.write_4x4:
+ movq xm4, [dstq+strideq*0]
+ movhps xm4, [dstq+r6 ]
+ vpbroadcastq m5, [dstq+strideq*1]
+ vpbroadcastq m6, [dstq+strideq*2]
+ mova [cq+32*0], m7
+ mova [cq+32*1], m7
+ add cq, 32*2
+ vpblendd m4, m5, 0x30
+ vpblendd m4, m6, 0xc0
+ paddw m4, m0
+ pmaxsw m4, m7
+ pminsw m4, m8
+ vextracti128 xm5, m4, 1
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+ movhps [dstq+strideq*2], xm5
+ movhps [dstq+r6 ], xm4
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [pd_5793]
+ pmulld m0, m7, [cq+32*0]
+ pmulld m4, m7, [cq+32*1]
+ pmulld m1, m7, [cq+32*2]
+ pmulld m5, m7, [cq+32*3]
+ pmulld m2, m7, [cq+32*4]
+ pmulld m6, m7, [cq+32*5]
+ pmulld m3, m7, [cq+32*6]
+ pmulld m7, [cq+32*7]
+ vpbroadcastd m8, [pd_6144]
+ REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
+ REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp tx2q
+.pass2:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m7, [pw_1697x16]
+ vpbroadcastd m8, [pw_2048]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m4, [pixel_10bpc_max]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ lea r6, [strideq*5]
+ pxor m3, m3
+ punpckhdq m5, m0, m2 ; 2 3 6 7
+ punpckldq m0, m2 ; 0 1 4 5
+ punpckldq m6, m7, m1 ; 8 9 c d
+ punpckhdq m7, m1 ; a b e f
+ pmulhrsw m0, m8
+ call .write_2x4x2
+ pmulhrsw m0, m5, m8
+ call .write_2x4x2
+ pmulhrsw m0, m6, m8
+ lea dstq, [dstq+strideq*4]
+ call .write_2x4x2
+ pmulhrsw m0, m7, m8
+ call .write_2x4x2
+ ret
+ALIGN function_align
+.write_2x4x2:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ vpbroadcastq m2, [dstq+strideq*4]
+ vpblendd m1, m2, 0x30
+ vpbroadcastq m2, [dstq+r6 ]
+ vpblendd m1, m2, 0xc0
+ mova [cq+32*0], m3
+ mova [cq+32*1], m3
+ add cq, 32*2
+ paddw m1, m0
+ pmaxsw m1, m3
+ pminsw m1, m4
+ vextracti128 xm2, m1, 1
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ movq [dstq+strideq*4], xm2
+ movhps [dstq+r6 ], xm2
+ lea dstq, [dstq+strideq*2]
+ ret
+
+INV_TXFM_4X16_FN dct, dct, 12
+INV_TXFM_4X16_FN dct, identity, 12
+INV_TXFM_4X16_FN dct, adst, 12
+INV_TXFM_4X16_FN dct, flipadst, 12
+
+cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ jmp m(idct_4x16_internal_10bpc).pass1
+.pass2:
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+ punpckldq m1, m4, m5
+ punpckhdq m4, m5
+ punpckldq m3, m6, m7
+ punpckhdq m6, m7
+ punpcklqdq m5, m0, m2 ; 2 6
+ punpckhqdq m12, m0, m2 ; 3 7
+ punpcklqdq m0, m8, m9 ; 0 4
+ punpckhqdq m10, m8, m9 ; 1 5
+ punpcklqdq m2, m1, m3 ; 8 12
+ punpckhqdq m13, m1, m3 ; 9 13
+ punpcklqdq m9, m4, m6 ; 10 14
+ punpckhqdq m4, m6 ; 11 15
+ vperm2i128 m1, m5, m9, 0x20 ; 2 10
+ vperm2i128 m3, m9, m5, 0x31 ; 14 6
+ vpermq m11, m4, q1302 ; 15 11
+ ; interleave
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
+ REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
+ call m(idct_16x4_internal_10bpc).pass1_main
+ vpermq m6, m12, q1302 ; 7 3
+ vpermq m5, m13, q3120 ; 9 13
+ call m(idct_16x4_internal_10bpc).pass1_main2
+ call m(idct_16x4_internal_10bpc).pass1_main3
+ REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ mova m4, [idct16_12_shuf]
+ REPX {vpermd x, m4, x}, m0, m1, m2, m3
+ vpbroadcastd m9, [pw_16384]
+ vpbroadcastd m8, [pixel_12bpc_max]
+ call m(idct_4x16_internal_10bpc).pass2_end
+ RET
+
+INV_TXFM_4X16_FN adst, dct, 12
+INV_TXFM_4X16_FN adst, adst, 12
+INV_TXFM_4X16_FN adst, flipadst, 12
+INV_TXFM_4X16_FN adst, identity, 12
+
+cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ call .main_pass1
+ psrad m0, m4, 12
+ psrad m1, m5, 12
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ psrad m6, 12
+ psrad m7, 12
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose_16x4
+ call m(iadst_4x16_internal_10bpc).main2
+ pshufd m4, m5, q1032
+ psrad m5, m6, 3
+ pshufd m6, m7, q1032
+ psrad m7, m8, 3
+ REPX {pshufd x, x, q1032}, m0, m2
+ REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6
+.pass2_end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ mova m4, [iadst16_12_shuf]
+ REPX {vpermd x, m4, x}, m0, m1, m2, m3
+ vpbroadcastd m9, [pw_16384]
+ vpbroadcastd m8, [pixel_12bpc_max]
+ lea r6, [strideq*3]
+ pxor m7, m7
+ pmulhrsw m0, m9
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m1
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m2
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ pmulhrsw m0, m9, m3
+ call m(iadst_4x16_internal_10bpc).write_4x4
+ RET
+ALIGN function_align
+.transpose_16x4:
+ ; transpose & interleave
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+ punpckldq m1, m4, m5
+ punpckhdq m4, m5
+ punpckldq m3, m6, m7
+ punpckhdq m6, m7
+ punpcklqdq m10, m8, m0
+ punpckhqdq m0, m8
+ punpcklqdq m11, m9, m2
+ punpckhqdq m2, m9
+ punpcklqdq m8, m1, m4
+ punpckhqdq m4, m1
+ punpcklqdq m9, m3, m6
+ punpckhqdq m6, m3
+ vperm2i128 m5, m0, m2, 0x31 ; 7 5
+ vperm2i128 m7, m0, m2, 0x20 ; 3 1
+ vperm2i128 m0, m10, m11, 0x20 ; 0 2
+ vperm2i128 m2, m10, m11, 0x31 ; 4 6
+ vperm2i128 m1, m4, m6, 0x31 ; 15 13
+ vperm2i128 m3, m4, m6, 0x20 ; 11 9
+ vperm2i128 m4, m8, m9, 0x20 ; 8 10
+ vperm2i128 m6, m8, m9, 0x31 ; 12 14
+ ret
+ALIGN function_align
+.main_pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_3072]
+ paddd m10, m4, m5
+ psubd m4, m3
+ psubd m5, m3
+ paddd m3, m10
+ psubd m8, m7, m1
+ paddd m7, m9
+ psubd m9, m1
+ paddd m7, m1
+ REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
+ REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
+ paddd m6, m0
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct, 12
+INV_TXFM_4X16_FN flipadst, adst, 12
+INV_TXFM_4X16_FN flipadst, flipadst, 12
+INV_TXFM_4X16_FN flipadst, identity, 12
+
+cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ call m(iadst_4x16_internal_12bpc).main_pass1
+ psrad m0, m3, 12
+ psrad m1, m2, 12
+ psrad m2, m5, 12
+ psrad m3, m4, 12
+ psrad m4, m7, 12
+ psrad m5, m6, 12
+ psrad m6, m9, 12
+ psrad m7, m8, 12
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_4x16_internal_12bpc).transpose_16x4
+ call m(iadst_4x16_internal_10bpc).main2
+ pshufd m4, m3, q1032
+ psrad m3, m5, 3
+ psrad m5, m2, 3
+ pshufd m2, m6, q1032
+ pshufd m6, m1, q1032
+ psrad m1, m7, 3
+ psrad m7, m0, 3
+ pshufd m0, m8, q1032
+ REPX {psrad x, 3}, m0, m2, m4, m6
+ jmp m(iadst_4x16_internal_12bpc).pass2_end
+
+INV_TXFM_4X16_FN identity, dct, 12
+INV_TXFM_4X16_FN identity, adst, 12
+INV_TXFM_4X16_FN identity, flipadst, 12
+INV_TXFM_4X16_FN identity, identity, 12
+
+cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_1697]
+ mova m0, [cq+32*0]
+ mova m4, [cq+32*1]
+ mova m1, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m9, [pd_6144]
+ pmulld m2, m8, m0
+ pmulld m6, m8, m4
+ pmulld m3, m8, m1
+ pmulld m7, m8, m5
+ mova m10, [cq+32*4]
+ mova m11, [cq+32*5]
+ mova m12, [cq+32*6]
+ mova m13, [cq+32*7]
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m0, m2
+ pmulld m2, m8, m10
+ paddd m4, m6
+ pmulld m6, m8, m11
+ paddd m1, m3
+ pmulld m3, m8, m12
+ paddd m5, m7
+ pmulld m7, m8, m13
+ REPX {psrad x, 1 }, m0, m4, m1, m5
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m2, m10
+ paddd m6, m11
+ paddd m3, m12
+ paddd m7, m13
+ REPX {psrad x, 1 }, m2, m6, m3, m7
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_1024]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m8, [pw_16384]
+ vpbroadcastd m4, [pixel_12bpc_max]
+ call m(iidentity_4x16_internal_10bpc).pass2_end
+ RET
+
+%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 8x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+%else
+ jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.pass1:
+ vbroadcasti128 m1, [cq+16*1]
+ vbroadcasti128 m0, [cq+16*5]
+ vbroadcasti128 m2, [cq+16*3]
+ vbroadcasti128 m3, [cq+16*7]
+ vpbroadcastd m6, [pd_2896]
+ shufpd m1, m0, 0x0c ; 1 5
+ shufpd m3, m2, 0x0c ; 7 3
+ vbroadcasti128 m0, [cq+16*0]
+ vbroadcasti128 m4, [cq+16*2]
+ vbroadcasti128 m2, [cq+16*4]
+ vbroadcasti128 m5, [cq+16*6]
+ vpbroadcastd m7, [pd_2048]
+ shufpd m0, m4, 0x0c ; 0 2
+ shufpd m2, m5, 0x0c ; 4 6
+ REPX {pmulld x, m6}, m1, m3, m0, m2
+ REPX {paddd x, m7}, m1, m3, m0, m2
+ REPX {psrad x, 12}, m1, m3, m0, m2
+ call .main
+ psubd m3, m0, m4 ; out7 out6 (interleaved)
+ paddd m0, m4 ; out0 out1 (interleaved)
+ paddd m1, m2, m5 ; out3 out2 (interleaved)
+ psubd m2, m5 ; out4 out5 (interleaved)
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp tx2q
+.pass2:
+ vbroadcasti128 m4, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ vperm2i128 m1, m0, m2, 0x31
+ vinserti128 m0, xm2, 1
+ pshufb m0, m4
+ pshufb m1, m4
+ IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m2, m1, q2031 ; out2 out3
+ jmp m(iadst_8x4_internal_10bpc).end
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
+ IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7
+ vpbroadcastd m6, [pd_2896]
+ punpcklqdq m4, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m4, m1 ; t5a t6a
+ paddd m4, m1 ; t4 t7
+ REPX {pmaxsd x, m8}, m3, m4, m0, m2
+ REPX {pminsd x, m9}, m3, m4, m0, m2
+ pmulld m3, m6
+ pshufd m1, m3, q1032
+ paddd m3, m7
+ psubd m5, m3, m1
+ paddd m1, m3
+ psrad m5, 12
+ psrad m1, 12
+ vpblendd m5, m4, 0x33 ; t4 t5
+ punpckhqdq m4, m1 ; t7 t6
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_10bpc).main
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m2, m1, q3120 ; out2 out3
+.end:
+ vpbroadcastd m1, [pw_2048]
+ pmulhrsw m0, m1
+ pmulhrsw m1, m2
+ vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+ mova xm2, [dstq+strideq*0]
+ vinserti128 m2, [dstq+strideq*1], 1
+ lea r6, [dstq+strideq*2]
+ mova xm3, [r6 +strideq*0]
+ vinserti128 m3, [r6 +strideq*1], 1
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [r6 +strideq*0], xm1
+ vextracti128 [r6 +strideq*1], m1, 1
+ RET
+ALIGN function_align
+.pass2_main:
+ vbroadcasti128 m4, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ lea r6, [deint_shuf+128]
+ vperm2i128 m1, m0, m2, 0x31
+ vinserti128 m0, xm2, 1
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp m(iadst_8x4_internal_8bpc).main
+ALIGN function_align
+.main:
+ vpbroadcastd m1, [pd_2896]
+ pmulld m0, m1, [cq+32*0]
+ pmulld m3, m1, [cq+32*3]
+ pmulld m2, m1, [cq+32*2]
+ pmulld m1, [cq+32*1]
+ vpbroadcastd m4, [pd_2048]
+ REPX {paddd x, m4}, m0, m3, m2, m1
+ REPX {psrad x, 12}, m0, m3, m2, m1
+.main2:
+ IADST4_1D
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_10bpc).main
+ shufpd m3, m4, m0, 0x05
+ shufpd m0, m4, 0x05
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032
+ psignd m2, m5, m6
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_10bpc).pass2_main
+ vpermq m2, m0, q2031
+ vpermq m0, m1, q2031
+ jmp m(iadst_8x4_internal_10bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m4, [pd_2896]
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpermq m2, [cq+32*2], q3120
+ vpermq m3, [cq+32*3], q3120
+ vpbroadcastd m7, [pd_2048]
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {paddd x, x }, m0, m1, m2, m3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m5, [pixel_10bpc_max]
+ vpbroadcastd m4, [pw_1697x8]
+ packssdw m0, m1
+ packssdw m2, m3
+ pmulhrsw m1, m4, m0
+ pmulhrsw m4, m2
+ paddsw m0, m1
+ paddsw m2, m4
+ packssdw m7, m7 ; pw_2048
+.pass2_end:
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ lea r6, [dstq+strideq*2]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m7
+ pmulhrsw m0, m7
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova xm2, [dstq+strideq*0]
+ vinserti128 m2, [r6 +strideq*0], 1
+ mova xm3, [dstq+strideq*1]
+ vinserti128 m3, [r6 +strideq*1], 1
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [r6 +strideq*0], m0, 1
+ vextracti128 [r6 +strideq*1], m1, 1
+ RET
+
+INV_TXFM_8X4_FN dct, dct, 12
+INV_TXFM_8X4_FN dct, identity, 12
+INV_TXFM_8X4_FN dct, adst, 12
+INV_TXFM_8X4_FN dct, flipadst, 12
+
+cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ jmp m(idct_8x4_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(iadst_8x4_internal_12bpc).end
+
+INV_TXFM_8X4_FN adst, dct, 12
+INV_TXFM_8X4_FN adst, adst, 12
+INV_TXFM_8X4_FN adst, flipadst, 12
+INV_TXFM_8X4_FN adst, identity, 12
+
+cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ call m(iadst_4x8_internal_10bpc).main2
+ vpblendd m3, m0, m4, 0x33 ; out6 out7
+ vpblendd m0, m4, 0xcc ; out0 out1
+ pshufd m1, m5, q1032
+ psignd m2, m6 ; out4 out5
+ psignd m1, m6 ; out2 out3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call .pass2_main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass2_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
+.end:
+ vpbroadcastd m4, [pw_16384]
+ REPX {psrad x, 3}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m2, m4
+ vpermq m0, m0, q3120 ; out0 out1
+ vpermq m1, m1, q3120 ; out2 out3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ jmp m(iadst_8x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+ call .transpose_4x8
+ jmp m(iadst_8x4_internal_10bpc).main2
+ALIGN function_align
+.transpose_4x8:
+ ; deinterleave
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ pshufd m2, m2, q3120
+ pshufd m3, m3, q3120
+ ; transpose
+ punpcklqdq m4, m0, m1
+ punpckhqdq m0, m1
+ punpcklqdq m5, m2, m3
+ punpckhqdq m2, m3
+ vperm2i128 m1, m0, m2, 0x20 ; out1
+ vperm2i128 m3, m0, m2, 0x31 ; out3
+ vperm2i128 m2, m4, m5, 0x31 ; out2
+ vperm2i128 m0, m4, m5, 0x20 ; out0
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct, 12
+INV_TXFM_8X4_FN flipadst, adst, 12
+INV_TXFM_8X4_FN flipadst, flipadst, 12
+INV_TXFM_8X4_FN flipadst, identity, 12
+
+cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ call m(iadst_4x8_internal_10bpc).main2
+ shufpd m3, m4, m0, 0x05
+ shufpd m0, m4, 0x05
+ psignd m2, m6
+ pshufd m6, m6, q1032
+ pshufd m1, m2, q1032
+ psignd m2, m5, m6
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).pass2_main
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m3, m5, m4
+ paddd m2, m5, m6
+ jmp m(iadst_8x4_internal_12bpc).pass2_end
+
+INV_TXFM_8X4_FN identity, dct, 12
+INV_TXFM_8X4_FN identity, adst, 12
+INV_TXFM_8X4_FN identity, flipadst, 12
+INV_TXFM_8X4_FN identity, identity, 12
+
+cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x4_internal_10bpc).pass1
+.pass2:
+ ; m0 = in0 in1 (interleaved)
+ ; m1 = in2 in3 (interleaved)
+ ; m2 = in4 in5 (interleaved)
+ ; m3 = in6 in7 (interleaved)
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+ REPX {pmaxsd x, m8}, m0, m1, m2, m3
+ REPX {pminsd x, m9}, m0, m1, m2, m3
+ vpbroadcastd m4, [pd_5793]
+ REPX {pmulld x, m4}, m0, m1, m2, m3
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 15}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ vpbroadcastd m7, [pw_16384]
+ packssdw m0, m1
+ packssdw m2, m3
+ jmp m(iidentity_8x4_internal_10bpc).pass2_end
+
+%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 8x8, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
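+ ; DC-only shortcut: with a single nonzero (DC) coefficient the output is
+ ; one constant added to every pixel. 181 = 2896/16 ~= 128*sqrt(2) covers
+ ; the sqrt(2) normalization steps; the saturating paddsw/psubusw pair in
+ ; the loop below appears to clip the result to the valid pixel range
+ ; without a separate pmaxsw/pminsw.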
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly2:
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm2
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ paddsw m1, m0
+ psubusw m1, m2
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%else
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
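+; IADST8_1D: one 8-point inverse ADST pass on 32-bit coefficients, with
+; intermediates clamped to [clip1, clip2]. The last four outputs are left
+; as (t2 +/- t3)*1448 and (t6 +/- t7)*1448; the caller adds the rounding
+; bias and performs the final shift (see the .main_end routines below).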
+%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
+ ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
+ psubd m%9, m%3, m%7 ; t6
+ paddd m%3, m%7 ; t2
+ psubd m%7, m%1, m%5 ; t4
+ paddd m%1, m%5 ; t0
+ psubd m%5, m%6, m%2 ; t7
+ paddd m%6, m%2 ; t3
+ psubd m%2, m%8, m%4 ; t5
+ paddd m%8, m%4 ; t1
+ REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+ REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+ ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
+ psubd m%10, m%7, m%9 ; t7
+ paddd m%7, m%9 ; out6
+ vpbroadcastd m%9, [pd_1448]
+ psubd m%4, m%8, m%6 ; t3
+ paddd m%8, m%6 ; -out7
+ psubd m%6, m%1, m%3 ; t2
+ paddd m%1, m%3 ; out0
+ psubd m%3, m%2, m%5 ; t6
+ paddd m%2, m%5 ; -out1
+ REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
+ REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
+ REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
+ psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
+ paddd m%4, m%6 ; (t2 + t3) * 1448
+ psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
+ paddd m%3, m%10 ; (t6 + t7) * 1448
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ vpbroadcastd m11, [pd_2048]
+ call .main
+ call .round_shift1
+ jmp tx2q
+.pass2:
+ call .transpose_8x8_packed
+ call m(idct_8x8_internal_8bpc).main
+ vpbroadcastd m12, [pw_2048]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call .write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call .write_8x4
+ RET
+ALIGN function_align
+.write_8x4_start:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+.write_8x4:
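+ ; m0/m1 each hold two 8-pixel rows; add them to four dst rows, clamp to
+ ; [0, m11], store, and clear four 32-byte coefficient rows as we go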
+ mova xm8, [dstq+strideq*0]
+ vinserti128 m8, [dstq+strideq*1], 1
+ mova xm9, [dstq+strideq*2]
+ vinserti128 m9, [dstq+r6 ], 1
+ mova [cq+32*0], m10
+ mova [cq+32*1], m10
+ mova [cq+32*2], m10
+ mova [cq+32*3], m10
+ add cq, 32*4
+ paddw m0, m8
+ paddw m1, m9
+ pmaxsw m0, m10
+ pmaxsw m1, m10
+ pminsw m0, m11
+ pminsw m1, m11
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.transpose_8x8_packed:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ lea r6, [deint_shuf+128]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m4, m1
+ punpckldq m4, m1
+ vinserti128 m1, m3, xm2, 1
+ vperm2i128 m3, m2, 0x31
+ vperm2i128 m2, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ ret
+ALIGN function_align
+.main_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m9, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ vpbroadcastd m3, [pd_2896]
+ REPX {pmaxsd x, m12}, m1, m8, m7, m9
+ REPX {pminsd x, m13}, m1, m8, m7, m9
+ REPX {pmulld x, m3 }, m0, m4, m7, m1
+ paddd m0, m11
+ paddd m7, m11
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+ REPX {pmaxsd x, m12}, m0, m6, m5, m3
+ REPX {pminsd x, m13}, m0, m6, m5, m3
+ ret
+ALIGN function_align
+.round_shift1:
+ pcmpeqd m1, m1
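+ ; m1 = -1 in every dword, so the psubd below adds the +1 rounding bias
+ ; consumed by the final "psrad 1"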
+ REPX {psubd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call .main
+ call .main_end
+ jmp tx2q
+.pass2:
+ call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [pw_2048]
+ vpbroadcastd xm12, [pw_4096]
+ psubw m12, m5
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+32*0]
+ mova m7, [cq+32*7]
+ mova m1, [cq+32*1]
+ mova m6, [cq+32*6]
+ mova m2, [cq+32*2]
+ mova m5, [cq+32*5]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ vpbroadcastd m11, [pd_2048]
+.main2:
+ IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+ psrld m8, 10 ; pd_1
+ vpbroadcastd m9, [pd_3072]
+ ret
+ALIGN function_align
+.main_end:
+ paddd m0, m8
+ psubd m1, m8, m1
+ paddd m6, m8
+ psubd m7, m8, m7
+ REPX {psrad x, 1 }, m0, m1, m6, m7
+ ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
+ ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
+ psubd m8, m9, m8 ; pd_3071
+ paddd m2, m9
+ psubd m3, m8, m3
+ paddd m4, m9
+ psubd m5, m8, m5
+ REPX {psrad x, 12}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_8x8_internal_10bpc).main
+ call .main_end
+ jmp tx2q
+.pass2:
+ call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd xm5, [pw_4096]
+ psubw m12, m5
+ vpermq m8, m3, q2031
+ vpermq m9, m2, q2031
+ vpermq m2, m1, q2031
+ vpermq m3, m0, q2031
+ pmulhrsw m0, m8, m12
+ pmulhrsw m1, m9, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.main_end:
+ paddd m10, m8, m0
+ psubd m0, m8, m7
+ psubd m7, m8, m1
+ paddd m1, m8, m6
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m7, 1
+ psrad m7, m10, 1
+ psubd m8, m9, m8 ; pd_3071

+ psubd m10, m8, m5
+ paddd m5, m9, m2
+ psubd m2, m8, m3
+ paddd m3, m9, m4
+ psrad m4, m2, 12
+ psrad m2, m10, 12
+ psrad m3, 12
+ psrad m5, 12
+ ret
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+.pass1:
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ mova m4, [cq+32*4]
+ mova m5, [cq+32*5]
+ mova m6, [cq+32*6]
+ mova m7, [cq+32*7]
+ jmp tx2q
+.pass2:
+ packssdw m3, m7
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass2_main:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ vpbroadcastd m12, [pw_4096]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ punpckhqdq m1, m0, m2 ; 1 5
+ punpcklqdq m0, m2 ; 0 4
+ punpcklqdq m2, m3, m4 ; 2 6
+ punpckhqdq m3, m4 ; 3 7
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call .write_2x8x2_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call .write_2x8x2_zero
+ RET
+.write_2x8x2_start:
+ lea r6, [strideq*5]
+ pxor m6, m6
+.write_2x8x2_zero:
+ mova [cq+32*0], m6
+ mova [cq+32*1], m6
+ mova [cq+32*2], m6
+ mova [cq+32*3], m6
+ add cq, 32*4
+.write_2x8x2:
+ mova xm4, [dstq+strideq*0]
+ vinserti128 m4, [dstq+strideq*4], 1
+ mova xm5, [dstq+strideq*1]
+ vinserti128 m5, [dstq+r6 ], 1
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ pminsw m0, m7
+ pminsw m1, m7
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*4], m0, 1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*2]
+ ret
+
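+; Full 8x8 transpose of dword elements: dword/qword unpacks within each
+; 128-bit lane, then vperm2i128 to swap lanes. The letter comments track
+; element positions; the alphabet simply wraps around (a-z, A-Z, then a..
+; again) for the last 12 elements.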
+%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
+ punpckldq m%9, m%1, m%2 ; aibj emfn
+ punpckhdq m%1, m%2 ; ckdl gohp
+ punpckldq m%10, m%3, m%4 ; qyrz uCvD
+ punpckhdq m%3, m%4 ; sAtB wExF
+ punpckldq m%11, m%5, m%6 ; GOHP KSLT
+ punpckhdq m%5, m%6 ; IQJR MUNV
+ punpckldq m%12, m%7, m%8 ; WeXf aibj
+ punpckhdq m%7, m%8 ; YgZh ckdl
+ punpcklqdq m%2, m%9, m%10 ; aiqy emuC
+ punpckhqdq m%9, m%10 ; bjrz fnvD
+ punpcklqdq m%4, m%1, m%3 ; cksA gowE
+ punpckhqdq m%10, m%1, m%3 ; dltB hpxF
+ punpcklqdq m%6, m%11, m%12 ; GOWe KSai
+ punpckhqdq m%11, m%12 ; HPXf LTbj
+ punpcklqdq m%8, m%5, m%7 ; IQYg MUck
+ punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
+ vperm2i128 m%1, m%2, m%6, 0x20 ; out0
+ vperm2i128 m%5, m%2, m%6, 0x31 ; out4
+ vperm2i128 m%2, m%9, m%11, 0x20 ; out1
+ vperm2i128 m%6, m%9, m%11, 0x31 ; out5
+ vperm2i128 m%3, m%4, m%8, 0x20 ; out2
+ vperm2i128 m%7, m%4, m%8, 0x31 ; out6
+ vperm2i128 m%4, m%10, m%12, 0x20 ; out3
+ vperm2i128 m%8, m%10, m%12, 0x31 ; out7
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct, 12
+INV_TXFM_8X8_FN dct, identity, 12
+INV_TXFM_8X8_FN dct, adst, 12
+INV_TXFM_8X8_FN dct, flipadst, 12
+
+cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_8x8_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose_8x8
+ vpbroadcastd m11, [pd_2048]
+ call m(idct_8x8_internal_10bpc).main
+ call .round_shift4
+ jmp m(iadst_8x8_internal_12bpc).pass2_end
+ALIGN function_align
+.write_8x4_start:
+ vpbroadcastd m11, [pixel_12bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ ret
+ALIGN function_align
+.transpose_8x8:
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ ret
+ALIGN function_align
+.round_shift4:
+ vpbroadcastd m1, [pd_8]
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct, 12
+INV_TXFM_8X8_FN adst, adst, 12
+INV_TXFM_8X8_FN adst, flipadst, 12
+INV_TXFM_8X8_FN adst, identity, 12
+
+cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_8x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+.pass2_end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ REPX {vpermq x, x, q3120}, m0, m1
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ packssdw m0, m4, m5
+ packssdw m1, m6, m7
+ REPX {vpermq x, x, q3120}, m0, m1
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.pass2_main:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ vpbroadcastd m11, [pd_2048]
+.pass2_main2:
+ call m(iadst_8x8_internal_10bpc).main2
+ pslld m9, m8, 3 ; pd_8
+ paddd m0, m9
+ psubd m1, m9, m1 ; 8-x
+ paddd m6, m9
+ psubd m7, m9, m7
+ REPX {psrad x, 4}, m0, m1, m6, m7
+ vpbroadcastd m9, [pd_17408]
+ psubd m8, m9, m8 ; 17407
+ paddd m2, m9
+ psubd m3, m8, m3
+ paddd m4, m9
+ psubd m5, m8, m5
+ REPX {psrad x, 15}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct, 12
+INV_TXFM_8X8_FN flipadst, adst, 12
+INV_TXFM_8X8_FN flipadst, flipadst, 12
+INV_TXFM_8X8_FN flipadst, identity, 12
+
+cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_8x8_internal_10bpc).pass1
+.pass2:
+ call m(iadst_8x8_internal_12bpc).pass2_main
+ packssdw m7, m7, m6
+ packssdw m6, m1, m0
+ packssdw m1, m5, m4
+ vpermq m0, m7, q3120
+ vpermq m1, m1, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ packssdw m0, m3, m2
+ vpermq m0, m0, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+
+INV_TXFM_8X8_FN identity, dct, 12
+INV_TXFM_8X8_FN identity, adst, 12
+INV_TXFM_8X8_FN identity, flipadst, 12
+INV_TXFM_8X8_FN identity, identity, 12
+
+cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x8_internal_10bpc).pass1
+.pass2:
+ packssdw m3, m7
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(iidentity_8x8_internal_10bpc).pass2_main
+
+%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+ INV_TXFM_FN %1, %2, %3, 8x16, %4
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_%4bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call .pass1_main
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call .pass1_main
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call .pass1_main
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ call m(idct_8x16_internal_8bpc).main
+ vpbroadcastd m12, [pw_2048]
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m0, m2, m12
+ pmulhrsw m1, m3, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m0, m4, m12
+ pmulhrsw m1, m5, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m0, m6, m12
+ pmulhrsw m1, m7, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.transpose:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m7, m15
+ lea r6, [deint_shuf+128]
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpcklwd m3, m4, m5
+ punpckhwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ punpckhdq m7, m3, m6
+ punpckldq m3, m6
+ punpckhdq m6, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ vperm2i128 m2, m0, m3, 0x31
+ vinserti128 m0, xm3, 1
+ vperm2i128 m3, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m7, m5, m6, 0x31
+ vinserti128 m5, xm6, 1
+ vperm2i128 m6, m8, m4, 0x31
+ vinserti128 m4, m8, xm4, 1
+ ret
+ALIGN function_align
+.pass1_main:
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m7, m14, [cq+32*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ jmp m(idct_8x8_internal_10bpc).round_shift1
+ALIGN function_align
+.main_evenhalf:
+ paddd m1, m6, m7 ; idct8 out1
+ psubd m6, m7 ; idct8 out6
+ psubd m7, m0, m9 ; idct8 out7
+ paddd m0, m9 ; idct8 out0
+ paddd m2, m5, m4 ; idct8 out2
+ psubd m5, m4 ; idct8 out5
+ psubd m4, m3, m8 ; idct8 out4
+ paddd m3, m8 ; idct8 out3
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+.main_oddhalf_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_fast: ; lower half zero
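+ ; half of the odd-half inputs are known to be zero here, so each rotation
+ ; reduces to two plain multiplies by the corresponding constants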
+ vpbroadcastd m7, [pd_4076]
+ vpbroadcastd m8, [pd_401]
+ vpbroadcastd m6, [pd_m1189]
+ vpbroadcastd m9, [pd_3920]
+ vpbroadcastd m5, [pd_3612]
+ vpbroadcastd m10, [pd_1931]
+ vpbroadcastd m4, [pd_m2598]
+ vpbroadcastd m15, [pd_3166]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_fast2
+.main_oddhalf_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf:
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t9
+ paddd m0, m4 ; t8
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
+ psubd m3, m1, m4 ; t10
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ psubd m2, m8, m6 ; t13
+ paddd m6, m8 ; t14
+ psubd m8, m7, m5 ; t12a
+ paddd m7, m5 ; t15a
+ REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pmulld x, m14}, m2, m8, m3, m4
+ paddd m2, m11
+ paddd m8, m11
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m8, m4 ; t11
+ paddd m4, m8 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova [r6-32*4], m7
+ mova [r6-32*3], m6
+ mova [r6-32*2], m5
+ mova [r6-32*1], m4
+ mova [r6+32*0], m3
+ mova [r6+32*1], m2
+ mova [r6+32*2], m1
+ mova [r6+32*3], m0
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, 35
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call .pass1_main
+ call m(iadst_8x8_internal_10bpc).main_end
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_8x16_internal_10bpc).transpose
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m8, [pw_2048]
+ vpbroadcastd xm12, [pw_4096]
+ REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+ REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+ psubw m12, m8
+ jmp m(idct_8x16_internal_10bpc).end
+ALIGN function_align
+.pass1_main:
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m7, m14, [cq+32*14]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp m(iadst_8x8_internal_10bpc).main2
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, 35
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ cmp eobd, 43
+ jl .fast
+ add cq, 32
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ sub cq, 32
+ mova [cq+32* 1], m0
+ mova [cq+32* 3], m1
+ mova [cq+32* 5], m2
+ mova [cq+32* 7], m3
+ mova [cq+32* 9], m4
+ mova [cq+32*11], m5
+ mova [cq+32*13], m6
+ mova m15, m7
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ mova m8, [cq+32* 1]
+ mova m9, [cq+32* 3]
+ mova m10, [cq+32* 5]
+ mova m11, [cq+32* 7]
+ mova m12, [cq+32* 9]
+ mova m13, [cq+32*11]
+ mova m14, [cq+32*13]
+ jmp tx2q
+.fast:
+ call m(iadst_8x16_internal_10bpc).pass1_main
+ call m(iflipadst_8x8_internal_10bpc).main_end
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_8x16_internal_10bpc).transpose
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd xm13, [pw_4096]
+ mova m11, m0
+ vpermq m0, m7, q2031
+ mova m10, m1
+ vpermq m1, m6, q2031
+ mova m9, m2
+ vpermq m2, m5, q2031
+ mova m8, m3
+ vpermq m3, m4, q2031
+ vpermq m4, m8, q3120
+ vpermq m5, m9, q3120
+ vpermq m6, m10, q3120
+ vpermq m7, m11, q3120
+ psubw m12, m13
+ jmp m(idct_8x16_internal_10bpc).end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
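+; IDTX16: identity-transform scaling of packed words, out ~= 2*sqrt(2)*x,
+; computed as 2*x + pmulhrsw(pw_1697x16, x). With a 4th argument the
+; correction term is halved first (pw_16384 rounding, or a plain psraw 1),
+; i.e. the result is downshifted by 1 to ~= sqrt(2)*x.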
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+%ifnum %4
+ pmulhrsw m%2, m%4
+%else ; without rounding
+ psraw m%2, 1
+%endif
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m8, m15, [cq+32* 1]
+ pmulld m1, m15, [cq+32* 2]
+ pmulld m9, m15, [cq+32* 3]
+ pmulld m2, m15, [cq+32* 4]
+ pmulld m10, m15, [cq+32* 5]
+ pmulld m3, m15, [cq+32* 6]
+ pmulld m11, m15, [cq+32* 7]
+ pmulld m4, m15, [cq+32* 8]
+ pmulld m12, m15, [cq+32* 9]
+ pmulld m5, m15, [cq+32*10]
+ pmulld m13, m15, [cq+32*11]
+ pmulld m6, m15, [cq+32*12]
+ pmulld m14, m15, [cq+32*13]
+ pmulld m7, m15, [cq+32*14]
+ pmulld m15, [cq+32*15]
+ mova [cq], m7
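+ ; all 16 vector registers are live, so spill m7 to the already-consumed
+ ; coefficient row and fold the rounding bias back in via a memory operand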
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [cq]
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m13, m7, m15
+ vpbroadcastd m8, [pw_1697x16]
+ REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
+ vpbroadcastd m7, [pixel_10bpc_max]
+ vpbroadcastd m12, [pw_2048]
+ call .pass2_end
+ RET
+ALIGN function_align
+.pass2_end:
+ punpckhwd m9, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m13
+ punpcklwd m6, m13
+ punpckhwd m13, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m2, m3
+ punpckhwd m2, m3
+ punpckhdq m3, m0, m5
+ punpckldq m0, m5
+ punpckhdq m11, m9, m2
+ punpckldq m9, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckldq m6, m13, m1
+ punpckhdq m13, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m8, m9, m6
+ punpckhqdq m9, m6
+ punpcklqdq m10, m11, m13
+ punpckhqdq m11, m13
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
+ pmulhrsw m0, m12, m2
+ pmulhrsw m1, m12, m3
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ pmulhrsw m0, m12, m8
+ pmulhrsw m1, m12, m9
+ lea dstq, [dstq+strideq*4]
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ pmulhrsw m0, m12, m10
+ pmulhrsw m1, m12, m11
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+ ret
+
+INV_TXFM_8X16_FN dct, dct, 0, 12
+INV_TXFM_8X16_FN dct, identity, 35, 12
+INV_TXFM_8X16_FN dct, adst, 0, 12
+INV_TXFM_8X16_FN dct, flipadst, 0, 12
+
+cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*10], m2
+ mova [cq+32*12], m4
+ mova [cq+32*14], m6
+ pmaxsd m0, m12, [cq+32* 1]
+ pmaxsd m4, m12, m1
+ pmaxsd m1, m12, [cq+32* 3]
+ pmaxsd m2, m12, [cq+32* 5]
+ pmaxsd m6, m12, m5
+ pmaxsd m5, m12, m3
+ pmaxsd m3, m12, [cq+32* 7]
+ pmaxsd m7, m12
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+32* 0]
+ pmaxsd m1, m12, [cq+32* 2]
+ pmaxsd m2, m12, [cq+32* 4]
+ pmaxsd m3, m12, [cq+32* 6]
+ pmaxsd m4, m12, [cq+32* 8]
+ pmaxsd m5, m12, [cq+32*10]
+ pmaxsd m6, m12, [cq+32*12]
+ pmaxsd m7, m12, [cq+32*14]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ vpbroadcastd m11, [pd_8]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x8_internal_10bpc).pass1_rotations
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+.end:
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m2, q3120
+ vpermq m1, m3, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m6, q3120
+ vpermq m1, m7, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+ALIGN function_align
+.transpose:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ mova [cq+32* 3], m3
+ mova [cq+32* 4], m4
+ mova [cq+32* 5], m5
+ mova [cq+32* 6], m6
+ mova [cq+32* 7], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, m12
+ mova m5, m13
+ mova m6, m14
+ mova m7, m15
+ jmp m(idct_8x8_internal_12bpc).transpose_8x8
+
+INV_TXFM_8X16_FN adst, dct, 0, 12
+INV_TXFM_8X16_FN adst, adst, 0, 12
+INV_TXFM_8X16_FN adst, flipadst, 0, 12
+INV_TXFM_8X16_FN adst, identity, 35, 12
+
+cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call .pass2_main
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_end:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+ jmp m(idct_8x16_internal_12bpc).end
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ mova [cq+32* 8], m0
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*15], m7
+ pmaxsd m0, m13, [cq+32* 2] ; 2
+ pmaxsd m3, m13, m1 ; 9
+ pmaxsd m1, m13, m5 ; 13
+ pmaxsd m4, m13, m2 ; 10
+ pmaxsd m2, m13, [cq+32* 6] ; 6
+ pmaxsd m5, m13, [cq+32* 5] ; 5
+ pmaxsd m6, m13, m6 ; 14
+ pmaxsd m7, m13, [cq+32* 1] ; 1
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m12, [pd_2048]
+ vpbroadcastd m15, [pd_2896]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ pmaxsd m0, m13, [cq+32* 0] ; 0
+ pmaxsd m1, m13, [cq+32*15] ; 15
+ pmaxsd m2, m13, [cq+32* 4] ; 4
+ pmaxsd m3, m13, [cq+32*11] ; 11
+ pmaxsd m4, m13, [cq+32* 8] ; 8
+ pmaxsd m5, m13, [cq+32* 7] ; 7
+ pmaxsd m6, m13, [cq+32*12] ; 12
+ pmaxsd m7, m13, [cq+32* 3] ; 3
+ REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_16x8_internal_10bpc).main_part2
+ vpbroadcastd m14, [pd_17408]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_17407
+ pslld m15, 3 ; pd_8
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct, 0, 12
+INV_TXFM_8X16_FN flipadst, adst, 0, 12
+INV_TXFM_8X16_FN flipadst, flipadst, 0, 12
+INV_TXFM_8X16_FN flipadst, identity, 35, 12
+
+cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_8x16_internal_10bpc).pass1
+.pass2:
+ lea r6, [rsp+32*4]
+ call m(iadst_8x16_internal_12bpc).pass2_main
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ jmp m(iadst_8x16_internal_12bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct, 0, 12
+INV_TXFM_8X16_FN identity, adst, 0, 12
+INV_TXFM_8X16_FN identity, flipadst, 0, 12
+INV_TXFM_8X16_FN identity, identity, 0, 12
+
+cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ jmp m(iidentity_8x16_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m13, m7, m15
+ vpbroadcastd m7, [pixel_12bpc_max]
+ vpbroadcastd m12, [pw_16384]
+ call m(iidentity_8x16_internal_10bpc).pass2_end
+ RET
+ALIGN function_align
+.pass2_main:
+ mova [cq], m7
+ vpbroadcastd m7, [clip_18b_min]
+ REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmaxsd m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [clip_18b_max]
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pminsd m15, [cq]
+ mova [cq], m7
+ vpbroadcastd m7, [pd_5793]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ pmulld m7, [cq]
+ mova [cq], m15
+ vpbroadcastd m15, [pd_1024]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ret
+
+%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 16x4, %3
+%ifidn %1_%2, dct_dct
+ vpbroadcastd m3, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm3
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+strideq*0]
+ paddsw m2, m0, [dstq+strideq*1]
+ psubusw m1, m3
+ psubusw m2, m3
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%else
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_18b_min]
+ vpbroadcastd m9, [clip_18b_max]
+.pass1:
+ vbroadcasti128 m0, [cq+16* 0]
+ vbroadcasti128 m4, [cq+16* 4]
+ vbroadcasti128 m1, [cq+16* 2]
+ vbroadcasti128 m7, [cq+16* 6]
+ vbroadcasti128 m5, [cq+16*10]
+ vbroadcasti128 m2, [cq+16* 8]
+ vbroadcasti128 m6, [cq+16*12]
+ vbroadcasti128 m3, [cq+16*14]
+ shufpd m0, m4, 0x0c ; 0 4
+ shufpd m1, m5, 0x0c ; 2 10
+ shufpd m2, m6, 0x0c ; 8 12
+ shufpd m3, m7, 0x0c ; 14 6
+ call .pass1_main
+ vbroadcasti128 m10, [cq+16* 1]
+ vbroadcasti128 m4, [cq+16* 5]
+ vbroadcasti128 m11, [cq+16*15]
+ vbroadcasti128 m5, [cq+16*11]
+ shufpd m10, m4, 0x0c ; 1 5
+ shufpd m11, m5, 0x0c ; 15 11
+ vbroadcasti128 m5, [cq+16* 9]
+ vbroadcasti128 m4, [cq+16*13]
+ shufpd m5, m4, 0x0c ; 9 13
+ vbroadcasti128 m6, [cq+16* 7]
+ vbroadcasti128 m4, [cq+16* 3]
+ shufpd m6, m4, 0x0c ; 7 3
+ call .pass1_main2
+ pcmpeqd m4, m4
+ REPX {psubd x, m4}, m0, m1, m2, m3
+ call .pass1_main3
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call .transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(idct_16x4_internal_8bpc).main
+.end:
+ vpbroadcastd m4, [pw_2048]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+.end3:
+ lea r6, [dstq+strideq*2]
+ paddw m2, [r6 +strideq*0]
+ paddw m3, [r6 +strideq*1]
+ pxor m4, m4
+ REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ REPX {pminsw x, m5}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [r6 +strideq*0], m2
+ mova [r6 +strideq*1], m3
+ RET
+ALIGN function_align
+.pass1_main:
+ vpbroadcastd m7, [pd_2048]
+ call m(idct_8x4_internal_10bpc).main
+ psubd m3, m0, m4 ; idct8 out7 out6
+ paddd m0, m4 ; idct8 out0 out1
+ paddd m1, m2, m5 ; idct8 out3 out2
+ psubd m2, m5 ; idct8 out4 out5
+ ret
+ALIGN function_align
+.pass1_main2:
+ ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
+ ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
+ vbroadcasti128 m12, [pd_3784_m3784]
+ psubd m4, m10, m5
+ paddd m10, m5 ; t8 t11
+ psignd m4, m12 ; t9 t10
+ psubd m5, m11, m6
+ paddd m11, m6 ; t15 t12
+ psignd m5, m12 ; t14 t13
+ vpbroadcastd m6, [pd_1567]
+ vpbroadcastd m13, [pd_3784]
+ REPX {pmaxsd x, m8}, m5, m4
+ REPX {pminsd x, m9}, m5, m4
+ pmulld m12, m5
+ pmulld m5, m6
+ vbroadcasti128 m6, [pd_1567_m1567]
+ pmulld m13, m4
+ pmulld m4, m6
+ REPX {pmaxsd x, m8}, m10, m11, m0, m1
+ REPX {pminsd x, m9}, m10, m11, m0, m1
+ paddd m12, m7
+ paddd m5, m7
+ paddd m4, m12
+ psubd m5, m13
+ psrad m4, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ vpbroadcastd m12, [pd_2896]
+ punpckhqdq m6, m11, m5
+ punpcklqdq m11, m4
+ punpckhqdq m4, m10, m4
+ punpcklqdq m10, m5
+ psubd m5, m11, m6 ; t12a t13
+ paddd m11, m6 ; t15a t14
+ psubd m6, m10, m4 ; t11a t10
+ paddd m10, m4 ; t8a t9
+ REPX {pmaxsd x, m8}, m5, m6
+ REPX {pminsd x, m9}, m5, m6
+ pmulld m5, m12
+ pmulld m6, m12
+ REPX {pmaxsd x, m8}, m2, m3, m11, m10
+ REPX {pminsd x, m9}, m2, m3, m11, m10
+ ret
+ALIGN function_align
+.pass1_main3:
+ paddd m5, m7
+ psubd m4, m5, m6
+ paddd m5, m6
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ psubd m7, m0, m11 ; out15 out14
+ paddd m0, m11 ; out0 out1
+ psubd m6, m1, m5 ; out12 out13
+ paddd m1, m5 ; out3 out2
+ psubd m5, m2, m4 ; out11 out10
+ paddd m2, m4 ; out4 out5
+ psubd m4, m3, m10 ; out8 out9
+ paddd m3, m10 ; out7 out6
+ REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+ ret
+ALIGN function_align
+.transpose_4x16_packed:
+ vbroadcasti128 m8, [deint_shuf]
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ REPX {pshufb x, m8}, m0, m2, m4, m6
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m2, m4, m6
+ punpcklqdq m4, m6
+ vperm2i128 m3, m1, m2, 0x31
+ vinserti128 m1, xm2, 1
+ vperm2i128 m2, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_4x16_internal_10bpc).main
+ psrad m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ paddd m4, m5, m11
+ paddd m5, m6, m11
+ paddd m6, m7, m11
+ paddd m7, m8, m11
+.pass1_end:
+ REPX {pshufd x, x, q1032}, m0, m2, m4, m6
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(iadst_16x4_internal_8bpc).main
+ jmp m(idct_16x4_internal_10bpc).end
+ALIGN function_align
+.main:
+ vpbroadcastd m6, [pd_1321]
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ vpbroadcastd m7, [pd_2482]
+ mova m2, [cq+32*6]
+ mova m3, [cq+32*7]
+ pmulld m4, m0, m6
+ pmulld m5, m1, m6 ; 1321*in0
+ pmulld m9, m2, m7
+ pmulld m8, m3, m7 ; 2482*in3
+ paddd m4, m9
+ paddd m8, m5 ; 1321*in0 + 2482*in3
+ pmulld m5, m0, m7
+ pmulld m9, m1, m7 ; 2482*in0
+ paddd m0, m2
+ paddd m1, m3 ; in0 + in3
+ paddd m7, m6 ; pd_3803
+ pmulld m2, m7
+ pmulld m3, m7 ; 3803*in3
+ psubd m5, m2
+ psubd m9, m3 ; 2482*in0 - 3803*in3
+ mova m2, [cq+32*4]
+ pmulld m10, m7, m2
+ pmulld m3, m6, m2
+ psubd m2, m0
+ mova m0, [cq+32*5]
+ pmulld m7, m0 ; 3803*in2
+ pmulld m6, m0 ; 1321*in2
+ psubd m0, m1 ; in2 - in0 - in3
+ vpbroadcastd m1, [pd_m3344]
+ paddd m4, m10
+ paddd m7, m8 ; t0
+ psubd m5, m3
+ psubd m9, m6 ; t1
+ pmulld m2, m1
+ pmulld m0, m1 ; t2
+ pmulld m3, m1, [cq+32*2]
+ pmulld m1, [cq+32*3] ; -t3
+ ret
+ALIGN function_align
+.main_end:
+ ; expects: m6 = rnd
+ paddd m5, m6
+ paddd m9, m6
+ paddd m10, m4, m5
+ paddd m4, m6
+ paddd m8, m7, m6
+ paddd m7, m9
+ psubd m4, m3 ; out0 (unshifted)
+ psubd m5, m3 ; out1 (unshifted)
+ paddd m2, m6 ; out2 (unshifted)
+ paddd m3, m10 ; out3 (unshifted)
+ psubd m8, m1 ; out4 (unshifted)
+ psubd m9, m1 ; out5 (unshifted)
+ paddd m6, m0 ; out6 (unshifted)
+ paddd m7, m1 ; out7 (unshifted)
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ call m(iadst_4x16_internal_10bpc).main
+ psrad m11, 11 ; pd_1
+ paddd m4, m3, m11
+ paddd m3, m5, m11
+ paddd m5, m2, m11
+ paddd m2, m6, m11
+ paddd m6, m1, m11
+ paddd m1, m7, m11
+ paddd m7, m0, m11
+ paddd m0, m8, m11
+ jmp m(iadst_16x4_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ lea r6, [deint_shuf+128]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m4, [pw_2048]
+ pmulhrsw m5, m3, m4
+ pmulhrsw m6, m2, m4
+ pmulhrsw m2, m1, m4
+ pmulhrsw m3, m0, m4
+ paddw m0, m5, [dstq+strideq*0]
+ paddw m1, m6, [dstq+strideq*1]
+ vpbroadcastd m5, [pixel_10bpc_max]
+ jmp m(idct_16x4_internal_10bpc).end3
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_5793]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m4, [cq+32*4], q3120 ; 8 9
+ vpermq m5, [cq+32*5], q3120 ; a b
+ vpermq m6, [cq+32*6], q3120 ; c d
+ vpermq m7, [cq+32*7], q3120 ; e f
+ vpbroadcastd m9, [pd_3072]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ vpbroadcastd m7, [pw_1697x8]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_16x4_internal_10bpc).end
+
+INV_TXFM_16X4_FN dct, dct, 12
+INV_TXFM_16X4_FN dct, identity, 12
+INV_TXFM_16X4_FN dct, adst, 12
+INV_TXFM_16X4_FN dct, flipadst, 12
+
+cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [clip_20b_min]
+ vpbroadcastd m9, [clip_20b_max]
+ jmp m(idct_16x4_internal_10bpc).pass1
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ ; deinterleave
+ REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+ ; transpose
+ punpcklqdq m8, m0, m1
+ punpckhqdq m0, m1
+ punpcklqdq m9, m2, m3
+ punpckhqdq m2, m3
+ punpcklqdq m10, m4, m5
+ punpckhqdq m4, m5
+ punpcklqdq m11, m6, m7
+ punpckhqdq m6, m7
+ vperm2i128 m3, m0, m2, 0x31 ; out6
+ vperm2i128 m1, m0, m2, 0x20 ; out2
+ vperm2i128 m7, m4, m6, 0x31 ; out7
+ vperm2i128 m5, m4, m6, 0x20 ; out3
+ vperm2i128 m13, m10, m11, 0x31 ; out5
+ vperm2i128 m12, m10, m11, 0x20 ; out1
+ vperm2i128 m11, m8, m9, 0x31 ; out4
+ vperm2i128 m10, m8, m9, 0x20 ; out0
+ call m(idct_4x16_internal_10bpc).pass1_main
+ pmulld m0, m6, m10
+ pmulld m2, m6, m11
+ pmulld m4, m6, m12
+ pmulld m6, m13
+ vpbroadcastd m10, [pd_17408]
+ call m(idct_4x16_internal_10bpc).pass1_main2
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m5, [pixel_12bpc_max]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN adst, dct, 12
+INV_TXFM_16X4_FN adst, adst, 12
+INV_TXFM_16X4_FN adst, flipadst, 12
+INV_TXFM_16X4_FN adst, identity, 12
+
+cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iadst_16x4_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ jmp m(idct_16x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
+ pmaxsd m8, m4, m12
+ pmaxsd m9, m5, m12
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ mova [cq+32*0], m0
+ mova [cq+32*2], m1
+ mova [cq+32*4], m2
+ mova [cq+32*6], m3
+ pminsd m0, m8, m13
+ pminsd m1, m9, m13
+ pminsd m2, m6, m13
+ pminsd m3, m7, m13
+ call m(iadst_8x4_internal_12bpc).transpose_4x8
+ mova [cq+32*1], m0
+ mova [cq+32*3], m1
+ mova [cq+32*5], m2
+ mova [cq+32*7], m3
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_2048]
+ call m(iadst_16x4_internal_10bpc).main_end
+ psrad m0, m4, 15
+ psrad m1, m5, 15
+ psrad m2, 15
+ psrad m3, 15
+ psrad m4, m8, 15
+ psrad m5, m9, 15
+ psrad m6, 15
+ psrad m7, 15
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpbroadcastd m4, [pw_16384]
+ vpbroadcastd m5, [pixel_12bpc_max]
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct, 12
+INV_TXFM_16X4_FN flipadst, adst, 12
+INV_TXFM_16X4_FN flipadst, flipadst, 12
+INV_TXFM_16X4_FN flipadst, identity, 12
+
+cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(iflipadst_16x4_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x4_internal_12bpc).pass2_main
+ vpermq m7, m0, q3120
+ vpermq m6, m1, q3120
+ vpermq m1, m2, q3120
+ vpermq m0, m3, q3120
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m6, m4
+ pmulhrsw m3, m7, m4
+ jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN identity, dct, 12
+INV_TXFM_16X4_FN identity, adst, 12
+INV_TXFM_16X4_FN identity, flipadst, 12
+INV_TXFM_16X4_FN identity, identity, 12
+
+cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+ vpbroadcastd m8, [pd_1697]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpbroadcastd m9, [pd_3072]
+ pmulld m4, m8, m0
+ pmulld m5, m8, m1
+ pmulld m6, m8, m2
+ pmulld m7, m8, m3
+ vpermq m10, [cq+32*4], q3120 ; 8 9
+ vpermq m11, [cq+32*5], q3120 ; a b
+ vpermq m12, [cq+32*6], q3120 ; c d
+ vpermq m13, [cq+32*7], q3120 ; e f
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m0, m4
+ pmulld m4, m8, m10
+ paddd m1, m5
+ pmulld m5, m8, m11
+ paddd m2, m6
+ pmulld m6, m8, m12
+ paddd m3, m7
+ pmulld m7, m8, m13
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m4, m10
+ paddd m5, m11
+ paddd m6, m12
+ paddd m7, m13
+ jmp tx2q
+.pass2:
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_2048]
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+ vpbroadcastd m4, [pw_16384]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ vpbroadcastd m5, [pixel_12bpc_max]
+ jmp m(idct_16x4_internal_10bpc).end2
+
+%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 16x8, %3
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%3bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m14, [pd_2896]
+ pmulld m0, m14, [cq+32* 1]
+ pmulld m1, m14, [cq+32* 3]
+ pmulld m2, m14, [cq+32* 5]
+ pmulld m3, m14, [cq+32* 7]
+ pmulld m4, m14, [cq+32* 9]
+ pmulld m5, m14, [cq+32*11]
+ pmulld m6, m14, [cq+32*13]
+ pmulld m7, m14, [cq+32*15]
+ vpbroadcastd m11, [pd_2048]
+ lea r6, [rsp+32*4]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+32* 0]
+ pmulld m1, m14, [cq+32* 2]
+ pmulld m2, m14, [cq+32* 4]
+ pmulld m3, m14, [cq+32* 6]
+ pmulld m4, m14, [cq+32* 8]
+ pmulld m5, m14, [cq+32*10]
+ pmulld m6, m14, [cq+32*12]
+ pmulld m7, m14, [cq+32*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .pass1_rotations
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m10, [pw_2048]
+.end:
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ call .write_16x4_start
+.end2:
+ pmulhrsw m0, m4, m10
+ pmulhrsw m1, m5, m10
+ pmulhrsw m2, m6, m10
+ pmulhrsw m3, m7, m10
+ call .write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
+ mova m14, [r6-32*4]
+ mova m13, [r6-32*3]
+ mova m12, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m10, [r6+32*0]
+ mova m9, [r6+32*1]
+ mova m8, [r6+32*2]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r6+32*3] ; out8
+ paddd m7, [r6+32*3] ; out7
+ ret
+ALIGN function_align
+.transpose:
+ lea r6, [deint_shuf+128]
+.transpose2:
+ packssdw m0, m8
+ packssdw m1, m9
+ packssdw m2, m10
+ packssdw m3, m11
+ packssdw m4, m12
+ packssdw m5, m13
+ packssdw m6, m14
+ packssdw m7, m15
+.transpose3:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m6, m7
+ punpcklwd m6, m7
+ punpckhdq m7, m4, m6
+ punpckldq m4, m6
+ punpckldq m6, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m3, m5
+ punpckldq m3, m5
+ punpcklqdq m5, m6, m3
+ punpckhqdq m6, m3
+ punpckhqdq m3, m2, m7
+ punpcklqdq m2, m7
+ punpcklqdq m7, m8, m1
+ punpckhqdq m8, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ vperm2i128 m4, m0, m5, 0x31
+ vinserti128 m0, xm5, 1
+ vperm2i128 m5, m1, m6, 0x31
+ vinserti128 m1, xm6, 1
+ vperm2i128 m6, m2, m7, 0x31
+ vinserti128 m2, xm7, 1
+ vperm2i128 m7, m3, m8, 0x31
+ vinserti128 m3, xm8, 1
+ ret
+ALIGN function_align
+.write_16x4_start:
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+.write_16x4_zero:
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 32*8
+.write_16x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3 ]
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ lea r6, [rsp+32*4]
+ call .main
+ vpbroadcastd m14, [pd_3072]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_3071
+ call .pass1_rotations
+.pass1_end:
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
+ jmp tx2q
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ vpbroadcastd m10, [pw_2048]
+ pxor m11, m11
+ psubw m11, m10
+ pmulhrsw m0, m10
+ pmulhrsw m1, m11
+ pmulhrsw m2, m10
+ pmulhrsw m3, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m4, m10
+ pmulhrsw m1, m5, m11
+ pmulhrsw m2, m6, m10
+ pmulhrsw m3, m7, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
+ paddd m0, m15
+ psubd m1, m15, m1
+ paddd m2, m15
+ psubd m3, m15, m3
+ paddd m4, m14
+ psubd m5, m13, m5
+ paddd m6, m14
+ psubd m7, m13, m7
+ paddd m8, m14, m9
+ psubd m9, m13, m10
+ paddd m10, m14, m11
+ psubd m11, m13, m12
+ paddd m12, m15, [r6-32*1]
+ psubd m13, m15, [r6-32*2]
+ paddd m14, m15, [r6-32*3]
+ psubd m15, [r6-32*4]
+ ret
+ALIGN function_align
+.main:
+ ; expects: m13 = clip_min m14 = clip_max
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 2]
+ pmulld m1, m15, [cq+32*13]
+ pmulld m2, m15, [cq+32* 6]
+ pmulld m3, m15, [cq+32* 9]
+ pmulld m4, m15, [cq+32*10]
+ pmulld m5, m15, [cq+32* 5]
+ pmulld m6, m15, [cq+32*14]
+ pmulld m7, m15, [cq+32* 1]
+ vpbroadcastd m12, [pd_2048]
+ REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ call .main_part1
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m1, m15, [cq+32*15]
+ pmulld m2, m15, [cq+32* 4]
+ pmulld m3, m15, [cq+32*11]
+ pmulld m4, m15, [cq+32* 8]
+ pmulld m5, m15, [cq+32* 7]
+ pmulld m6, m15, [cq+32*12]
+ pmulld m7, m15, [cq+32* 3]
+ REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_part2:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380
+ psubd m8, m0, m4 ; t8a
+ paddd m0, m4 ; t0a
+ psubd m4, m1, m5 ; t9a
+ paddd m1, m5 ; t1a
+ psubd m5, m2, m6 ; t12a
+ paddd m2, m6 ; t4a
+ psubd m6, m3, m7 ; t13a
+ paddd m7, m3 ; t5a
+ REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+ vpbroadcastd m11, [pd_4017]
+ vpbroadcastd m10, [pd_799]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+ psubd m3, m0, m2 ; t4
+ paddd m0, m2 ; t0
+ psubd m2, m1, m7 ; t5
+ paddd m1, m7 ; t1
+ psubd m7, m4, m6 ; t12a
+ paddd m4, m6 ; t8a
+ psubd m6, m8, m5 ; t13a
+ paddd m5, m8 ; t9a
+ REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
+ vpbroadcastd m11, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11
+ pminsd m10, m14, [r6-32*4] ; t2
+ pminsd m8, m14, [r6-32*3] ; t3
+ psubd m9, m0, m10 ; t2a
+ paddd m0, m10 ; out0
+ psubd m10, m1, m8 ; t3a
+ paddd m1, m8 ; -out15
+ pmaxsd m9, m13
+ pmaxsd m10, m13
+ pminsd m9, m14
+ pminsd m10, m14
+ mova [r6-32*4], m1
+ mova m11, [r6-32*1] ; t7a
+ mova m1, [r6-32*2] ; t6a
+ psubd m8, m3, m11 ; t7
+ paddd m11, m3 ; out12
+ paddd m3, m2, m1 ; -out3
+ psubd m2, m1 ; t6
+ pmaxsd m8, m13
+ pmaxsd m2, m13
+ pminsd m8, m14
+ pminsd m2, m14
+ mova [r6-32*1], m11
+ mova [r6-32*3], m2
+ mova m1, [r6+32*3] ; t15
+ mova m2, [r6+32*2] ; t14
+ paddd m12, m7, m1 ; -out13
+ psubd m7, m1 ; t15a
+ psubd m11, m6, m2 ; t14a
+ paddd m2, m6 ; out2
+ pmaxsd m7, m13
+ pmaxsd m11, m13
+ pminsd m7, m14
+ pminsd m11, m14
+ mova [r6-32*2], m12
+ pminsd m1, m14, [r6+32*0] ; t10a
+ pminsd m12, m14, [r6+32*1] ; t11a
+ psubd m6, m4, m1 ; t10
+ paddd m1, m4 ; -out1
+ psubd m4, m5, m12 ; t11
+ paddd m5, m12 ; out14
+ vpbroadcastd m12, [pd_1448]
+ pmaxsd m6, m13
+ pmaxsd m4, m13
+ pminsd m6, m14
+ pminsd m4, m14
+ REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
+ pmulld m12, [r6-32*3] ; t6
+ mova [r6-32*3], m5
+ paddd m5, m11, m7 ; -out5 (unshifted)
+ psubd m11, m7 ; out10 (unshifted)
+ paddd m7, m9, m10 ; -out7 (unshifted)
+ psubd m9, m10 ; out8 (unshifted)
+ psubd m10, m6, m4 ; -out9 (unshifted)
+ paddd m6, m4 ; out6 (unshifted)
+ paddd m4, m12, m8 ; out4 (unshifted)
+ psubd m12, m8 ; -out11 (unshifted)
+ ret
+.main_part1:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601
+ psubd m8, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m3, m7 ; t15a
+ paddd m7, m3 ; t7a
+ REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+ vpbroadcastd m11, [pd_2276]
+ vpbroadcastd m10, [pd_3406]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m8, m5 ; t15a
+ paddd m5, m8 ; t11a
+ REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
+ vpbroadcastd m11, [pd_1567]
+ vpbroadcastd m10, [pd_3784]
+ ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11
+ ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ lea r6, [rsp+32*4]
+ call m(iadst_16x8_internal_10bpc).main
+ vpbroadcastd m14, [pd_3072]
+ psrld m15, 11
+ psubd m13, m14, m15
+ call .pass1_rotations
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ vpbroadcastd m10, [pw_2048]
+ pxor m11, m11
+ psubw m11, m10
+ mova m12, m0
+ pmulhrsw m0, m7, m11
+ mova m7, m1
+ pmulhrsw m1, m6, m10
+ mova m6, m2
+ pmulhrsw m2, m5, m11
+ mova m5, m3
+ pmulhrsw m3, m4, m10
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m5, m11
+ pmulhrsw m1, m6, m10
+ pmulhrsw m2, m7, m11
+ pmulhrsw m3, m12, m10
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass1_rotations:
+ psubd m8, m13, m7
+ paddd m7, m14, m9
+ paddd m9, m14, m6
+ psubd m6, m13, m10
+ psubd m10, m13, m5
+ paddd m5, m14, m11
+ paddd m11, m14, m4
+ psubd m4, m13, m12
+ psubd m12, m15, m3
+ paddd m3, m15, [r6-32*1]
+ paddd m13, m15, m2
+ psubd m2, m15, [r6-32*2]
+ psubd m14, m15, m1
+ mova m1, m15
+ paddd m15, m0
+ psubd m0, m1, [r6-32*4]
+ paddd m1, [r6-32*3]
+ ret
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ pmulld m0, m15, [cq+32* 0]
+ pmulld m1, m15, [cq+32* 1]
+ pmulld m2, m15, [cq+32* 2]
+ pmulld m3, m15, [cq+32* 3]
+ pmulld m4, m15, [cq+32* 4]
+ pmulld m5, m15, [cq+32* 5]
+ pmulld m6, m15, [cq+32* 6]
+ pmulld m7, m15, [cq+32* 7]
+ pmulld m8, m15, [cq+32* 8]
+ pmulld m9, m15, [cq+32* 9]
+ pmulld m10, m15, [cq+32*10]
+ pmulld m11, m15, [cq+32*11]
+ pmulld m12, m15, [cq+32*12]
+ pmulld m13, m15, [cq+32*13]
+ pmulld m14, m15, [cq+32*14]
+ pmulld m15, [cq+32*15]
+ mova [rsp], m7
+ vpbroadcastd m7, [pd_2048]
+ REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [rsp]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ mova [rsp], m15
+ vpbroadcastd m15, [pd_5793]
+ REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pmulld m15, [rsp]
+ mova [rsp], m7
+ vpbroadcastd m7, [pd_3072]
+ REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [rsp]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose
+ vpbroadcastd m10, [pw_4096]
+ jmp m(idct_16x8_internal_10bpc).end
+
+INV_TXFM_16X8_FN dct, dct, 12
+INV_TXFM_16X8_FN dct, identity, 12
+INV_TXFM_16X8_FN dct, adst, 12
+INV_TXFM_16X8_FN dct, flipadst, 12
+
+cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_16x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ RET
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m11, [pd_2048]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_12bpc).round_shift4
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ pmaxsd m0, m12, [cq+32*0]
+ pmaxsd m1, m12, [cq+32*1]
+ pmaxsd m2, m12, [cq+32*2]
+ pmaxsd m3, m12, [cq+32*3]
+ pmaxsd m4, m12, [cq+32*4]
+ pmaxsd m5, m12, [cq+32*5]
+ pmaxsd m6, m12, [cq+32*6]
+ pmaxsd m7, m12, [cq+32*7]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_12bpc).round_shift4
+.end:
+ packssdw m0, [cq+32* 8]
+ packssdw m1, [cq+32* 9]
+ packssdw m2, [cq+32*10]
+ packssdw m3, [cq+32*11]
+ packssdw m4, [cq+32*12]
+ packssdw m5, [cq+32*13]
+ packssdw m6, [cq+32*14]
+ packssdw m7, [cq+32*15]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ call .write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q3120
+ vpermq m2, m6, q3120
+ vpermq m3, m7, q3120
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.write_16x4_start:
+ vpbroadcastd m9, [pixel_12bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+ ret
+
+INV_TXFM_16X8_FN adst, dct, 12
+INV_TXFM_16X8_FN adst, adst, 12
+INV_TXFM_16X8_FN adst, flipadst, 12
+INV_TXFM_16X8_FN adst, identity, 12
+
+cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iadst_16x8_internal_10bpc).pass1
+.pass2:
+ call .pass2_main
+ call m(idct_16x8_internal_12bpc).end
+ RET
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x16_internal_12bpc).transpose
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m11, [pd_2048]
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_8x8_internal_12bpc).pass2_main2
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ pmaxsd m0, m12, [cq+32*0]
+ pmaxsd m1, m12, [cq+32*1]
+ pmaxsd m2, m12, [cq+32*2]
+ pmaxsd m3, m12, [cq+32*3]
+ pmaxsd m4, m12, [cq+32*4]
+ pmaxsd m5, m12, [cq+32*5]
+ pmaxsd m6, m12, [cq+32*6]
+ pmaxsd m7, m12, [cq+32*7]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(iadst_8x8_internal_12bpc).pass2_main2
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct, 12
+INV_TXFM_16X8_FN flipadst, adst, 12
+INV_TXFM_16X8_FN flipadst, flipadst, 12
+INV_TXFM_16X8_FN flipadst, identity, 12
+
+cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iflipadst_16x8_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x8_internal_12bpc).pass2_main
+ packssdw m13, m0, [cq+32* 8]
+ packssdw m12, m1, [cq+32* 9]
+ packssdw m11, m2, [cq+32*10]
+ packssdw m10, m3, [cq+32*11]
+ packssdw m3, m4, [cq+32*12]
+ packssdw m2, m5, [cq+32*13]
+ packssdw m1, m6, [cq+32*14]
+ packssdw m0, m7, [cq+32*15]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpermq m0, m10, q3120
+ vpermq m1, m11, q3120
+ vpermq m2, m12, q3120
+ vpermq m3, m13, q3120
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+
+INV_TXFM_16X8_FN identity, dct, 12
+INV_TXFM_16X8_FN identity, adst, 12
+INV_TXFM_16X8_FN identity, flipadst, 12
+INV_TXFM_16X8_FN identity, identity, 12
+
+cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+ jmp m(iidentity_16x8_internal_10bpc).pass1
+.pass2:
+ call m(idct_16x8_internal_10bpc).transpose2
+ vpbroadcastd m10, [pw_4096]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ jmp m(idct_16x8_internal_10bpc).end2
+
+%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+ INV_TXFM_FN %1, %2, %3, 16x16, %4
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%4bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+.pass1:
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call .main
+ sub cq, 32
+ mova m10, [r6-32*4]
+ mova m9, [r6-32*3]
+ mova m8, [r6-32*2]
+ psubd m15, m0, m10 ; out15
+ paddd m0, m10 ; out0
+ psubd m10, m1, m9 ; out14
+ paddd m1, m9 ; out1
+ psubd m9, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ REPX {psrad x, 2}, m0, m1, m2
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova m2, [r6-32*1]
+ mova m1, [r6+32*0]
+ mova m0, [r6+32*1]
+ REPX {psrad x, 2}, m9, m10, m15
+ psubd m8, m3, m2 ; out12
+ paddd m3, m2 ; out3
+ psubd m2, m4, m1 ; out11
+ paddd m4, m1 ; out4
+ psubd m1, m5, m0 ; out10
+ paddd m5, m0 ; out5
+ REPX {psrad x, 2}, m3, m4, m5
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova m4, [r6+32*2]
+ mova m3, [r6+32*3]
+ REPX {psrad x, 2}, m1, m2, m8
+ psubd m5, m6, m4 ; out9
+ paddd m6, m4 ; out6
+ psubd m4, m7, m3 ; out8
+ paddd m7, m3 ; out7
+ REPX {psrad x, 2}, m6, m7, m4, m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ add r6, 32*8
+ mova [r6-32*4], m4
+ mova [r6-32*3], m5
+ mova [r6-32*2], m1
+ mova [r6-32*1], m2
+ mova [r6+32*0], m8
+ mova [r6+32*1], m9
+ mova [r6+32*2], m10
+ mova [r6+32*3], m15
+.fast:
+ add r6, 32*8
+ call .main
+ mova m14, [r6-32*4]
+ mova m13, [r6-32*3]
+ mova m12, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m10, [r6+32*0]
+ mova m9, [r6+32*1]
+ mova m8, [r6+32*2]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r6+32*3] ; out8
+ paddd m7, [r6+32*3] ; out7
+ sub r6, 32*8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call .transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+.end:
+ call .write_16x16
+ RET
+ALIGN function_align
+.write_16x16:
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_2048]
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+.write_16x16_2:
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m12, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m12, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+gprsize+32*0]
+ pmulhrsw m1, m12, [rsp+gprsize+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m12, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+gprsize+32*2]
+ pmulhrsw m1, m12, m13
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m12, m15
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.transpose:
+ test eobd, eobd
+ jl .transpose_fast
+ packssdw m8, [r6-32*4]
+ packssdw m9, [r6-32*3]
+ packssdw m10, [r6-32*2]
+ packssdw m11, [r6-32*1]
+ packssdw m12, [r6+32*0]
+ packssdw m13, [r6+32*1]
+ packssdw m14, [r6+32*2]
+ packssdw m15, [r6+32*3]
+ sub r6, 32*8
+ packssdw m0, [r6-32*4]
+ packssdw m1, [r6-32*3]
+ packssdw m2, [r6-32*2]
+ packssdw m3, [r6-32*1]
+ packssdw m4, [r6+32*0]
+ packssdw m5, [r6+32*1]
+ packssdw m6, [r6+32*2]
+ packssdw m7, [r6+32*3]
+ mova [r6], m8
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpckhwd m3, m6, m7
+ punpcklwd m6, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m7, m6
+ punpckldq m7, m6
+ punpckhdq m6, m4, m3
+ punpckldq m4, m3
+ punpckhqdq m3, m2, m1
+ punpcklqdq m2, m1
+ punpckhqdq m1, m0, m7
+ punpcklqdq m0, m7
+ punpcklqdq m7, m8, m6
+ punpckhqdq m8, m6
+ punpckhqdq m6, m5, m4
+ punpcklqdq m5, m4
+ mova m4, [r6]
+ mova [r6], m8
+ punpcklwd m8, m4, m9
+ punpckhwd m4, m9
+ punpcklwd m9, m10, m11
+ punpckhwd m10, m11
+ punpckhwd m11, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m12, m13
+ punpcklwd m12, m13
+ punpckldq m13, m4, m10
+ punpckhdq m4, m10
+ punpckhdq m10, m8, m9
+ punpckldq m8, m9
+ punpckhdq m9, m12, m14
+ punpckldq m12, m14
+ punpckhdq m14, m15, m11
+ punpckldq m15, m11
+ punpckhqdq m11, m10, m9
+ punpcklqdq m10, m9
+ punpckhqdq m9, m8, m12
+ punpcklqdq m8, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m4, m14
+ punpcklqdq m14, m4, m14
+ vperm2i128 m4, m0, m8, 0x31
+ vinserti128 m0, xm8, 1
+ vinserti128 m8, m5, xm12, 1
+ vperm2i128 m12, m5, 0x13
+ vperm2i128 m5, m1, m9, 0x31
+ vinserti128 m1, xm9, 1
+ vinserti128 m9, m6, xm13, 1
+ vperm2i128 m13, m6, 0x13
+ vperm2i128 m6, m2, m10, 0x31
+ vinserti128 m2, xm10, 1
+ vinserti128 m10, m7, xm14, 1
+ vperm2i128 m14, m7, 0x13
+ vperm2i128 m7, m3, m11, 0x31
+ vinserti128 m3, xm11, 1
+ mova xm11, [r6]
+ vinserti128 m11, xm15, 1
+ vinserti128 m15, [r6+16], 0
+ ret
+.transpose_fast:
+ call m(idct_16x8_internal_10bpc).transpose2
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ ret
+ALIGN function_align
+.main:
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64* 3]
+ mova m2, [cq+64* 5]
+ mova m3, [cq+64* 7]
+ mova m4, [cq+64* 9]
+ mova m5, [cq+64*11]
+ mova m6, [cq+64*13]
+ mova m7, [cq+64*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m10, m11, 10 ; pd_2
+ REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call .main
+ sub cq, 32
+ vpbroadcastd m8, [pd_5120]
+ paddd m4, m8
+ paddd m6, m8
+ paddd m9, m8
+ paddd m11, m8
+ vpbroadcastd m8, [pd_5119]
+ psubd m5, m8, m5
+ psubd m7, m8, m7
+ psubd m10, m8, m10
+ psubd m12, m8, m12
+ REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ psrld m4, m15, 10 ; pd_2
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m2, m4
+ psubd m3, m4, m3
+ psubd m7, m4, [r6-32*4]
+ paddd m6, m4, [r6-32*3]
+ psubd m5, m4, [r6-32*2]
+ paddd m4, [r6-32*1]
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ add r6, 32*8
+ mova [r6-32*4], m9
+ mova [r6-32*3], m10
+ mova [r6-32*2], m11
+ mova [r6-32*1], m12
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+.fast:
+ add r6, 32*8
+ call .main
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
+ psrld m15, 10 ; pd_2
+ paddd m0, m15
+ psubd m1, m15, m1
+ paddd m2, m15
+ psubd m3, m15, m3
+ paddd m4, m14
+ psubd m5, m13, m5
+ paddd m6, m14
+ psubd m7, m13, m7
+ paddd m8, m14, m9
+ psubd m9, m13, m10
+ paddd m10, m14, m11
+ psubd m11, m13, m12
+ paddd m12, m15, [r6-32*1]
+ psubd m13, m15, [r6-32*2]
+ paddd m14, m15, [r6-32*3]
+ psubd m15, [r6-32*4]
+.pass1_end:
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+ sub r6, 32*8
+ jmp tx2q
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ mova [rsp+32*0], m8
+ mova [rsp+32*2], m12
+ mova [rsp+32*3], m13
+ vpbroadcastd m12, [pw_2048]
+ pxor m13, m13
+ psubw m13, m12
+ pmulhrsw m0, m12
+ pmulhrsw m1, m13, [rsp+32*1]
+ mova [rsp+32*1], m9
+ pmulhrsw m2, m12
+ pmulhrsw m3, m13
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m13, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m13, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+32*0]
+ pmulhrsw m1, m13, [rsp+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m13, m11
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m12, [rsp+32*2]
+ pmulhrsw m1, m13, [rsp+32*3]
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m13, m15
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64* 2]
+ mova m1, [cq+64*13]
+ mova m2, [cq+64* 6]
+ mova m3, [cq+64* 9]
+ mova m4, [cq+64*10]
+ mova m5, [cq+64* 5]
+ mova m6, [cq+64*14]
+ mova m7, [cq+64* 1]
+ vpbroadcastd m12, [pd_2048]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64*15]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64*11]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64* 7]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64* 3]
+ jmp m(iadst_16x8_internal_10bpc).main_part2
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+.pass1:
+ vpbroadcastd m15, [pd_2896]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ add cq, 32
+ call m(iadst_16x16_internal_10bpc).main
+ sub cq, 32
+ vpbroadcastd m8, [pd_5120]
+ paddd m11, m8
+ paddd m9, m8
+ paddd m6, m8
+ paddd m4, m8
+ vpbroadcastd m8, [pd_5119]
+ psubd m12, m8, m12
+ psubd m10, m8, m10
+ psubd m7, m8, m7
+ psubd m5, m8, m5
+ REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
+ mova [r6+32*0], m12
+ mova [r6+32*1], m11
+ mova [r6+32*2], m10
+ mova [r6+32*3], m9
+ psrld m9, m15, 10 ; pd_2
+ psubd m3, m9, m3
+ paddd m2, m9
+ psubd m1, m9, m1
+ paddd m0, m9
+ psubd m12, m9, [r6-32*4]
+ paddd m11, m9, [r6-32*3]
+ psubd m10, m9, [r6-32*2]
+ paddd m9, [r6-32*1]
+ REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
+ mova [r6-32*4], m12
+ mova [r6-32*3], m11
+ mova [r6-32*2], m10
+ mova [r6-32*1], m9
+ add r6, 32*8
+ mova [r6-32*4], m7
+ mova [r6-32*3], m6
+ mova [r6-32*2], m5
+ mova [r6-32*1], m4
+ mova [r6+32*0], m3
+ mova [r6+32*1], m2
+ mova [r6+32*2], m1
+ mova [r6+32*3], m0
+.fast:
+ add r6, 32*8
+ call m(iadst_16x16_internal_10bpc).main
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
+ psrld m15, 10 ; pd_2
+ psubd m8, m13, m7
+ paddd m7, m14, m9
+ paddd m9, m14, m6
+ psubd m6, m13, m10
+ psubd m10, m13, m5
+ paddd m5, m14, m11
+ paddd m11, m14, m4
+ psubd m4, m13, m12
+ psubd m12, m15, m3
+ paddd m3, m15, [r6-32*1]
+ paddd m13, m15, m2
+ psubd m2, m15, [r6-32*2]
+ psubd m14, m15, m1
+ mova m1, m15
+ paddd m15, m0
+ psubd m0, m1, [r6-32*4]
+ paddd m1, [r6-32*3]
+ jmp m(iadst_16x16_internal_10bpc).pass1_end
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+ lea r6, [pw_5+128]
+ mova [rsp], m15
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ mova [rsp+32*3], m3
+ mova [rsp+32*2], m2
+ mova [rsp+32*0], m0
+ mova m2, m13
+ mova m3, m12
+ vpbroadcastd m12, [pw_2048]
+ pxor m13, m13
+ psubw m13, m12
+ pmulhrsw m0, m13, m15
+ pmulhrsw m1, m12, m14
+ pmulhrsw m2, m13
+ pmulhrsw m3, m12
+ mova m14, m8
+ mova m15, m9
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m13, m11
+ pmulhrsw m1, m12, m10
+ pmulhrsw m2, m13, m15
+ pmulhrsw m3, m12, m14
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m13, m7
+ pmulhrsw m1, m12, m6
+ pmulhrsw m2, m13, m5
+ pmulhrsw m3, m12, m4
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ pmulhrsw m0, m13, [rsp+32*3]
+ pmulhrsw m1, m12, [rsp+32*2]
+ pmulhrsw m2, m13, [rsp+32*1]
+ pmulhrsw m3, m12, [rsp+32*0]
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m15, [pd_5793]
+ vpbroadcastd m7, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ pmulld m0, m15, [cq+r3+32*33]
+ pmulld m1, m15, [cq+r3+32*35]
+ pmulld m2, m15, [cq+r3+32*37]
+ pmulld m3, m15, [cq+r3+32*39]
+ add r6, 32*4
+ REPX {paddd x, m7}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ pmulld m0, m15, [cq+64* 0]
+ pmulld m1, m15, [cq+64* 1]
+ pmulld m2, m15, [cq+64* 2]
+ pmulld m3, m15, [cq+64* 3]
+ pmulld m4, m15, [cq+64* 4]
+ pmulld m5, m15, [cq+64* 5]
+ pmulld m6, m15, [cq+64* 6]
+ pmulld m8, m15, [cq+64* 7]
+ mova [cq], m8
+ pmulld m8, m15, [cq+64* 8]
+ pmulld m9, m15, [cq+64* 9]
+ pmulld m10, m15, [cq+64*10]
+ pmulld m11, m15, [cq+64*11]
+ pmulld m12, m15, [cq+64*12]
+ pmulld m13, m15, [cq+64*13]
+ pmulld m14, m15, [cq+64*14]
+ pmulld m15, [cq+64*15]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ paddd m7, [cq]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp tx2q
+.pass2:
+ call m(idct_16x16_internal_10bpc).transpose
+
+ mova [cq+32*0], m15
+ mova [cq+32*1], m0
+ vpbroadcastd m15, [pw_1697x16]
+
+ REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14
+ mova m0, [cq+32*1]
+ mova [cq+32*1], m1
+ IDTX16 0, 1, 15
+ mova m1, [cq+32*0]
+ pmulhrsw m15, m1
+ paddsw m1, m1
+ paddsw m15, m1
+ mova m1, [cq+32*1]
+ jmp m(idct_16x16_internal_10bpc).end
+
+INV_TXFM_16X16_FN dct, dct, 0, 12
+INV_TXFM_16X16_FN dct, identity, 28, 12
+INV_TXFM_16X16_FN dct, adst, 0, 12
+INV_TXFM_16X16_FN dct, flipadst, 0, 12
+
+cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ jmp m(idct_16x16_internal_10bpc).pass1
+.pass2:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ mova [cq+32*12], m12
+ mova [cq+32*13], m13
+ mova [cq+32*14], m14
+ mova [cq+32*15], m15
+ call .pass2_main
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+ mov r5, r6
+ add r6, 32*16
+ call .pass2_main
+ jmp m(iadst_16x16_internal_12bpc).end
+ALIGN function_align
+.write_16x16:
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_16384]
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ jmp m(idct_16x16_internal_10bpc).write_16x16_2
+ALIGN function_align
+.pass2_main:
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m2
+ mova [cq+32* 2], m4
+ mova [cq+32* 3], m6
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, m1
+ pmaxsd m1, m12, m3
+ pmaxsd m2, m12, m5
+ pmaxsd m3, m12, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ test eobd, eobd
+ jge .pass2_slow
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ jmp .pass2_fast
+.pass2_slow:
+ sub r6, 32*8
+ mova m8, [r6-32*4]
+ mova m4, [r6-32*3]
+ mova m10, [r6-32*2]
+ mova m5, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m6, [r6+32*1]
+ mova m14, [r6+32*2]
+ mova m7, [r6+32*3]
+ TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
+ mova [cq+32* 4], m8
+ mova [cq+32* 5], m10
+ mova [cq+32* 6], m12
+ mova [cq+32* 7], m14
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ REPX {pmaxsd x, m12}, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast:
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+32* 0]
+ pmaxsd m1, m12, [cq+32* 1]
+ pmaxsd m2, m12, [cq+32* 2]
+ pmaxsd m3, m12, [cq+32* 3]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ test eobd, eobd
+ jge .pass2_slow2
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ jmp .pass2_fast2
+.pass2_slow2:
+ pmaxsd m4, m12, [cq+32* 4]
+ pmaxsd m5, m12, [cq+32* 5]
+ pmaxsd m6, m12, [cq+32* 6]
+ pmaxsd m7, m12, [cq+32* 7]
+ REPX {pminsd x, m13}, m4, m5, m6, m7
+.pass2_fast2:
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrad m11, 8 ; pd_8
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_16x8_internal_10bpc).pass1_rotations
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ret
+
+INV_TXFM_16X16_FN adst, dct, 0, 12
+INV_TXFM_16X16_FN adst, adst, 0, 12
+INV_TXFM_16X16_FN adst, flipadst, 0, 12
+
+cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iadst_16x16_internal_10bpc).pass1
+.pass2:
+ call .pass2_part1
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+ call .pass2_part2
+ call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_part3:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+.end:
+ packssdw m15, m14
+ packssdw m14, m13, m12
+ packssdw m13, m11, m10
+ packssdw m12, m9, m8
+ packssdw m11, m7, m6
+ packssdw m10, m5, m4
+ packssdw m7, m3, m2
+ packssdw m6, m1, m0
+ vpblendd m0, m6, [r5-32*4], 0x33
+ vpblendd m1, m6, [r5-32*4], 0xcc
+ vpblendd m2, m7, [r5-32*3], 0x33
+ vpblendd m3, m7, [r5-32*3], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_12bpc).write_16x4_start
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m10, [r5-32*2], 0x33
+ vpblendd m1, m10, [r5-32*2], 0xcc
+ vpblendd m2, m11, [r5-32*1], 0x33
+ vpblendd m3, m11, [r5-32*1], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m12, [r5+32*0], 0x33
+ vpblendd m1, m12, [r5+32*0], 0xcc
+ vpblendd m2, m13, [r5+32*1], 0x33
+ vpblendd m3, m13, [r5+32*1], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ vpblendd m0, m14, [r5+32*2], 0x33
+ vpblendd m1, m14, [r5+32*2], 0xcc
+ vpblendd m2, m15, [r5+32*3], 0x33
+ vpblendd m3, m15, [r5+32*3], 0xcc
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ call m(idct_16x8_internal_10bpc).write_16x4_zero
+ RET
+ALIGN function_align
+.pass2_part1:
+ mova [cq+32* 8], m8
+ mova [cq+32* 9], m9
+ mova [cq+32*10], m10
+ mova [cq+32*11], m11
+ mova [cq+32*12], m12
+ mova [cq+32*13], m13
+ mova [cq+32*14], m14
+ mova [cq+32*15], m15
+.pass2_main:
+ call m(idct_8x8_internal_12bpc).transpose_8x8
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m3
+ mova [cq+32* 2], m4
+ mova [cq+32* 3], m7
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ pmaxsd m0, m13, m2
+ pmaxsd m2, m13, m6
+ pmaxsd m5, m13, m5
+ pmaxsd m7, m13, m1
+ REPX {pminsd x, m14}, m0, m2, m5, m7
+ test eobd, eobd
+ jge .pass2_slow
+ pxor m1, m1
+ REPX {mova x, m1}, m3, m4, m6
+ jmp .pass2_fast
+.pass2_slow:
+ sub r6, 32*8
+ mova m8, [r6-32*4]
+ mova m3, [r6-32*3]
+ mova m4, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m1, [r6+32*1]
+ mova m6, [r6+32*2]
+ mova m15, [r6+32*3]
+ TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
+ mova [cq+32* 4], m8
+ mova [cq+32* 5], m11
+ mova [cq+32* 6], m12
+ mova [cq+32* 7], m15
+ vpbroadcastd m13, [clip_18b_min]
+ vpbroadcastd m14, [clip_18b_max]
+ REPX {pmaxsd x, m13}, m1, m3, m4, m6
+ REPX {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast:
+ vpbroadcastd m12, [pd_2048]
+ vpbroadcastd m15, [pd_2896]
+ call m(iadst_16x8_internal_10bpc).main_part1
+ pmaxsd m0, m13, [cq+32* 0] ; 0
+ pmaxsd m7, m13, [cq+32* 1] ; 3
+ pmaxsd m2, m13, [cq+32* 2] ; 4
+ pmaxsd m5, m13, [cq+32* 3] ; 7
+ REPX {pminsd x, m14}, m0, m2, m5, m7
+ test eobd, eobd
+ jge .pass2_slow2
+ pxor m1, m1
+ REPX {mova x, m1}, m3, m4, m6
+ jmp .pass2_fast2
+.pass2_slow2:
+ pmaxsd m4, m13, [cq+32* 4] ; 8
+ pmaxsd m3, m13, [cq+32* 5] ; 11
+ pmaxsd m6, m13, [cq+32* 6] ; 12
+ pmaxsd m1, m13, [cq+32* 7] ; 15
+ REPX {pminsd x, m14}, m1, m3, m4, m6
+.pass2_fast2:
+ call m(iadst_16x8_internal_10bpc).main_part2
+ vpbroadcastd m14, [pd_17408]
+ psrld m15, 11 ; pd_1
+ psubd m13, m14, m15 ; pd_17407
+ pslld m15, 3 ; pd_8
+ ret
+ALIGN function_align
+.pass2_part2:
+ REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+ packssdw m0, m1
+ packssdw m1, m2, m3
+ packssdw m2, m4, m5
+ packssdw m3, m6, m7
+ packssdw m4, m8, m9
+ packssdw m5, m10, m11
+ packssdw m6, m12, m13
+ packssdw m7, m14, m15
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+ mov r5, r6
+ add r6, 32*16
+ jmp .pass2_main
+
+INV_TXFM_16X16_FN flipadst, dct, 0, 12
+INV_TXFM_16X16_FN flipadst, adst, 0, 12
+INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
+
+cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m13, [clip_20b_min]
+ vpbroadcastd m14, [clip_20b_max]
+ jmp m(iflipadst_16x16_internal_10bpc).pass1
+.pass2:
+ call m(iadst_16x16_internal_12bpc).pass2_part1
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ call m(iadst_16x16_internal_12bpc).pass2_part2
+ call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+ jmp m(iadst_16x16_internal_12bpc).pass2_part3
+
+INV_TXFM_16X16_FN identity, dct, -92, 12
+INV_TXFM_16X16_FN identity, identity, 0, 12
+
+%macro IDTX16_12BPC 1 ; src
+ pmulld m6, m7, m%1
+ paddd m6, m15
+ psrad m6, 12
+ paddd m6, m%1
+ psrad m%1, m6, 1
+%endmacro
+
+cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [pd_1697]
+ vpbroadcastd m15, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ mova m10, [cq+r3+32*33]
+ mova m11, [cq+r3+32*35]
+ mova m12, [cq+r3+32*37]
+ mova m13, [cq+r3+32*39]
+ add r6, 32*4
+ pmulld m0, m7, m10
+ pmulld m1, m7, m11
+ pmulld m2, m7, m12
+ pmulld m3, m7, m13
+ REPX {paddd x, m15}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ paddd m0, m10
+ paddd m1, m11
+ paddd m2, m12
+ paddd m3, m13
+ REPX {psrad x, 1 }, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 1]
+ mova m2, [cq+64* 2]
+ mova m3, [cq+64* 3]
+ mova m4, [cq+64* 4]
+ mova m5, [cq+64* 5]
+ mova m8, [cq+64* 6]
+ mova m9, [cq+64* 7]
+ REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
+ mova [cq+64*0], m8
+ mova [cq+64*1], m9
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64* 9]
+ mova m10, [cq+64*10]
+ mova m11, [cq+64*11]
+ mova m12, [cq+64*12]
+ mova m13, [cq+64*13]
+ mova m14, [cq+64*14]
+ REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
+ mova m6, [cq+64*15]
+ pmulld m7, m6
+ paddd m7, m15
+ psrad m7, 12
+ paddd m7, m6
+ mova m6, [cq+64*0]
+ psrad m15, m7, 1
+ mova m7, [cq+64*1]
+ jmp tx2q
+.pass2:
+ call m(iidentity_8x16_internal_12bpc).pass2_main
+ call m(idct_16x16_internal_10bpc).transpose_fast
+ test eobd, eobd
+ jl .pass2_fast
+ mova [cq+32* 8], m0
+ mova [cq+32* 9], m1
+ mova [cq+32*10], m2
+ mova [cq+32*11], m3
+ mova [cq+32*12], m4
+ mova [cq+32*13], m5
+ mova [cq+32*14], m6
+ mova [cq+32*15], m7
+ mova m8, [r6-32*4]
+ mova m9, [r6-32*3]
+ mova m10, [r6-32*2]
+ mova m11, [r6-32*1]
+ mova m12, [r6+32*0]
+ mova m13, [r6+32*1]
+ mova m14, [r6+32*2]
+ mova m15, [r6+32*3]
+ sub r6, 32*8
+ mova m0, [r6-32*4]
+ mova m1, [r6-32*3]
+ mova m2, [r6-32*2]
+ mova m3, [r6-32*1]
+ mova m4, [r6+32*0]
+ mova m5, [r6+32*1]
+ mova m6, [r6+32*2]
+ mova m7, [r6+32*3]
+ call m(iidentity_8x16_internal_12bpc).pass2_main
+ call m(idct_16x8_internal_10bpc).transpose2
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ mova m12, m4
+ mova m13, m5
+ mova m14, m6
+ mova m15, m7
+ mova m0, [cq+32* 8]
+ mova m1, [cq+32* 9]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*12]
+ mova m5, [cq+32*13]
+ mova m6, [cq+32*14]
+ mova m7, [cq+32*15]
+.pass2_fast:
+ call m(idct_16x16_internal_12bpc).write_16x16
+ RET
+
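+; IDCT32_END: combines the in-register idct16 value with its counterpart stored at [r6]
+; and the odd-half values at [r5]/[r4] into idct32 outputs n, 15-n, 16+n and 31-n,
+; clamping with m12/m13, rounding with m11, shifting right by %6 and, if %7, packing
+; the dword pairs to words.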
+%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
+ mova m%4, [r6+32*(%1-4)]
+ mova m%2, [r5+32*(3-%1)]
+ mova m%5, [r4+32*(%1-4)]
+ psubd m%3, m%1, m%4 ; idct16 out15 - n
+ paddd m%1, m%4 ; idct16 out0 + n
+ pmaxsd m%1, m12
+ pmaxsd m%3, m12
+ pminsd m%1, m13
+ pminsd m%3, m13
+ paddd m%1, m11
+ paddd m%3, m11
+ psubd m%4, m%1, m%2 ; out31 - n
+ paddd m%1, m%2 ; out0 + n
+ paddd m%2, m%3, m%5 ; out15 - n
+ psubd m%3, m%5 ; out16 + n
+ REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%if %7 & 1
+ packssdw m%1, m%3 ; out0 + n, out16 + n
+ packssdw m%2, m%4 ; out15 - n, out31 - n
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vbroadcasti128 m14, [idct32_shuf]
+ mov r4, cq
+ call .pass1_main
+ mova [rsp+32*0], m2
+ mova [rsp+32*1], m3
+ cmp eobd, 43
+ jge .eob43
+ pxor m4, m4
+ REPX {mova x, m4}, [rsp+32*2], m2, m3, m11
+ jmp .pass1_end_fast
+.eob43:
+ lea r6, [rsp+32*8]
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ call .pass1_main
+ mova [rsp+32*2], m2
+ cmp eobd, 107
+ jge .eob107
+ mova m11, m3
+ mova m2, m0
+ mova m3, m1
+ mova m0, [r6-32*4]
+ mova m1, [r6-32*3]
+ pxor m4, m4
+.pass1_end_fast:
+ vpbroadcastd m10, [pw_2048]
+ lea r6, [deint_shuf+128]
+ REPX {mova x, m4}, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.eob107:
+ mova [rsp+32*3], m3
+ mova [r6-32*2], m0
+ mova [r6-32*1], m1
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ pshufd m12, m2, q1032
+ pshufd m13, m3, q1032
+ mova m4, m0
+ mova m5, m1
+ pxor m6, m6
+ REPX {mova x, m6}, m7, m14, m15
+ jmp .pass1_end
+.eob171:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ call .pass1_main
+ pshufd m12, [r6+32*2], q1032 ; out19 out17
+ pshufd m13, [r6+32*3], q1032 ; out23 out21
+ mova m4, [r6+32*0] ; out16 out18
+ mova m5, [r6+32*1] ; out20 out22
+ pshufd m14, m2, q1032 ; out27 out25
+ pshufd m15, m3, q1032 ; out31 out29
+ mova m6, m0 ; out24 out26
+ mova m7, m1 ; out28 out30
+.pass1_end:
+ mova m0, [r6-32*4] ; out0 out2
+ mova m1, [r6-32*3] ; out4 out6
+ mova m2, [r6-32*2] ; out8 out10
+ mova m3, [r6-32*1] ; out12 out14
+ lea r6, [deint_shuf+128]
+ mova m11, [rsp+32*3] ; out13 out15
+ vpbroadcastd m10, [pw_2048]
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+.end: ; [rsp+0*32] = m12
+ vpbroadcastd m12, [pw_2048]
+ mov cq, r4
+ mova [rsp+32*1], m8
+ mova [rsp+32*2], m9
+ mova [rsp+32*3], m10
+ mova [rsp+32*4], m11
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ vpermq m0, m2, q3120
+ vpermq m1, m3, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m5, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m6, q3120
+ vpermq m1, m7, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*1], q3120
+ vpermq m1, [rsp+32*2], q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*3], q3120
+ vpermq m1, [rsp+32*4], q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [rsp+32*0], q3120
+ vpermq m1, m13, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m14, q3120
+ vpermq m1, m15, q2031
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
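+; DC-only path: scale the lone DC coefficient by 181 (~sqrt(2)*128) with rounding,
+; then reuse the shared 8x8 dconly3 tail to broadcast the result and add it to all
+; 32 rows (row count in r3d).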
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
+.pass1_main_part1:
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ call m(idct_8x8_internal_10bpc).main
+ psrld m1, m11, 10 ; pd_2
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.pass1_main:
+ call .pass1_main_part1
+ add cq, 32
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m4, m14
+ pshufb m6, m14
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ vperm2i128 m1, m0, m2, 0x31 ; 4 6
+ vinserti128 m0, xm2, 1 ; 0 2
+ vinserti128 m2, m3, xm4, 1 ; 1 3
+ vperm2i128 m3, m4, 0x31 ; 5 7
+ ret
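+; idct32 odd half, part 1: processes in1/in7/in9/in15/in17/in23/in25/in31 into the
+; t16a-t19/t28-t31a intermediates and stores them at [r6]; the _fast entries take only
+; the first four inputs (the rest are zero), the _rect2 entries add pd_2048 and shift
+; right by 12 first.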
+.main_oddhalf_part1_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_part1_fast: ; lower half zero
+ vpbroadcastd m7, [pd_4091]
+ vpbroadcastd m8, [pd_201]
+ vpbroadcastd m6, [pd_m1380]
+ vpbroadcastd m9, [pd_3857]
+ vpbroadcastd m5, [pd_3703]
+ vpbroadcastd m10, [pd_1751]
+ vpbroadcastd m4, [pd_m2751]
+ vpbroadcastd m15, [pd_3035]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t17
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m7, m3 ; t30
+ paddd m7, m3 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ vpbroadcastd m15, [pd_4017]
+ vpbroadcastd m10, [pd_799]
+ ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m1 ; t28a
+ paddd m7, m1 ; t31a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m8, m2 ; t29
+ paddd m8, m2 ; t30
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+ mova [r6-32*4], m0
+ mova [r6-32*3], m5
+ mova [r6-32*2], m4
+ mova [r6-32*1], m6
+ mova [r6+32*0], m3
+ mova [r6+32*1], m1
+ mova [r6+32*2], m8
+ mova [r6+32*3], m7
+ ret
+.main_oddhalf_part2_fast_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_part2_fast: ; lower half zero
+ vpbroadcastd m7, [pd_m601]
+ vpbroadcastd m8, [pd_4052]
+ vpbroadcastd m6, [pd_3973]
+ vpbroadcastd m9, [pd_995]
+ vpbroadcastd m5, [pd_m2106]
+ vpbroadcastd m10, [pd_3513]
+ vpbroadcastd m4, [pd_3290]
+ vpbroadcastd m15, [pd_2440]
+ pmulld m7, m0
+ pmulld m0, m8
+ pmulld m6, m1
+ pmulld m1, m9
+ pmulld m5, m2
+ pmulld m2, m10
+ pmulld m4, m3
+ pmulld m3, m15
+ jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+ REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+ psubd m8, m0, m4 ; t25
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m7, m3 ; t22
+ paddd m7, m3 ; t23
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ vpbroadcastd m15, [pd_2276]
+ vpbroadcastd m10, [pd_3406]
+ ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m1 ; t20a
+ paddd m7, m1 ; t23a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m8, m2 ; t26
+ paddd m8, m2 ; t25
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ vpbroadcastd m15, [pd_3784]
+ vpbroadcastd m10, [pd_1567]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
+ mova m9, [r6-32*4] ; t16a
+ mova m10, [r6-32*3] ; t17
+ psubd m2, m9, m7 ; t23
+ paddd m9, m7 ; t16
+ psubd m7, m10, m5 ; t22a
+ paddd m10, m5 ; t17a
+ REPX {pmaxsd x, m12}, m9, m10, m2, m7
+ REPX {pminsd x, m13}, m9, m10, m2, m7
+ mova [r6-32*4], m9
+ mova [r6-32*3], m10
+ mova m9, [r6-32*2] ; t18a
+ mova m10, [r6-32*1] ; t19
+ psubd m5, m9, m1 ; t21
+ paddd m9, m1 ; t18
+ psubd m1, m10, m6 ; t20a
+ paddd m10, m6 ; t19a
+ REPX {pmaxsd x, m12}, m9, m10, m5, m1
+ REPX {pminsd x, m13}, m9, m10, m5, m1
+ mova [r6-32*2], m9
+ mova [r6-32*1], m10
+ mova m9, [r6+32*0] ; t28
+ mova m10, [r6+32*1] ; t29a
+ psubd m6, m9, m3 ; t27a
+ paddd m9, m3 ; t28a
+ psubd m3, m10, m4 ; t26
+ paddd m10, m4 ; t29
+ REPX {pmaxsd x, m12}, m9, m10, m6, m3
+ REPX {pminsd x, m13}, m9, m10, m6, m3
+ REPX {pmulld x, m14}, m6, m3, m1, m5
+ paddd m6, m11
+ paddd m3, m11
+ psubd m4, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m5 ; t21a
+ paddd m3, m5 ; t26a
+ REPX {psrad x, 12 }, m4, m1, m3, m6
+ mova [r6+32*0], m4
+ mova [r6+32*1], m1
+ mova m4, [r6+32*2] ; t30
+ mova m1, [r6+32*3] ; t31a
+ psubd m5, m4, m8 ; t25a
+ paddd m4, m8 ; t30a
+ psubd m8, m1, m0 ; t24
+ paddd m1, m0 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m1
+ REPX {pminsd x, m13}, m8, m5, m4, m1
+ REPX {pmulld x, m14}, m5, m8, m7, m2
+ paddd m5, m11
+ paddd m8, m11
+ psubd m0, m5, m7 ; t22
+ paddd m5, m7 ; t25
+ psubd m7, m8, m2 ; t23a
+ paddd m2, m8 ; t24a
+ REPX {psrad x, 12 }, m0, m7, m2, m5
+ mova [r6+32*2], m0
+ mova [r6+32*3], m7
+ mov r4, r6
+ add r6, 32*8
+ mova [r6-32*4], m2
+ mova [r6-32*3], m5
+ mova [r6-32*2], m3
+ mova [r6-32*1], m6
+ mova [r6+32*0], m9
+ mova [r6+32*1], m10
+ mova [r6+32*2], m4
+ mova [r6+32*3], m1
+ mov r5, r6
+ add r6, 32*8
+ ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2
+ IDCT32_END 1, 14, 8, 9, 10, 2
+ punpckhwd m8, m0, m1 ; 16 17
+ punpcklwd m0, m1 ; 0 1
+ punpcklwd m1, m14, m15 ; 14 15
+ punpckhwd m14, m15 ; 30 31
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 2
+ IDCT32_END 3, 14, 8, 9, 10, 2
+ punpckhwd m8, m2, m3 ; 18 19
+ punpcklwd m2, m3 ; 2 3
+ punpcklwd m3, m14, m15 ; 12 13
+ punpckhwd m14, m15 ; 28 29
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 2
+ IDCT32_END 5, 14, 8, 9, 10, 2
+ punpckhwd m8, m4, m5 ; 20 21
+ punpcklwd m4, m5 ; 4 5
+ punpcklwd m5, m14, m15 ; 10 11
+ punpckhwd m14, m15 ; 26 27
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 2
+ IDCT32_END 7, 14, 8, 9, 10, 2
+ punpckhwd m8, m6, m7 ; 22 23
+ punpcklwd m6, m7 ; 6 7
+ punpcklwd m7, m14, m15 ; 8 9
+ punpckhwd m14, m15 ; 24 25
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+.transpose:
+ punpckhdq m15, m3, m1
+ punpckldq m3, m1
+ punpckhdq m1, m4, m6
+ punpckldq m4, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m7, m5
+ punpckldq m7, m5
+ punpcklqdq m5, m2, m15
+ punpckhqdq m2, m15
+ punpckhqdq m15, m7, m3
+ punpcklqdq m7, m3
+ punpckhqdq m3, m6, m1
+ punpcklqdq m6, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ vperm2i128 m4, m0, m7, 0x31
+ vinserti128 m0, xm7, 1
+ vperm2i128 m7, m3, m2, 0x31
+ vinserti128 m3, xm2, 1
+ vinserti128 m2, m6, xm5, 1
+ vperm2i128 m6, m5, 0x31
+ vperm2i128 m5, m1, m15, 0x31
+ vinserti128 m1, xm15, 1
+ ret
+
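+; identity*identity 8x32: no butterflies are needed; the coefficients are packed to
+; words, rounded (+pw_5, >>3), transposed in .main and added to the destination with
+; clamping, zeroing the consumed part of cq along the way.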
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_5]
+ pxor m6, m6
+ mov r6d, eobd
+ add eobb, 21
+ cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {paddsw x, m5}, m0, m1, m2, m3
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ call .main_zero
+ add cq, 32
+ lea dstq, [dstq+strideq*8]
+ sub eobd, 64
+ jge .loop
+ RET
+ALIGN function_align
+.main_zero:
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m4, m2, m1
+ punpcklwd m2, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ mova xm4, [dstq+strideq*0]
+ vinserti128 m4, [dstq+strideq*4], 1
+ paddw m0, m4
+ mova xm4, [dstq+strideq*1]
+ vinserti128 m4, [dstq+r5 ], 1
+ paddw m1, m4
+ mova xm4, [dstq+strideq*2]
+ vinserti128 m4, [dstq+r6*2 ], 1
+ paddw m2, m4
+ mova xm4, [dstq+r6 ]
+ vinserti128 m4, [dstq+r4 ], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*4], m0, 1
+ mova [dstq+strideq*1], xm1
+ vextracti128 [dstq+r5 ], m1, 1
+ mova [dstq+strideq*2], xm2
+ vextracti128 [dstq+r6*2 ], m2, 1
+ mova [dstq+r6 ], xm3
+ vextracti128 [dstq+r4 ], m3, 1
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ mov r4, cq
+ lea r6, [rsp+32*4]
+ call .pass1_main
+ cmp eobd, 43
+ jge .eob43
+ jmp .pass2_fast
+.eob43:
+ call .pass1_main
+ cmp eobd, 107
+ jge .eob107
+.pass2_fast:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ jmp .pass2_end
+.eob107:
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ jmp .pass2
+.eob171:
+ call .pass1_main
+.pass2:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ pmaxsd m4, m12, [cq+128*1+64]
+ pmaxsd m5, m12, [cq+128*7+64]
+ pmaxsd m6, m12, [cq+128*1+96]
+ pmaxsd m7, m12, [cq+128*7+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ pmaxsd m4, m12, [cq+128*3+64]
+ pmaxsd m5, m12, [cq+128*5+64]
+ pmaxsd m6, m12, [cq+128*3+96]
+ pmaxsd m7, m12, [cq+128*5+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ pmaxsd m4, m12, [cq+128*2+64]
+ pmaxsd m5, m12, [cq+128*6+64]
+ pmaxsd m6, m12, [cq+128*2+96]
+ pmaxsd m7, m12, [cq+128*6+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ pmaxsd m4, m12, [cq+128*0+64]
+ pmaxsd m5, m12, [cq+128*4+64]
+ pmaxsd m6, m12, [cq+128*0+96]
+ pmaxsd m7, m12, [cq+128*4+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+.pass2_end:
+ psrld m11, 8 ; pd_8
+ IDCT32_END 0, 15, 8, 9, 10, 4
+ IDCT32_END 1, 14, 8, 9, 10, 4
+ punpckhqdq m8, m0, m1 ; 16 17 (interleaved)
+ punpcklqdq m0, m1 ; 0 1 (interleaved)
+ punpcklqdq m1, m14, m15 ; 14 15 (interleaved)
+ punpckhqdq m14, m15 ; 30 31 (interleaved)
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 4
+ IDCT32_END 3, 14, 8, 9, 10, 4
+ punpckhqdq m8, m2, m3 ; 18 19 (interleaved)
+ punpcklqdq m2, m3 ; 2 3 (interleaved)
+ punpcklqdq m3, m14, m15 ; 12 13 (interleaved)
+ punpckhqdq m14, m15 ; 28 29 (interleaved)
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 4
+ IDCT32_END 5, 14, 8, 9, 10, 4
+ punpckhqdq m8, m4, m5 ; 20 21 (interleaved)
+ punpcklqdq m4, m5 ; 4 5 (interleaved)
+ punpcklqdq m5, m14, m15 ; 10 11 (interleaved)
+ punpckhqdq m14, m15 ; 26 27 (interleaved)
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 4
+ IDCT32_END 7, 14, 8, 9, 10, 4
+ punpckhqdq m8, m6, m7 ; 22 23 (interleaved)
+ punpcklqdq m6, m7 ; 6 7 (interleaved)
+ punpcklqdq m7, m14, m15 ; 8 9 (interleaved)
+ punpckhqdq m14, m15 ; 24 25 (interleaved)
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ mova m15, m1
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m2, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m7, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m3, q3120
+ vpermq m1, m15, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*3], q3120
+ vpermq m1, [r5+32*1], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*1], q3120
+ vpermq m1, [r5-32*3], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*4], q3120
+ vpermq m1, [r5-32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*0], q3120
+ vpermq m1, [r5+32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
+.pass1_main:
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 32
+ ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2, 0
+ mova [cq+32*16], m8
+ mova [cq+32*31], m9
+ IDCT32_END 1, 14, 8, 9, 10, 2, 0
+ mova [cq+32*17], m8
+ mova [cq+32*30], m9
+ mova [cq+32*14], m14
+ IDCT32_END 2, 14, 8, 9, 10, 2, 0
+ mova [cq+32*18], m8
+ mova [cq+32*29], m9
+ mova [cq+32*13], m14
+ IDCT32_END 3, 14, 8, 9, 10, 2, 0
+ mova [cq+32*19], m8
+ mova [cq+32*28], m9
+ mova [cq+32*12], m14
+ IDCT32_END 4, 14, 8, 9, 10, 2, 0
+ mova [cq+32*20], m8
+ mova [cq+32*27], m9
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ IDCT32_END 5, 10, 0, 1, 2, 2, 0
+ mova [cq+32*21], m0
+ mova [cq+32*26], m1
+ IDCT32_END 6, 9, 0, 1, 2, 2, 0
+ mova [cq+32*22], m0
+ mova [cq+32*25], m1
+ IDCT32_END 7, 8, 0, 1, 2, 2, 0
+ mova [cq+32*23], m0
+ mova [cq+32*24], m1
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 1]
+ mova m2, [cq+32* 2]
+ mova m11, m14
+ mova m12, [cq+32*12]
+ mova m13, [cq+32*13]
+ mova m14, [cq+32*14]
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 640
+ sar r6d, 10
+.dconly2:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm3
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ psubusw m1, m3
+ psubusw m2, m3
+ mova [dstq+32*0], m1
+ mova [dstq+32*1], m2
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ call .pass1
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ lea r6, [deint_shuf+128]
+ vpbroadcastd m11, [pw_2048]
+ mov r4, dstq
+ call .pass2
+ mova m0, [r5+32*3] ; 16 17
+ mova m1, [r5+32*2] ; 30 31
+ mova m2, [r5+32*1] ; 18 19
+ mova m3, [r5+32*0] ; 28 29
+ mova m4, [r5-32*1] ; 20 21
+ mova m5, [r5-32*2] ; 26 27
+ mova m6, [r5-32*3] ; 22 23
+ mova m7, [r5-32*4] ; 24 25
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ lea dstq, [r4+32]
+ call .pass2
+ RET
+ALIGN function_align
+.pass2:
+ call m(idct_16x8_internal_8bpc).main
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.pass1:
+ mova m0, [cq+32* 1]
+ mova m1, [cq+32* 7]
+ mova m2, [cq+32* 9]
+ mova m3, [cq+32*15]
+ mova m4, [cq+32*17]
+ mova m5, [cq+32*23]
+ mova m6, [cq+32*25]
+ mova m7, [cq+32*31]
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ mova m0, [cq+32* 3]
+ mova m1, [cq+32* 5]
+ mova m2, [cq+32*11]
+ mova m3, [cq+32*13]
+ mova m4, [cq+32*19]
+ mova m5, [cq+32*21]
+ mova m6, [cq+32*27]
+ mova m7, [cq+32*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ mova m0, [cq+32* 2]
+ mova m1, [cq+32* 6]
+ mova m2, [cq+32*10]
+ mova m3, [cq+32*14]
+ mova m4, [cq+32*18]
+ mova m5, [cq+32*22]
+ mova m6, [cq+32*26]
+ mova m7, [cq+32*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 4]
+ mova m2, [cq+32* 8]
+ mova m3, [cq+32*12]
+ mova m4, [cq+32*16]
+ mova m5, [cq+32*20]
+ mova m6, [cq+32*24]
+ mova m7, [cq+32*28]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_4096]
+ pxor m6, m6
+ mov r6d, eobd
+ add eobb, 21
+ cmovc eobd, r6d
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+.loop:
+ mova m0, [cq+32*0]
+ packssdw m0, [cq+32*1]
+ mova m1, [cq+32*2]
+ packssdw m1, [cq+32*3]
+ REPX {mova [cq+32*x], m6}, 0, 1, 2, 3
+ add cq, 32*8
+ mova m2, [cq-32*4]
+ packssdw m2, [cq-32*3]
+ mova m3, [cq-32*2]
+ packssdw m3, [cq-32*1]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+32*x], m6}, -4, -3, -2, -1
+ call m(inv_txfm_add_identity_identity_8x32_10bpc).main
+ add dstq, 16
+ sub eobd, 64
+ jge .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
+ call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
+ mov r4, dstq
+ call m(idct_16x8_internal_12bpc).pass2_main
+ mova m0, [cq+32* 0] ; 16
+ mova m1, [cq+32* 1] ; 17
+ mova m2, [cq+32* 2] ; 18
+ mova m3, [cq+32* 3] ; 19
+ mova m4, [cq+32* 4] ; 20
+ mova m5, [cq+32* 5] ; 21
+ mova m6, [cq+32* 6] ; 22
+ mova m7, [cq+32* 7] ; 23
+ mova m8, [cq+32* 8] ; 24
+ mova m9, [cq+32* 9] ; 25
+ mova m10, [cq+32*10] ; 26
+ mova m11, [cq+32*11] ; 27
+ mova m12, [cq+32*12] ; 28
+ mova m13, [cq+32*13] ; 29
+ mova m14, [cq+32*14] ; 30
+ mova m15, [cq+32*15] ; 31
+ lea dstq, [r4+32]
+ call m(idct_16x8_internal_12bpc).pass2_main
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
+
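+; IDCT32_PASS2_END: forms the sum and difference of a register row and a stored row,
+; scales both by m15, adds them to the destination rows at dstq+%5 and r2+%6, clamps
+; the results to [0, m7] (m6 is zero) and stores them back.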
+%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ mova m%4, [%2]
+ paddsw m%3, m%1, m%4
+ psubsw m%1, m%4
+%if %1 == 0
+ pxor m6, m6
+%endif
+ pmulhrsw m%3, m15
+ pmulhrsw m%1, m15
+ paddw m%3, [dstq+%5]
+ paddw m%1, [r2+%6]
+ pmaxsw m%3, m6
+ pmaxsw m%1, m6
+ pminsw m%3, m7
+ pminsw m%1, m7
+ mova [dstq+%5], m%3
+ mova [r2+%6], m%1
+%endmacro
+
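+; 16x32 dct: the first pass runs .main once per populated group of coefficients
+; (eob thresholds 44, 151 and 279), staging the transposed rows at r4/r5/r6 before
+; the 16-wide second pass.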
+cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*16]
+ lea r4, [r6+32*8]
+ lea r5, [r6+32*16]
+ call .main
+ sub eobd, 44
+ jge .eob44
+ vperm2i128 m2, m0, m3, 0x31 ; 5
+ vinserti128 m0, xm3, 1 ; 1
+ vperm2i128 m3, m1, m4, 0x31 ; 7
+ vinserti128 m1, xm4, 1 ; 3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
+ jmp .fast
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+.eob44:
+ mova [r4+16*0], xm0
+ mova [r4+16*1], xm3
+ mova [r4+16*2], xm1
+ mova [r4+16*3], xm4
+ vextracti128 [r4+16*4], m0, 1
+ vextracti128 [r4+16*5], m3, 1
+ vextracti128 [r4+16*6], m1, 1
+ vextracti128 [r4+16*7], m4, 1
+ call .main
+ sub eobd, 107
+ jge .eob151
+ vperm2i128 m7, m1, m4, 0x31 ; 15
+ vinserti128 m5, m1, xm4, 1 ; 11
+ vperm2i128 m6, m0, m3, 0x31 ; 13
+ vinserti128 m4, m0, xm3, 1 ; 9
+ mova m0, [r4+32*0]
+ mova m1, [r4+32*1]
+ mova m2, [r4+32*2]
+ mova m3, [r4+32*3]
+.fast:
+ lea r6, [pw_5+128]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp .idct16
+.eob151:
+ mova [r4-16*8], xm0
+ mova [r4-16*7], xm3
+ mova [r4-16*6], xm1
+ mova [r4-16*5], xm4
+ vextracti128 [r4-16*4], m0, 1
+ vextracti128 [r4-16*3], m3, 1
+ vextracti128 [r4-16*2], m1, 1
+ vextracti128 [r4-16*1], m4, 1
+ call .main
+ sub eobd, 128
+ jge .eob279
+ vperm2i128 m10, m0, m3, 0x31 ; 21
+ vinserti128 m8, m0, xm3, 1 ; 17
+ vperm2i128 m11, m1, m4, 0x31 ; 23
+ vinserti128 m9, m1, xm4, 1 ; 19
+ pxor m12, m12
+ REPX {mova x, m12}, m13, m14, m15
+ REPX {mova [r6+32*x], m12}, 0, 1, 2, 3
+ jmp .full
+.eob279:
+ mova [r5+16*0], xm0
+ mova [r5+16*1], xm3
+ mova [r5+16*2], xm1
+ mova [r5+16*3], xm4
+ vextracti128 [r5+16*4], m0, 1
+ vextracti128 [r5+16*5], m3, 1
+ vextracti128 [r5+16*6], m1, 1
+ vextracti128 [r5+16*7], m4, 1
+ call .main
+ vperm2i128 m14, m0, m3, 0x31 ; 29
+ vinserti128 m12, m0, xm3, 1 ; 25
+ vperm2i128 m15, m1, m4, 0x31 ; 31
+ vinserti128 m13, m1, xm4, 1 ; 27
+ mova m8, [r5+32*0]
+ mova m9, [r5+32*1]
+ mova m10, [r5+32*2]
+ mova m11, [r5+32*3]
+.full:
+ mova m0, [r4+32*0]
+ mova m1, [r4+32*1]
+ mova m2, [r4+32*2]
+ mova m3, [r4+32*3]
+ mova m4, [r4-32*4]
+ mova m5, [r4-32*3]
+ mova m6, [r4-32*2]
+ mova m7, [r4-32*1]
+ lea r6, [pw_5 + 128]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ lea r3, [rsp+32*8]
+ mova m8, [r3+32*0]
+ mova m9, [r3+32*1]
+ mova m10, [r3+32*2]
+ mova m11, [r3+32*3]
+ mova m12, [r3-32*4]
+ mova m13, [r3-32*3]
+ mova m14, [r3-32*2]
+ mova m15, [r3-32*1]
+.idct16:
+ lea r3, [rsp+32*16]
+ mova m0, [r3+32*0]
+ mova m1, [r3+32*1]
+ mova m2, [r3+32*2]
+ mova m3, [r3+32*3]
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*3]
+ mova m6, [r3-32*2]
+ mova m7, [r3-32*1]
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call .pass2_end
+ RET
+ALIGN function_align
+.main:
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128* 3]
+ pmulld m2, m14, [cq+128* 5]
+ pmulld m3, m14, [cq+128* 7]
+ pmulld m4, m14, [cq+128* 9]
+ pmulld m5, m14, [cq+128*11]
+ pmulld m6, m14, [cq+128*13]
+ pmulld m7, m14, [cq+128*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 2]
+ pmulld m2, m14, [cq+128* 4]
+ pmulld m3, m14, [cq+128* 6]
+ pmulld m4, m14, [cq+128* 8]
+ pmulld m5, m14, [cq+128*10]
+ pmulld m6, m14, [cq+128*12]
+ pmulld m7, m14, [cq+128*14]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ psrld m15, m11, 11 ; pd_1
+ mova m8, [r6-32*4]
+ mova m9, [r6-32*3]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m10, m0, m8 ; out15
+ paddd m0, m8 ; out0
+ mova m8, [r6-32*2]
+ paddd m15, m1, m9 ; out1
+ psubd m1, m9 ; out14
+ mova m9, [r6-32*1]
+ REPX {psrad x, 1}, m0, m15, m10, m1
+ packssdw m0, m15
+ packssdw m1, m10
+ psubd m10, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ mova m8, [r6+32*0]
+ paddd m15, m3, m9 ; out3
+ psubd m3, m9 ; out12
+ mova m9, [r6+32*1]
+ REPX {psrad x, 1}, m2, m15, m10, m3
+ packssdw m2, m15
+ packssdw m3, m10
+ psubd m10, m4, m8 ; out11
+ paddd m4, m8 ; out4
+ mova m8, [r6+32*2]
+ paddd m15, m5, m9 ; out5
+ psubd m5, m9 ; out10
+ mova m9, [r6+32*3]
+ REPX {psrad x, 1}, m4, m10, m15, m5
+ packssdw m4, m15
+ packssdw m5, m10
+ psubd m10, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ paddd m15, m7, m9 ; out7
+ psubd m7, m9 ; out8
+ REPX {psrad x, 1}, m6, m10, m15, m7
+ packssdw m6, m15
+ packssdw m7, m10
+ punpckhwd m8, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m3, m1
+ punpcklwd m3, m1
+ punpckhwd m1, m4, m6
+ punpcklwd m4, m6
+ punpcklwd m6, m7, m5
+ punpckhwd m7, m5
+ pxor m5, m5
+ mov r7d, 128*13
+.main_zero_loop:
+ mova [cq+r7-128*1], m5
+ mova [cq+r7+128*0], m5
+ mova [cq+r7+128*1], m5
+ mova [cq+r7+128*2], m5
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ punpcklwd m5, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m1
+ punpckhwd m4, m1
+ punpckhwd m1, m0, m8
+ punpcklwd m0, m8
+ punpckhwd m8, m6, m7
+ punpcklwd m6, m7
+ punpcklqdq m7, m1, m4
+ punpckhqdq m1, m4
+ punpckhqdq m4, m8, m3
+ punpcklqdq m8, m3
+ punpckhqdq m3, m6, m5
+ punpcklqdq m6, m5
+ punpcklqdq m5, m0, m2
+ punpckhqdq m0, m2
+ mova [r6+16*0], xm5
+ mova [r6+16*1], xm6
+ mova [r6+16*2], xm7
+ mova [r6+16*3], xm8
+ vextracti128 [r6+16*4], m5, 1
+ vextracti128 [r6+16*5], m6, 1
+ vextracti128 [r6+16*6], m7, 1
+ vextracti128 [r6+16*7], m8, 1
+ sub r6, 32*4
+ ret
+ALIGN function_align
+.pass2_end:
+ mova [rsp+gprsize+32*0], m6
+ mova [rsp+gprsize+32*2], m7
+ mova [rsp+gprsize+32*3], m15
+ vpbroadcastd m15, [pw_2048]
+ vpbroadcastd m7, [pixel_10bpc_max]
+ IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4
+ IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8
+ IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*1]
+ IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*0]
+ IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*2]
+ mova m2, [rsp+gprsize+32*3]
+ IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4
+ IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8
+ IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4
+ IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0
+ ret
+
+cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m8, [pw_2896x8]
+ vpbroadcastd m9, [pw_1697x16]
+ vpbroadcastd m11, [pw_8192]
+ lea r6, [strideq*5]
+ pxor m6, m6
+ paddw m10, m11, m11 ; pw_16384
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8-32
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 128*8
+ lea dstq, [r5+16]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main2:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpcklwd m4, m2, m1
+ punpckhwd m2, m1
+ punpckhqdq m1, m0, m4
+ punpcklqdq m0, m4
+ call m(iidentity_8x8_internal_10bpc).write_2x8x2
+ punpcklqdq m0, m3, m2
+ punpckhqdq m1, m3, m2
+ jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
+
+cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*4]
+ call .main
+ cmp eobd, 36
+ jge .full
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+ lea r6, [pw_5+128]
+ mov r7, dstq
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+ jmp .end
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.full:
+ add cq, 32
+ mova [r4+32*3], m0
+ mova [r4+32*2], m1
+ mova [r4+32*1], m2
+ mova [r4+32*0], m3
+ mova [r4-32*1], m4
+ mova [r4-32*2], m5
+ mova [r4-32*3], m6
+ mova [r4-32*4], m7
+ call .main
+ sub r4, 32*16 ; topleft 16x8
+ call .transpose_16x16
+ lea r6, [pw_5+128]
+ mov r7, dstq
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ add r4, 32*8 ; bottomleft 16x8
+ call .transpose_16x16
+.end:
+ lea dstq, [r7+32]
+ call m(idct_16x16_internal_8bpc).main
+ call .write_16x16
+ RET
+ALIGN function_align
+.transpose_16x16:
+ punpckhdq m8, m3, m1
+ punpckldq m3, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m7, m5
+ punpckldq m7, m5
+ punpckhdq m5, m4, m6
+ punpckldq m4, m6
+ punpckhqdq m6, m0, m4
+ punpcklqdq m0, m4
+ punpckhqdq m4, m1, m5
+ punpcklqdq m1, m5
+ punpckhqdq m5, m7, m3
+ punpcklqdq m7, m3
+ punpckhqdq m3, m2, m8
+ punpcklqdq m2, m8
+ vinserti128 m8, m0, xm7, 1
+ vperm2i128 m12, m0, m7, 0x31
+ vinserti128 m9, m6, xm5, 1
+ vperm2i128 m13, m6, m5, 0x31
+ vinserti128 m10, m1, xm2, 1
+ vperm2i128 m14, m1, m2, 0x31
+ vinserti128 m11, m4, xm3, 1
+ vperm2i128 m15, m4, m3, 0x31
+ mova m0, [r4+32*3]
+ mova m1, [r4+32*2]
+ mova m2, [r4+32*1]
+ mova m3, [r4+32*0]
+ mova m4, [r4-32*1]
+ mova m5, [r4-32*2]
+ mova m6, [r4-32*3]
+ mova m7, [r4-32*4]
+ mova [rsp+gprsize], m15
+ jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ALIGN function_align
+.main:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ pmulld m0, m14, [cq+64* 1]
+ pmulld m1, m14, [cq+64* 7]
+ pmulld m2, m14, [cq+64* 9]
+ pmulld m3, m14, [cq+64*15]
+ pmulld m4, m14, [cq+64*17]
+ pmulld m5, m14, [cq+64*23]
+ pmulld m6, m14, [cq+64*25]
+ pmulld m7, m14, [cq+64*31]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+ pmulld m0, m14, [cq+64* 3]
+ pmulld m1, m14, [cq+64* 5]
+ pmulld m2, m14, [cq+64*11]
+ pmulld m3, m14, [cq+64*13]
+ pmulld m4, m14, [cq+64*19]
+ pmulld m5, m14, [cq+64*21]
+ pmulld m6, m14, [cq+64*27]
+ pmulld m7, m14, [cq+64*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+ pmulld m0, m14, [cq+64* 2]
+ pmulld m1, m14, [cq+64* 6]
+ pmulld m2, m14, [cq+64*10]
+ pmulld m3, m14, [cq+64*14]
+ pmulld m4, m14, [cq+64*18]
+ pmulld m5, m14, [cq+64*22]
+ pmulld m6, m14, [cq+64*26]
+ pmulld m7, m14, [cq+64*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+64* 0]
+ pmulld m1, m14, [cq+64* 4]
+ pmulld m2, m14, [cq+64* 8]
+ pmulld m3, m14, [cq+64*12]
+ pmulld m4, m14, [cq+64*16]
+ pmulld m5, m14, [cq+64*20]
+ pmulld m6, m14, [cq+64*24]
+ pmulld m7, m14, [cq+64*28]
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ pxor m8, m8
+ mov r7d, 64*30
+.main_zero_loop:
+ mova [cq+r7-64*2], m8
+ mova [cq+r7-64*1], m8
+ mova [cq+r7+64*0], m8
+ mova [cq+r7+64*1], m8
+ sub r7d, 64*4
+ jg .main_zero_loop
+.main_end:
+ psrld m11, 11 ; pd_1
+ IDCT32_END 0, 15, 8, 9, 10, 1
+ IDCT32_END 1, 14, 8, 9, 10, 1
+ punpckhwd m8, m0, m1 ; 16 17
+ punpcklwd m0, m1 ; 0 1
+ punpcklwd m1, m14, m15 ; 14 15
+ punpckhwd m14, m15 ; 30 31
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 1
+ IDCT32_END 3, 14, 8, 9, 10, 1
+ punpckhwd m8, m2, m3 ; 18 19
+ punpcklwd m2, m3 ; 2 3
+ punpcklwd m3, m14, m15 ; 12 13
+ punpckhwd m14, m15 ; 28 29
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 1
+ IDCT32_END 5, 14, 8, 9, 10, 1
+ punpckhwd m8, m4, m5 ; 20 21
+ punpcklwd m4, m5 ; 4 5
+ punpcklwd m5, m14, m15 ; 10 11
+ punpckhwd m14, m15 ; 26 27
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 1
+ IDCT32_END 7, 14, 8, 9, 10, 1
+ punpckhwd m8, m6, m7 ; 22 23
+ punpcklwd m6, m7 ; 6 7
+ punpcklwd m7, m14, m15 ; 8 9
+ punpckhwd m14, m15 ; 24 25
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ ret
+ALIGN function_align
+.write_16x16:
+ mova m1, [rsp+gprsize+32*1]
+ mova [rsp+gprsize+32*0], m8
+ mova [rsp+gprsize+32*1], m9
+ mova [rsp+gprsize+32*2], m12
+ vpbroadcastd m12, [pw_2048]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+ pxor m8, m8
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ pmulhrsw m2, m12
+ pmulhrsw m3, m12
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, m4
+ pmulhrsw m1, m12, m5
+ pmulhrsw m2, m12, m6
+ pmulhrsw m3, m12, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, [rsp+gprsize+32*0]
+ pmulhrsw m1, m12, [rsp+gprsize+32*1]
+ pmulhrsw m2, m12, m10
+ pmulhrsw m3, m12, m11
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m0, m12, [rsp+gprsize+32*2]
+ pmulhrsw m1, m12, m13
+ pmulhrsw m2, m12, m14
+ pmulhrsw m3, m12, m15
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+
+cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m8, [pw_2896x8]
+ vpbroadcastd m9, [pw_1697x16]
+ vpbroadcastd m10, [pw_4096]
+ lea r6, [strideq*5]
+ pxor m6, m6
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*1]
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*2]
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+ add cq, 64*8-32
+ lea dstq, [r5+16*3]
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 32
+ lea dstq, [dstq+strideq*4]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1]
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3]
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5]
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7]
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ REPX {paddsw x, x }, m0, m1, m2, m3
+ REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+
+cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
+
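+; 32x32 dct: the shared .main is called once per populated coefficient group
+; (eob thresholds 36, 136 and 300); the untouched part of the stack buffer is
+; zero-filled in .fast_loop, then the second pass is run twice, once per 16-column half.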
+cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.fast:
+ lea r4, [rsp+32*71]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r3, [rsp+32*3]
+ mov r4, r6
+ lea r5, [r6+32*8]
+ lea r6, [pw_5+128]
+ call .pass2_oddhalf
+ call .pass2_evenhalf
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ sub dstq, r3
+ lea r2, [r2+r3+32]
+ add dstq, 32
+ lea r3, [rsp+32*11]
+ call .pass2_oddhalf
+ call .pass2_evenhalf
+ lea r3, [strideq*3]
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 7]
+ mova m2, [cq+128* 9]
+ mova m3, [cq+128*15]
+ mova m4, [cq+128*17]
+ mova m5, [cq+128*23]
+ mova m6, [cq+128*25]
+ mova m7, [cq+128*31]
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128* 5]
+ mova m2, [cq+128*11]
+ mova m3, [cq+128*13]
+ mova m4, [cq+128*19]
+ mova m5, [cq+128*21]
+ mova m6, [cq+128*27]
+ mova m7, [cq+128*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128* 6]
+ mova m2, [cq+128*10]
+ mova m3, [cq+128*14]
+ mova m4, [cq+128*18]
+ mova m5, [cq+128*22]
+ mova m6, [cq+128*26]
+ mova m7, [cq+128*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mova m4, [cq+128*16]
+ mova m5, [cq+128*20]
+ mova m6, [cq+128*24]
+ mova m7, [cq+128*28]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ ret
+ALIGN function_align
+.pass2_oddhalf:
+ mova m0, [r3+32* 1] ; 1
+ mova m1, [r3+32* 3] ; 3
+ mova m2, [r3+32* 5] ; 5
+ mova m3, [r3+32* 7] ; 7
+ mova m4, [r3+32*17] ; 9
+ mova m5, [r3+32*19] ; 11
+ mova m6, [r3+32*21] ; 13
+ mova m7, [r3+32*23] ; 15
+ mova m8, [r3+32*33] ; 17
+ mova m9, [r3+32*35] ; 19
+ mova m10, [r3+32*37] ; 21
+ mova m11, [r3+32*39] ; 23
+ mova m12, [r3+32*49] ; 25
+ mova m13, [r3+32*51] ; 27
+ mova m14, [r3+32*53] ; 29
+ mova m15, [r3+32*55] ; 31
+ jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ALIGN function_align
+.pass2_evenhalf:
+ mova m0, [r3+32* 0] ; 0
+ mova m1, [r3+32* 2] ; 2
+ mova m2, [r3+32* 4] ; 4
+ mova m3, [r3+32* 6] ; 6
+ mova m4, [r3+32*16] ; 8
+ mova m5, [r3+32*18] ; 10
+ mova m6, [r3+32*20] ; 12
+ mova m7, [r3+32*22] ; 14
+ mova m8, [r3+32*32] ; 16
+ mova m9, [r3+32*34] ; 18
+ mova m10, [r3+32*36] ; 20
+ mova m11, [r3+32*38] ; 22
+ mova m12, [r3+32*48] ; 24
+ mova m13, [r3+32*50] ; 26
+ mova m14, [r3+32*52] ; 28
+ mova m15, [r3+32*54] ; 30
+ mova [rsp+gprsize], m15
+ jmp m(idct_16x16_internal_8bpc).main
+
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_8192]
+ pxor m6, m6
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+ call .main ; 0
+ cmp eobd, 36
+ jl .ret
+ add cq, 128*8 ; 0 1
+ mov r7, dstq ; 1
+ add dstq, 16
+ call .main
+ call .main2
+ cmp eobd, 136
+ jl .ret
+ add cq, 128*16-32 ; 0 1 2
+ lea dstq, [r7+16*2] ; 1 2
+ call .main ; 2
+ call .main2
+ call .main2
+ cmp eobd, 300
+ jl .ret
+ add cq, 128*24-64 ; 0 1 2 3
+ add r7, 16*3 ; 1 2 3
+ mov dstq, r7 ; 2 3
+ call .main ; 3
+ call .main2
+ call .main2
+ call .main2
+ cmp eobd, 535
+ jl .ret
+ add cq, 128*24-64 ; 0 1 2 3
+ lea dstq, [r7+strideq*8] ; 1 2 3 4
+ mov r7, dstq ; 2 3 4
+ call .main ; 3 4
+ call .main2
+ call .main2
+ cmp eobd, 755
+ jl .ret
+ add cq, 128*16-32 ; 0 1 2 3
+ lea dstq, [r7+strideq*8] ; 1 2 3 4
+ call .main ; 2 3 4 5
+ call .main2 ; 3 4 5
+ cmp eobd, 911
+ jl .ret
+ add cq, 128*8 ; 0 1 2 3
+ add dstq, 16 ; 1 2 3 4
+ call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+ RET
+ALIGN function_align
+.main2:
+ sub cq, 128*8-32
+ lea dstq, [dstq+strideq*8-16]
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
+
+cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
+
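+; IDCT64_PART2_END: reloads an idct16 output and an idct32 out31-n value from the stack
+; buffers at r4/r5, reconstructs the idct32 pair (n, 31-n), combines it with the idct64
+; odd-half pair in m%2/m%3 into outputs n, 31-n, 32+n and 63-n, scales by m14, adds the
+; rows to dstq/r2 at the given offsets and clamps to [0, pixel_10bpc_max].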
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+ mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
+ mova m%4, [r4-32*(14+%1)] ; idct32 out31-n
+%else
+ mova m%5, [r4-32*(45-%1)]
+ mova m%4, [r5-32*(20+%1)]
+%endif
+ paddsw m%6, m%5, m%4 ; idct32 out 0+n
+ psubsw m%5, m%4 ; idct32 out31-n
+ paddsw m%4, m%5, m%3 ; out31-n
+ psubsw m%5, m%3 ; out32+n
+ paddsw m%3, m%6, m%2 ; out 0+n
+ psubsw m%6, m%2 ; out63-n
+ REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
+%if %1 & 1
+ %define %%d0 r2
+ %define %%d1 dstq
+%else
+ %define %%d0 dstq
+ %define %%d1 r2
+%endif
+ paddw m%3, [%%d0+%7 ]
+ paddw m%4, [%%d1+%8 ]
+ paddw m%5, [%%d0+%9 ]
+ paddw m%6, [%%d1+%10]
+ pxor m%2, m%2
+ REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
+ vpbroadcastd m%2, [pixel_10bpc_max]
+ REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6
+ mova [%%d0+%7 ], m%3
+ mova [%%d1+%8 ], m%4
+ mova [%%d0+%9 ], m%5
+ mova [%%d1+%10], m%6
+%endmacro
+
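+; 16x64 dct: pass 1 runs .main once per populated coefficient group (eob thresholds
+; 44, 151 and 279) and keeps the rows on the stack; pass 2 assembles the 64-point
+; columns from an idct16 of the in0/4/... rows, an idct32 odd half of the in2/6/...
+; rows and the two idct64 main_part1 batches, finished by .main_part2_pass2.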
+cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*6]
+ call .main
+ sub eobd, 44
+ jl .fast
+ call .main
+ sub eobd, 107
+ jl .fast
+ call .main
+ sub eobd, 128
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
+.fast:
+ lea r4, [rsp+32*38]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r6, [pw_5+128]
+ mova m0, [rsp+32* 2] ; in0
+ mova m1, [rsp+32* 6] ; in4
+ mova m2, [rsp+32*10] ; in8
+ mova m3, [rsp+32*14] ; in12
+ mova m4, [rsp+32*18] ; in16
+ mova m5, [rsp+32*22] ; in20
+ mova m6, [rsp+32*26] ; in24
+ mova m7, [rsp+32*30] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ lea r4, [rsp+32*38]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [rsp+32* 4] ; in2
+ mova m1, [rsp+32* 8] ; in6
+ mova m2, [rsp+32*12] ; in10
+ mova m3, [rsp+32*16] ; in14
+ mova m4, [rsp+32*20] ; in18
+ mova m5, [rsp+32*24] ; in22
+ mova m6, [rsp+32*28] ; in26
+ mova m7, [rsp+32*32] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [rsp+32* 3] ; in1
+ mova m1, [rsp+32*33] ; in31
+ mova m2, [rsp+32*19] ; in17
+ mova m3, [rsp+32*17] ; in15
+ mova m4, [rsp+32*11] ; in9
+ mova m5, [rsp+32*25] ; in23
+ mova m6, [rsp+32*27] ; in25
+ mova m7, [rsp+32* 9] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [rsp+32* 7] ; in5
+ mova m1, [rsp+32*29] ; in27
+ mova m2, [rsp+32*23] ; in21
+ mova m3, [rsp+32*13] ; in11
+ mova m4, [rsp+32*15] ; in13
+ mova m5, [rsp+32*21] ; in19
+ mova m6, [rsp+32*31] ; in29
+ mova m7, [rsp+32* 5] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+ call .main_part2_pass2
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ pxor m15, m15
+ mov r7d, 128*13
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ psrld m15, m11, 10 ; pd_2
+ mova m8, [r6-32*4]
+ mova m9, [r6+32*3]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m10, m0, m8 ; out15
+ paddd m0, m8 ; out0
+ mova m8, [r6-32*3]
+ psubd m15, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ mova m9, [r6+32*2]
+ REPX {psrad x, 2}, m0, m15, m10, m7
+ packssdw m0, m15
+ packssdw m7, m10
+ psubd m10, m1, m8 ; out14
+ paddd m1, m8 ; out1
+ mova m8, [r6-32*2]
+ psubd m15, m6, m9 ; out9
+ paddd m6, m9 ; out6
+ mova m9, [r6+32*1]
+ REPX {psrad x, 2}, m1, m15, m10, m6
+ packssdw m1, m15
+ packssdw m6, m10
+ psubd m10, m2, m8 ; out13
+ paddd m2, m8 ; out2
+ mova m8, [r6-32*1]
+ psubd m15, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ mova m9, [r6+32*0]
+ REPX {psrad x, 2}, m2, m15, m10, m5
+ packssdw m2, m15
+ packssdw m5, m10
+ psubd m10, m3, m8 ; out12
+ paddd m3, m8 ; out3
+ psubd m15, m4, m9 ; out11
+ paddd m4, m9 ; out4
+ REPX {psrad x, 2}, m3, m15, m10, m4
+ packssdw m3, m15
+ packssdw m4, m10
+ call m(idct_16x8_internal_10bpc).transpose3
+ mova [r6-32*4], m0
+ mova [r6-32*3], m1
+ mova [r6-32*2], m2
+ mova [r6-32*1], m3
+ mova [r6+32*0], m4
+ mova [r6+32*1], m5
+ mova [r6+32*2], m6
+ mova [r6+32*3], m7
+ add r6, 32*8
+ ret
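+; idct64 pass-2 loop: finishes each stacked group with the 8bpc main_part2_internal
+; helper and accumulates the reconstructed rows into the destination via
+; IDCT64_PART2_END, walking dstq forward and r2 backward.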
+.main_part2_pass2:
+ vpbroadcastd m11, [pw_1567_3784]
+ vpbroadcastd m12, [pw_m3784_1567]
+ vpbroadcastd m13, [pw_2896_2896]
+ lea r6, [pw_5+128]
+ lea r2, [dstq+r7]
+.main_part2_pass2_loop:
+ vpbroadcastd m14, [pw_m2896_2896]
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
+ vpbroadcastd m14, [pw_2048]
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+ add dstq, strideq
+ sub r2, strideq
+ cmp r4, r5
+ jne .main_part2_pass2_loop
+ ret
+ALIGN function_align
+.main_part1_rect2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ vpbroadcastd m7, [r5+4*0]
+ vpbroadcastd m8, [r5+4*1]
+ vpbroadcastd m6, [r5+4*2]
+ vpbroadcastd m9, [r5+4*3]
+ vpbroadcastd m5, [r5+4*4]
+ vpbroadcastd m10, [r5+4*5]
+ vpbroadcastd m4, [r5+4*6]
+ vpbroadcastd m15, [r5+4*7]
+ pmulld m7, m0 ; t63a
+ pmulld m0, m8 ; t32a
+ pmulld m6, m1 ; t62a
+ pmulld m1, m9 ; t33a
+ pmulld m5, m2 ; t61a
+ pmulld m2, m10 ; t34a
+ pmulld m4, m3 ; t60a
+ pmulld m3, m15 ; t35a
+ vpbroadcastd m10, [r5+4*8]
+ vpbroadcastd m15, [r5+4*9]
+ REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+ REPX {pmaxsd x, m12}, m8, m1, m6, m2
+ REPX {pminsd x, m13}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
+ REPX {pmaxsd x, m12}, m0, m3, m7, m4
+ REPX {pminsd x, m13}, m0, m3, m7, m4
+ vpbroadcastd m10, [r5+4*10]
+ vpbroadcastd m15, [r5+4*11]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m12}, m5, m3, m4, m6
+ REPX {pminsd x, m13}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+ REPX {pmaxsd x, m12}, m0, m7, m1, m8
+ REPX {pminsd x, m13}, m0, m7, m1, m8
+ add r5, 4*12
+ mova [r6-32*4], m0
+ mova [r6+32*3], m7
+ mova [r6-32*3], m1
+ mova [r6+32*2], m8
+ mova [r6-32*2], m6
+ mova [r6+32*1], m4
+ mova [r6-32*1], m3
+ mova [r6+32*0], m5
+ add r6, 32*8
+ ret
+.main_part2: ; idct64 steps 6-9
+ lea r5, [r6+32*3]
+ sub r6, 32*4
+ vpbroadcastd m10, [pd_1567]
+ vpbroadcastd m15, [pd_3784]
+.main_part2_loop:
+ mova m0, [r6-32*32] ; t32a
+ mova m1, [r5-32*24] ; t39a
+ mova m2, [r5-32*32] ; t63a
+ mova m3, [r6-32*24] ; t56a
+ mova m4, [r6-32*16] ; t40a
+ mova m5, [r5-32* 8] ; t47a
+ mova m6, [r5-32*16] ; t55a
+ mova m7, [r6-32* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m12}, m8, m1, m3, m4
+ REPX {pminsd x, m13}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
+ REPX {pmaxsd x, m12}, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m12}, m6, m7, m5, m4
+ REPX {pminsd x, m13}, m6, m7, m5, m4
+ REPX {pmulld x, m14}, m6, m7, m5, m4
+ REPX {pmaxsd x, m12}, m2, m0, m8, m1
+ REPX {pminsd x, m13}, m2, m0, m8, m1
+ paddd m6, m11
+ paddd m5, m11
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r5-32* 8], m2
+ mova [r6-32*32], m0
+ mova [r6-32* 8], m8
+ mova [r5-32*32], m1
+ mova [r5-32*24], m3
+ mova [r6-32*16], m6
+ mova [r6-32*24], m7
+ mova [r5-32*16], m5
+ add r6, 32
+ sub r5, 32
+ cmp r6, r5
+ jl .main_part2_loop
+ ret
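+; A note on the loop above: each iteration combines one symmetric group of the
+; buffered idct64 intermediates. The outer terms only need add/sub butterflies,
+; the t39/t56 and t40/t55 pairs go through the (pd_1567, pd_3784) rotation held
+; in m10/m15, and the middle t40/t47/t48/t55 terms are merged with a pd_2896
+; scale and pd_2048 rounding (m14/m11), clamping to the 18-bit range (m12/m13)
+; between stages. r6 advances and r5 retreats until the two pointers meet.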
+
+cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ lea r6, [rsp+32*6]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
+ mov [cq], eobd ; 0
+ or r3d, 64
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.fast:
+ lea r4, [rsp+32*70]
+ pxor m0, m0
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r6, [pw_5 + 128]
+ mov r10, rsp
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+ mova m0, [r10+32* 2] ; in0
+ mova m1, [r10+32* 6] ; in4
+ mova m2, [r10+32*18] ; in8
+ mova m3, [r10+32*22] ; in12
+ mova m4, [r10+32*34] ; in16
+ mova m5, [r10+32*38] ; in20
+ mova m6, [r10+32*50] ; in24
+ mova m7, [r10+32*54] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ lea r4, [rsp+32*70]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [r10+32* 4] ; in2
+ mova m1, [r10+32* 8] ; in6
+ mova m2, [r10+32*20] ; in10
+ mova m3, [r10+32*24] ; in14
+ mova m4, [r10+32*36] ; in18
+ mova m5, [r10+32*40] ; in22
+ mova m6, [r10+32*52] ; in26
+ mova m7, [r10+32*56] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [r10+32* 3] ; in1
+ mova m1, [r10+32*57] ; in31
+ mova m2, [r10+32*35] ; in17
+ mova m3, [r10+32*25] ; in15
+ mova m4, [r10+32*19] ; in9
+ mova m5, [r10+32*41] ; in23
+ mova m6, [r10+32*51] ; in25
+ mova m7, [r10+32* 9] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [r10+32* 7] ; in5
+ mova m1, [r10+32*53] ; in27
+ mova m2, [r10+32*39] ; in21
+ mova m3, [r10+32*21] ; in11
+ mova m4, [r10+32*23] ; in13
+ mova m5, [r10+32*37] ; in19
+ mova m6, [r10+32*55] ; in29
+ mova m7, [r10+32* 5] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+ add r10, 32*8
+ sub r4, 32*98 ; rsp+32*16
+ sub dstq, r8
+ add dstq, 32
+ cmp r10, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m14, [pd_2896]
+ vpbroadcastd m11, [pd_2048]
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128* 7]
+ pmulld m2, m14, [cq+128* 9]
+ pmulld m3, m14, [cq+128*15]
+ pmulld m4, m14, [cq+128*17]
+ pmulld m5, m14, [cq+128*23]
+ pmulld m6, m14, [cq+128*25]
+ pmulld m7, m14, [cq+128*31]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+ pmulld m0, m14, [cq+128* 3]
+ pmulld m1, m14, [cq+128* 5]
+ pmulld m2, m14, [cq+128*11]
+ pmulld m3, m14, [cq+128*13]
+ pmulld m4, m14, [cq+128*19]
+ pmulld m5, m14, [cq+128*21]
+ pmulld m6, m14, [cq+128*27]
+ pmulld m7, m14, [cq+128*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+ pmulld m0, m14, [cq+128* 2]
+ pmulld m1, m14, [cq+128* 6]
+ pmulld m2, m14, [cq+128*10]
+ pmulld m3, m14, [cq+128*14]
+ pmulld m4, m14, [cq+128*18]
+ pmulld m5, m14, [cq+128*22]
+ pmulld m6, m14, [cq+128*26]
+ pmulld m7, m14, [cq+128*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 4]
+ pmulld m2, m14, [cq+128* 8]
+ pmulld m3, m14, [cq+128*12]
+ pmulld m4, m14, [cq+128*16]
+ pmulld m5, m14, [cq+128*20]
+ pmulld m6, m14, [cq+128*24]
+ pmulld m7, m14, [cq+128*28]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ call m(idct_8x8_internal_10bpc).main_rect2
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .normal
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+.dconly:
+ add r6d, 640
+ sar r6d, 10
+.dconly2:
+ vpbroadcastd m5, [dconly_10bpc]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ movd xm0, r6d
+ paddsw xm0, xm5
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ paddsw m3, m0, [dstq+32*2]
+ paddsw m4, m0, [dstq+32*3]
+ REPX {psubusw x, m5}, m1, m2, m3, m4
+ mova [dstq+32*0], m1
+ mova [dstq+32*1], m2
+ mova [dstq+32*2], m3
+ mova [dstq+32*3], m4
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
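+; The .dconly path above covers eob == 0, where only the DC coefficient is
+; nonzero and the transform output is a constant: the DC value is scaled
+; (181 = 2896 >> 4, i.e. the 2896/4096 ~ 1/sqrt(2) factor at reduced precision,
+; folded into the rounding shifts) and added to every pixel. Biasing by
+; dconly_10bpc (0x7c00 = 0x8000 - 0x400) before the saturating paddsw and
+; removing the bias with psubusw clamps each sample to [0, 0x3ff] without
+; separate min/max instructions.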
+.normal:
+ PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*4]
+ call .main
+ call .shift_transpose
+ cmp eobd, 36
+ jl .fast
+ call .main
+ call .shift_transpose
+ jmp .pass2
+.fast:
+ pxor m0, m0
+ mov r3d, 4
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ dec r3d
+ jg .fast_loop
+.pass2:
+ lea r7, [r6-32*64]
+ lea r4, [r6-32*32]
+ lea r6, [pw_5+128]
+ mov r5, dstq
+.pass2_loop:
+ mova m0, [r7-32*4]
+ mova m1, [r7-32*3]
+ mova m2, [r7-32*2]
+ mova m3, [r7-32*1]
+ mova m4, [r7+32*0]
+ mova m5, [r7+32*1]
+ mova m6, [r7+32*2]
+ mova m7, [r7+32*3]
+ add r7, 32*32
+ mova m8, [r7-32*4]
+ mova m9, [r7-32*3]
+ mova m10, [r7-32*2]
+ mova m11, [r7-32*1]
+ mova m12, [r7+32*0]
+ mova m13, [r7+32*1]
+ mova m14, [r7+32*2]
+ mova m15, [r7+32*3]
+ sub r7, 32*24
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16
+ add r5, 32
+ mov dstq, r5
+ cmp r7, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ mova m0, [cq+64* 2]
+ mova m1, [cq+64*14]
+ mova m2, [cq+64*18]
+ mova m3, [cq+64*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ mova m0, [cq+64* 6]
+ mova m1, [cq+64*10]
+ mova m2, [cq+64*22]
+ mova m3, [cq+64*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ mova m0, [cq+64* 4]
+ mova m1, [cq+64*12]
+ mova m2, [cq+64*20]
+ mova m3, [cq+64*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ pxor m15, m15
+ mov r7d, 64*30
+.main_zero_loop:
+ mova [cq+r7-64*2], m15
+ mova [cq+r7-64*1], m15
+ mova [cq+r7+64*0], m15
+ mova [cq+r7+64*1], m15
+ sub r7d, 64*4
+ jg .main_zero_loop
+.main_end:
+ psrld m15, m11, 10 ; pd_2
+.main_end2:
+ add cq, 32
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ add r6, 32*8
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ mova [r6+32*2], m1
+ mova [r6+32*1], m2
+ mova [r6+32*0], m3
+ mova [r6-32*1], m4
+ mova [r6-32*2], m5
+ mova [r6-32*3], m6
+ mova [r6-32*4], m7
+ jmp .main_end_loop_start
+.main_end_loop:
+ mova m0, [r6+32* 3] ; idct8 0 + n
+.main_end_loop_start:
+ mova m1, [r5+32* 4] ; idct16 15 - n
+ mova m2, [r5-32*12] ; idct32 16 + n
+ mova m3, [r6-32*13] ; idct32 31 - n
+ mova m4, [r6-32*29] ; idct64 63 - n
+ mova m5, [r5-32*28] ; idct64 48 + n
+ mova m6, [r6-32*45] ; idct64 47 - n
+ mova m7, [r5-32*44] ; idct64 32 + n
+ paddd m8, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m12}, m8, m0
+ REPX {pminsd x, m13}, m8, m0
+ paddd m1, m8, m3 ; idct32 out0 + n
+ psubd m8, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m12}, m1, m8, m3, m0
+ REPX {pminsd x, m13}, m1, m3, m8, m0
+ REPX {paddd x, m15}, m1, m3, m0, m8
+ paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+ psubd m1, m4 ; idct64 out63 - n (unshifted)
+ paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+ psubd m3, m5 ; idct64 out48 + n (unshifted)
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+ psubd m8, m7 ; idct64 out32 + n (unshifted)
+ mova [r5-32*44], m2
+ mova [r6+32* 3], m1
+ mova [r6-32*45], m4
+ mova [r5+32* 4], m3
+ mova [r5-32*28], m5
+ mova [r6-32*13], m0
+ mova [r6-32*29], m6
+ mova [r5-32*12], m8
+ add r5, 32
+ sub r6, 32
+ cmp r5, r6
+ jl .main_end_loop
+ ret
+.shift_transpose:
+%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
+ sub r6, 32*48
+ mov r5, r6
+%%loop:
+ mova m0, [r6-32* 4]
+ mova m4, [r6+32* 4]
+ mova m1, [r6-32* 3]
+ mova m5, [r6+32* 5]
+ mova m2, [r6-32* 2]
+ mova m6, [r6+32* 6]
+ mova m3, [r6-32* 1]
+ mova m7, [r6+32* 7]
+ REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m4, [r6+32* 0]
+ mova m6, [r6+32* 8]
+ mova m5, [r6+32* 1]
+ mova m7, [r6+32* 9]
+ REPX {psrad x, %1}, m4, m6, m5, m7
+ packssdw m4, m6
+ packssdw m5, m7
+ mova m6, [r6+32* 2]
+ mova m8, [r6+32*10]
+ mova m7, [r6+32* 3]
+ mova m9, [r6+32*11]
+ REPX {psrad x, %1}, m6, m8, m7, m9
+ packssdw m6, m8
+ packssdw m7, m9
+ call m(idct_16x8_internal_10bpc).transpose3
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ add r6, 32*16
+ add r5, 32*8
+ cmp r5, r4
+ jl %%loop
+ mov r6, r4
+%endmacro
+ IDCT64_SHIFT_TRANSPOSE 2
+ ret
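+; The shift is a macro parameter because this transpose is shared: 64x16 uses
+; a shift of 2 here, while the 64x32 .main below invokes IDCT64_SHIFT_TRANSPOSE
+; with a shift of 1.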
+
+cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
+.fast:
+ pxor m0, m0
+ lea r4, [rsp+32*135]
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r7, [r6-32*32]
+ lea r5, [r6+32*8]
+ lea r6, [pw_5+128]
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+.pass2_loop:
+ mova m0, [r7-32*99]
+ mova m1, [r7-32*97]
+ mova m2, [r7-32*95]
+ mova m3, [r7-32*93]
+ mova m4, [r7-32*67]
+ mova m5, [r7-32*65]
+ mova m6, [r7-32*63]
+ mova m7, [r7-32*61]
+ mova m8, [r7-32*35]
+ mova m9, [r7-32*33]
+ mova m10, [r7-32*31]
+ mova m11, [r7-32*29]
+ mova m12, [r7-32* 3]
+ mova m13, [r7-32* 1]
+ mova m14, [r7+32* 1]
+ mova m15, [r7+32* 3]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ mova m0, [r7-32*100]
+ mova m1, [r7-32*98]
+ mova m2, [r7-32*96]
+ mova m3, [r7-32*94]
+ mova m4, [r7-32*68]
+ mova m5, [r7-32*66]
+ mova m6, [r7-32*64]
+ mova m7, [r7-32*62]
+ mova m8, [r7-32*36]
+ mova m9, [r7-32*34]
+ mova m10, [r7-32*32]
+ mova m11, [r7-32*30]
+ mova m12, [r7-32* 4]
+ mova m13, [r7-32* 2]
+ mova m14, [r7+32* 0]
+ mova m15, [r7+32* 2]
+ add r7, 32*8
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ sub dstq, r3
+ lea r2, [r2+r3+32]
+ add dstq, 32
+ cmp r7, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ pmulld m0, m14, [cq+128* 1]
+ pmulld m1, m14, [cq+128*31]
+ pmulld m2, m14, [cq+128*17]
+ pmulld m3, m14, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 7]
+ pmulld m1, m14, [cq+128*25]
+ pmulld m2, m14, [cq+128*23]
+ pmulld m3, m14, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 5]
+ pmulld m1, m14, [cq+128*27]
+ pmulld m2, m14, [cq+128*21]
+ pmulld m3, m14, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ pmulld m0, m14, [cq+128* 3]
+ pmulld m1, m14, [cq+128*29]
+ pmulld m2, m14, [cq+128*19]
+ pmulld m3, m14, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ pmulld m0, m14, [cq+128* 2]
+ pmulld m1, m14, [cq+128*14]
+ pmulld m2, m14, [cq+128*18]
+ pmulld m3, m14, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
+ pmulld m0, m14, [cq+128* 6]
+ pmulld m1, m14, [cq+128*10]
+ pmulld m2, m14, [cq+128*22]
+ pmulld m3, m14, [cq+128*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
+ pmulld m0, m14, [cq+128* 4]
+ pmulld m1, m14, [cq+128*12]
+ pmulld m2, m14, [cq+128*20]
+ pmulld m3, m14, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
+ pmulld m0, m14, [cq+128* 0]
+ pmulld m1, m14, [cq+128* 8]
+ pmulld m2, m14, [cq+128*16]
+ pmulld m3, m14, [cq+128*24]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ psrld m15, m11, 11 ; pd_1
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
+ IDCT64_SHIFT_TRANSPOSE 1
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ vpbroadcastd m14, [pd_2896]
+ lea r6, [rsp+32*7]
+ call .main
+ cmp eobd, 36
+ jl .fast
+ call .main
+ cmp eobd, 136
+ jl .fast
+ call .main
+ cmp eobd, 300
+ jl .fast
+ call .main
+ jmp .pass2
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
+.fast:
+ pxor m0, m0
+ lea r4, [rsp+32*135]
+.fast_loop:
+ REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+ add r6, 32*8
+ cmp r6, r4
+ jl .fast_loop
+.pass2:
+ lea r10, [r6-32*32]
+ lea r6, [pw_5+128]
+ lea r8, [strideq*4]
+ lea r9, [strideq*5]
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+ mova m0, [r10-32*100] ; in0
+ mova m1, [r10-32*96] ; in4
+ mova m2, [r10-32*68] ; in8
+ mova m3, [r10-32*64] ; in12
+ mova m4, [r10-32*36] ; in16
+ mova m5, [r10-32*32] ; in20
+ mova m6, [r10-32* 4] ; in24
+ mova m7, [r10+32* 0] ; in28
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ add r4, 32*8
+ mova [r4-32*4], m8
+ mova [r4-32*3], m9
+ mova [r4-32*2], m10
+ mova [r4-32*1], m11
+ mova [r4+32*0], m12
+ mova [r4+32*1], m13
+ mova [r4+32*2], m14
+ mova [r4+32*3], m15
+ mova m0, [r10-32*98] ; in2
+ mova m1, [r10-32*94] ; in6
+ mova m2, [r10-32*66] ; in10
+ mova m3, [r10-32*62] ; in14
+ mova m4, [r10-32*34] ; in18
+ mova m5, [r10-32*30] ; in22
+ mova m6, [r10-32* 2] ; in26
+ mova m7, [r10+32* 2] ; in30
+ lea r5, [r4+32*16]
+ add r4, 32*8
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova m0, [r10-32*99] ; in1
+ mova m1, [r10+32* 3] ; in31
+ mova m2, [r10-32*35] ; in17
+ mova m3, [r10-32*61] ; in15
+ mova m4, [r10-32*67] ; in9
+ mova m5, [r10-32*29] ; in23
+ mova m6, [r10-32* 3] ; in25
+ mova m7, [r10-32*93] ; in7
+ lea r6, [idct64_mul - 8]
+ add r4, 32*16
+ add r5, 32*32
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ mova m0, [r10-32*95] ; in5
+ mova m1, [r10-32* 1] ; in27
+ mova m2, [r10-32*31] ; in21
+ mova m3, [r10-32*65] ; in11
+ mova m4, [r10-32*63] ; in13
+ mova m5, [r10-32*33] ; in19
+ mova m6, [r10+32* 1] ; in29
+ mova m7, [r10-32*97] ; in3
+ add r6, 8
+ add r4, 32*8
+ sub r5, 32*8
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+ add r10, 32*8
+ sub dstq, r8
+ sub r4, 32*44
+ add dstq, 32
+ cmp r10, r4
+ jl .pass2_loop
+ RET
+ALIGN function_align
+.main:
+ lea r5, [idct64_mul_16bpc]
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128*31]
+ mova m2, [cq+128*17]
+ mova m3, [cq+128*15]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 7]
+ mova m1, [cq+128*25]
+ mova m2, [cq+128*23]
+ mova m3, [cq+128* 9]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 5]
+ mova m1, [cq+128*27]
+ mova m2, [cq+128*21]
+ mova m3, [cq+128*11]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128*29]
+ mova m2, [cq+128*19]
+ mova m3, [cq+128*13]
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128*14]
+ mova m2, [cq+128*18]
+ mova m3, [cq+128*30]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ mova m0, [cq+128* 6]
+ mova m1, [cq+128*10]
+ mova m2, [cq+128*22]
+ mova m3, [cq+128*26]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ mova m0, [cq+128* 4]
+ mova m1, [cq+128*12]
+ mova m2, [cq+128*20]
+ mova m3, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 8]
+ mova m2, [cq+128*16]
+ mova m3, [cq+128*24]
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
+ jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx16_avx512.asm b/third_party/dav1d/src/x86/itx16_avx512.asm
new file mode 100644
index 0000000000..d973655462
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_avx512.asm
@@ -0,0 +1,4133 @@
+; Copyright © 2022-2023, VideoLAN and dav1d authors
+; Copyright © 2022-2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23
+ db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
+ db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
+ db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
+idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51
+ db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
+ db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17
+ db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25
+iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23
+ db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
+ db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19
+ db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
+permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13
+ db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
+ db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15
+ db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
+permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2
+ db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6
+ db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7
+ db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3
+permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6
+ db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14
+ db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7
+ db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15
+idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25
+ db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
+ db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29
+ db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
+idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30
+ db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
+ db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31
+ db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63
+
+pw_2048_m2048: times 16 dw 2048
+pw_m2048_2048: times 16 dw -2048
+pw_2048: times 16 dw 2048
+
+; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-
+%macro COEF_PAIR 2-3 0 ; a, b, flags
+%if %3 == 1
+pd_%1_m%2: dd %1, %1, -%2, -%2
+%define pd_%1 (pd_%1_m%2 + 4*0)
+%define pd_m%2 (pd_%1_m%2 + 4*2)
+%elif %3 == 2
+pd_m%1_%2: dd -%1, -%1, %2, %2
+%define pd_m%1 (pd_m%1_%2 + 4*0)
+%define pd_%2 (pd_m%1_%2 + 4*2)
+%else
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3 == 3
+%define pd_%2_m%2 pd_%2
+dd -%2, -%2
+%endif
+%endif
+%endmacro
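+; For reference, two expansions of the macro above (both invocations appear
+; below): "COEF_PAIR 799, 2276, 1" emits pd_799_m2276: dd 799, 799, -2276, -2276
+; and aliases pd_799/pd_m2276 to its two halves, while "COEF_PAIR 2896, 3784, 3"
+; emits pd_2896_3784: dd 2896, 2896, 3784, 3784 followed by dd -3784, -3784 so
+; that pd_3784_m3784 can be loaded as a single 16-byte vector.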
+
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1189, 1
+COEF_PAIR 401, 1931
+COEF_PAIR 401, 3920
+COEF_PAIR 799, 2276, 1
+COEF_PAIR 799, 3406
+COEF_PAIR 799, 4017
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2598, 1931, 2
+COEF_PAIR 2598, 3612
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 3
+COEF_PAIR 2896, 3784, 3
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 1931
+COEF_PAIR 3166, 3612
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4017, 3406
+COEF_PAIR 4076, 1189
+COEF_PAIR 4076, 3612
+COEF_PAIR 4076, 3920
+COEF_PAIR 4091, 3973
+
+pb_32: times 4 db 32
+pw_5: times 2 dw 5
+pw_4096: times 2 dw 4096
+pw_8192: times 2 dw 8192
+pw_1697x16: times 2 dw 1697*16
+pw_2896x8: times 2 dw 2896*8
+pixel_10bpc_max: times 2 dw 0x03ff
+dconly_10bpc: times 2 dw 0x7c00
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+pd_1: dd 1
+pd_2: dd 2
+pd_1448: dd 1448
+pd_2048: dd 2048
+pd_3071: dd 3071 ; 1024 + 2048 - 1
+pd_3072: dd 3072 ; 1024 + 2048
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+
+cextern dup16_perm
+cextern int8_permA
+cextern idct_8x8_internal_8bpc_avx512icl.main
+cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_8x16_internal_8bpc_avx512icl.main
+cextern idct_8x16_internal_8bpc_avx512icl.main2
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_8x16_internal_8bpc_avx512icl.main2
+cextern idct_16x8_internal_8bpc_avx512icl.main
+cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_16x16_internal_8bpc_avx512icl.main
+cextern idct_16x16_internal_8bpc_avx512icl.main2
+cextern idct_16x16_internal_8bpc_avx512icl.main_fast
+cextern idct_16x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
+cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
+cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
+
+SECTION .text
+
+%define o_base (pw_2048+4*128)
+%define o_base_8bpc (int8_permA+64*18)
+%define o(x) (r5 - o_base + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+INIT_ZMM avx512icl
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = inv_dst1, 2 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %8 < 4096
+ vpbroadcastd m%3, [o(pd_%8)]
+%else
+ vbroadcasti32x4 m%3, [o(pd_%8)]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %7 < 4096
+ vpbroadcastd m%5, [o(pd_%7)]
+%else
+ vbroadcasti32x4 m%5, [o(pd_%7)]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+%if %9 & 1
+ psubd m%1, m%3, m%1
+%else
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
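+; Worked instance of the macro above, with operands taken from a use further
+; down: "ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784" (m13 = pd_2048) computes
+;   m4 = (m4*1567 - m2*3784 + 2048) >> 12
+;   m2 = (m4*3784 + m2*1567 + 2048) >> 12   (old m4/m2 on the right-hand side)
+; i.e. the (1567, 3784) ~ 4096*(cos, sin)(3*pi/8) rotation pair. When rnd is
+; given as "_" the rounding add and the final shift are left to the caller.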
+
+%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
+cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_10bpc)
+ lea r5, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
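+; e.g. "INV_TXFM_8X8_FN dct, adst" below expands to
+; inv_txfm_add_dct_adst_8x8_10bpc: it points tx2q at
+; iadst_8x8_internal_10bpc.pass2, then control reaches idct_8x8_internal_10bpc
+; for the first pass, which ends with "jmp tx2q" to run the second pass. Only
+; the dct_dct variants include the eob == 0 DC-only shortcut inline.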
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd ym2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw ym1, r6d
+ paddsw ym1, ym2
+.dconly_loop:
+ mova xm0, [dstq+strideq*0]
+ vinserti32x4 ym0, [dstq+strideq*1], 1
+ paddsw ym0, ym1
+ psubusw ym0, ym2
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call .load
+ vpermi2q m1, m0, m2 ; 1 5
+ vpermi2q m3, m6, m4 ; 7 3
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ call .main_end
+ mova m4, [o(idct8x8p)]
+ packssdw m0, m2 ; 0 1 4 5
+ packssdw m1, m3 ; 3 2 7 6
+ vpermb m0, m4, m0
+ vprolq m1, 32
+ vpermb m2, m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ call m(idct_8x8_internal_8bpc).main
+ mova m10, [permC]
+ vpbroadcastd m12, [pw_2048]
+.end:
+ vpermt2q m0, m10, m1
+ vpermt2q m2, m10, m3
+.end2:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ pmulhrsw m8, m12, m0
+ call .write_8x4_start
+ pmulhrsw m8, m12, m2
+.write_8x4:
+ lea dstq, [dstq+strideq*4]
+ add cq, 64*2
+.write_8x4_start:
+ mova xm9, [dstq+strideq*0]
+ vinserti32x4 ym9, [dstq+strideq*1], 1
+ vinserti32x4 m9, [dstq+strideq*2], 2
+ vinserti32x4 m9, [dstq+r6 ], 3
+ mova [cq+64*0], m10
+ mova [cq+64*1], m10
+ paddw m9, m8
+ pmaxsw m9, m10
+ pminsw m9, m11
+ mova [dstq+strideq*0], xm9
+ vextracti32x4 [dstq+strideq*1], ym9, 1
+ vextracti32x4 [dstq+strideq*2], m9, 2
+ vextracti32x4 [dstq+r6 ], m9, 3
+ ret
+ALIGN function_align
+.load:
+ mova m0, [cq+64*0] ; 0 1
+ mova m4, [cq+64*1] ; 2 3
+ mova m1, [o(permB)]
+ mova m2, [cq+64*2] ; 4 5
+ mova m6, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m5, m1, 32
+ vpbroadcastd m12, [o(pd_2896)]
+ mova m3, m1
+ vpbroadcastd m11, [o(pd_1)]
+ ret
+ALIGN function_align
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m3, [o(pd_4017_3406)]
+ vbroadcasti32x4 m8, [o(pd_799_m2276)]
+ vbroadcasti32x4 m2, [o(pd_2896_3784)]
+ vbroadcasti32x4 m9, [o(pd_2896_1567)]
+ pmulld m3, m1 ; t4a t5a
+ pmulld m1, m8 ; t7a t6a
+ pmulld m2, m0 ; t0 t3
+ pmulld m0, m9 ; t1 t2
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276
+ ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
+.main2:
+ REPX {paddd x, m13}, m1, m3, m0, m2
+ REPX {psrad x, 12 }, m1, m3, m0, m2
+ punpcklqdq m8, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m8, m1 ; t5a t6a
+ paddd m8, m1 ; t4 t7
+ pmaxsd m3, m14
+ punpckhqdq m1, m2, m0 ; t3 t2
+ pminsd m3, m15
+ punpcklqdq m2, m0 ; t0 t1
+ pmulld m3, m12
+ paddd m0, m2, m1 ; dct4 out0 out1
+ psubd m2, m1 ; dct4 out3 out2
+ REPX {pmaxsd x, m14}, m8, m0, m2
+ REPX {pminsd x, m15}, m8, m0, m2
+.main3:
+ pshufd m1, m3, q1032
+ paddd m3, m13
+ psubd m9, m3, m1
+ paddd m3, m1
+ psrad m9, 12
+ psrad m3, 12
+ punpckhqdq m1, m8, m3 ; t7 t6
+ shufpd m8, m9, 0xaa ; t4 t5
+ ret
+.main_end:
+ paddd m0, m11
+ paddd m2, m11
+ psubd m3, m0, m1 ; out7 out6
+ paddd m0, m1 ; out0 out1
+ paddd m1, m2, m8 ; out3 out2
+ psubd m2, m8 ; out4 out5
+ REPX {vpsravd x, m11}, m0, m2, m3, m1
+ ret
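+; m11 holds pd_1 here, so the paddd/vpsravd sequence above is a rounded
+; arithmetic shift right by 1 of the pass-1 outputs (vpsravd shifts each dword
+; by the corresponding element of m11).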
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+INV_TXFM_8X8_FN adst, adst
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ punpckldq m1, m2, m4 ; out4 out6
+ punpckhdq m2, m0 ; -out5 -out7
+ punpckldq m0, m3 ; out0 out2
+ punpckhdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.pass1_end:
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m0, m1 ; 0 2 4 6
+ packssdw m4, m3 ; 1 3 5 7
+ psrlq m1, [o(permB)], 8
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ psrlq m2, m1, 32
+ vpermi2q m1, m0, m3
+ vpermt2q m0, m2, m3
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ movu m10, [permC+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ jmp m(idct_8x8_internal_10bpc).end
+.main_pass2:
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ lea r5, [o_base_8bpc]
+ pshufd ym4, ym0, q1032
+ pshufd ym5, ym1, q1032
+ jmp m(iadst_8x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m14}, m4, m2, m0, m1
+ REPX {pminsd x, m15}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ shufpd m4, m2, 0xaa ; t4 t7
+ shufpd m2, m5, 0xaa ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m14}, m1, m2
+ REPX {pminsd x, m15}, m1, m2
+ shufpd m3, m1, m2, 0xaa
+ shufpd m1, m2, 0x55
+ pmulld m3, m12
+ pmulld m1, m12
+ paddd m3, m13
+ psubd m2, m3, m1
+ paddd m3, m1
+ psrad m2, 12 ; out4 -out5
+ pshufd m3, m3, q1032
+ psrad m3, 12 ; out2 -out3
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, identity
+INV_TXFM_8X8_FN flipadst, flipadst
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call m(iadst_8x8_internal_10bpc).main
+ punpckhdq m1, m3, m4 ; -out3 -out1
+ punpckldq m3, m0 ; out2 out0
+ punpckhdq m0, m2 ; -out7 -out5
+ punpckldq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_8x8_internal_10bpc).main_pass2
+ movu m10, [permC+1]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ lea r6, [strideq*3]
+ vpermt2q m0, m10, m1 ; 7 6 5 4
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m2, m10, m3 ; 3 2 1 0
+ pxor m10, m10
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m0
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ packssdw m1, [cq+64*2] ; 0 4 1 5
+ mova m2, [cq+64*1] ; 2 6 3 7
+ packssdw m2, [cq+64*3]
+ mova m0, [o(idtx8x8p)]
+ vpermb m1, m0, m1
+ vpermb m2, m0, m2
+ punpckldq m0, m1, m2 ; 0 1 4 5
+ punpckhdq m1, m2 ; 2 3 6 7
+ jmp tx2q
+.pass2:
+ movu m3, [o(permC+2)]
+ vpbroadcastd m12, [o(pw_4096)]
+ psrlq m2, m3, 32
+ vpermi2q m2, m0, m1
+ vpermt2q m0, m3, m1
+ jmp m(idct_8x8_internal_10bpc).end2
+
+%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, adst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call .load
+ call .main
+ call .main_end
+.pass1_end:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ jmp tx2q
+.pass2:
+ mova m8, [o(idct8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpcklqdq m8, m0, m2 ; 15 1
+ punpckhqdq m0, m2 ; 7 9
+ punpckhqdq m1, m5, m4 ; 3 13
+ punpcklqdq m5, m4 ; 11 5
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym7, m8, 1 ; 14 2
+ vextracti32x8 ym3, m0, 1 ; 6 10
+ vextracti32x8 ym6, m1, 1 ; 12 4
+ vextracti32x8 ym9, m5, 1 ; 8 0
+ call m(idct_8x16_internal_8bpc).main2
+ mova m8, [permC]
+ vpbroadcastd m12, [pw_2048]
+ vpermt2q m0, m8, m1
+ lea r6, [strideq*3]
+ vpermt2q m2, m8, m3
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m4, m8, m5
+ pxor m10, m10
+ vpermt2q m6, m8, m7
+ pmulhrsw m8, m12, m0
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*1]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*7]
+ mova ym7, [cq+64*3]
+ call .round_input_fast
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ movu m6, [o(permC+3)]
+ packssdw m3, m1, m3
+ packssdw m1, m0, m2
+ vprolq m3, 32
+ vpermd m1, m6, m1
+ vpermd m3, m6, m3
+ mova ym0, ym1 ; 0 4
+ vextracti32x8 ym1, m1, 1 ; 1 5
+ mova ym2, ym3 ; 2 6
+ vextracti32x8 ym3, m3, 1 ; 3 7
+ jmp tx2q
+ALIGN function_align
+.round_input_fast:
+ movshdup m8, [o(permB)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpermt2q m0, m8, m4
+ vpermt2q m1, m8, m5
+ vpermt2q m2, m8, m6
+ vpermt2q m3, m8, m7
+ vpbroadcastd m13, [o(pd_2048)]
+ REPX {pmulld x, m12}, m0, m1, m2, m3
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m11, [o(pd_1)]
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+ALIGN function_align
+.load:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+.load2:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m0, m12, [cq+64*0]
+ pmulld m1, m12, [cq+64*1]
+ pmulld m2, m12, [cq+64*2]
+ pmulld m3, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pmulld m4, m12, [cq+64*4]
+ pmulld m5, m12, [cq+64*5]
+ pmulld m6, m12, [cq+64*6]
+ pmulld m7, m12, [cq+64*7]
+.round:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ REPX {paddd x, m13}, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.main_fast_rect2:
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_fast:
+ pmulld m0, m12
+ pmulld m5, m3, [o(pd_2276)] {1to16} ; t5a
+ pmulld m3, [o(pd_3406)] {1to16} ; t6a
+ pmulld m7, m1, [o(pd_4017)] {1to16} ; t7a
+ pmulld m1, [o(pd_799)] {1to16} ; t4a
+ pmulld m6, m2, [o(pd_3784)] {1to16} ; t3
+ pmulld m2, [o(pd_1567)] {1to16} ; t2
+ paddd m0, m13
+ psubd m5, m13, m5
+ psrad m0, 12 ; t0
+ mova m9, m0 ; t1
+ jmp .main2
+.main_rect2:
+ call .round
+.main:
+ pmulld m0, m12
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
+ pmulld m4, m12
+ paddd m0, m13
+ paddd m5, m13
+ psubd m9, m0, m4 ; t1
+ paddd m0, m4 ; t0
+ psrad m9, 12
+ psrad m0, 12
+.main2:
+ REPX {paddd x, m13}, m3, m1, m7
+ REPX {psrad x, 12 }, m5, m1, m3, m7
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ psubd m5, m7, m3 ; t6a
+ paddd m7, m3 ; t7
+ pmaxsd m5, m14
+ pmaxsd m1, m14
+ paddd m2, m13
+ paddd m6, m13
+ pminsd m5, m15
+ pminsd m1, m15
+ pmulld m5, m12
+ pmulld m1, m12
+ pmaxsd m8, m14
+ pmaxsd m7, m14
+ pminsd m8, m15
+ paddd m5, m13
+ psubd m4, m5, m1
+ paddd m5, m1
+ REPX {psrad x, 12 }, m2, m6, m5, m4
+ paddd m1, m9, m2 ; dct4 out1
+ psubd m2, m9, m2 ; dct4 out2
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ pminsd m6, m15, m7
+ REPX {pmaxsd x, m14}, m0, m1, m2, m3
+ REPX {pminsd x, m15}, m0, m1, m2, m3
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_1)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ psubd m6, m1, m5 ; out6
+ paddd m1, m5 ; out1
+ psubd m5, m2, m4 ; out5
+ paddd m2, m4 ; out2
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, identity, 35
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, adst
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call .main
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m10, 1
+ psrad m7, m11, 1
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call .fast_main
+ punpcklqdq m1, m2, m4 ; out4 out6
+ punpckhqdq m2, m0 ; -out5 -out7
+ punpcklqdq m0, m3 ; out0 out2
+ punpckhqdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.fast_end:
+ movu m5, [o(permC+3)]
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m2, m0, m1 ; 0 2 4 6
+ packssdw m3, m4, m3 ; 1 3 5 7
+ vpermd m2, m5, m2
+ vpermd m3, m5, m3
+ mova ym0, ym2
+ vextracti32x8 ym2, m2, 1
+ mova ym1, ym3
+ vextracti32x8 ym3, m3, 1
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ movu m4, [permB+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ psrlq m7, m4, 8
+ vpermi2q m4, m0, m3 ; 0 1 2 3
+ psrlq m5, m7, 24
+ vpermi2q m7, m0, m3 ; 12 13 14 15
+ psrlq m6, m5, 8
+ vpermq m5, m5, m1 ; 4 5 6 7
+ vpermq m6, m6, m2 ; 8 9 10 11
+.pass2_end:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ pxor m10, m10
+ lea r6, [strideq*3]
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m5
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m7
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
+ vpbroadcastd m10, [o(pd_1567)]
+ vpbroadcastd m11, [o(pd_3784)]
+ ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
+ vpbroadcastd m12, [o(pd_1448)]
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m14}, m5, m3, m2, m9
+ REPX {pminsd x, m15}, m5, m3, m2, m9
+ REPX {pmulld x, m12}, m5, m3, m2, m9
+ vpbroadcastd m4, [o(pd_1)]
+ psubd m8, m5, m3 ; (t2 - t3) * 1448
+ paddd m3, m5 ; (t2 + t3) * 1448
+ psubd m5, m2, m9 ; (t6 - t7) * 1448
+ paddd m2, m9 ; (t6 + t7) * 1448
+ vpbroadcastd m9, [o(pd_3072)]
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m10, m6, m4
+ psubd m11, m4, m7
+ paddd m2, m9
+ paddd m8, m9
+ vpbroadcastd m9, [o(pd_3071)]
+ psubd m3, m9, m3
+ psubd m9, m5
+ ret
+ALIGN function_align
+.fast_main:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*7]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*3]
+ mova ym7, [cq+64*1]
+ call m(idct_8x16_internal_10bpc).round_input_fast
+ jmp m(iadst_8x8_internal_10bpc).main
+ALIGN function_align
+.pass2_main:
+ mova m8, [o(iadst8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ vpbroadcastd m10, [o(pw_2896x8)]
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m2, m3
+ punpckldq m2, m3
+ lea r5, [o_base_8bpc]
+ punpckhqdq m4, m0, m2 ; 12 3 14 1
+ punpcklqdq m0, m2 ; 0 15 2 13
+ punpckhqdq m6, m5, m1 ; 8 7 10 5
+ punpcklqdq m5, m1 ; 4 11 6 9
+ call m(iadst_8x16_internal_8bpc).main2
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m10 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m10 ; out8 -out11 -out9 out10
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, identity, 35
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call m(iadst_8x16_internal_10bpc).main
+ psrad m7, m0, 1
+ psrad m0, m11, 1
+ psrad m6, m1, 1
+ psrad m1, m10, 1
+ psrad m5, m2, 12
+ psrad m2, m9, 12
+ psrad m4, m3, 12
+ psrad m3, m8, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_8x16_internal_10bpc).fast_main
+ punpckhqdq m1, m3, m4 ; -out3 -out1
+ punpcklqdq m3, m0 ; out2 out0
+ punpckhqdq m0, m2 ; -out7 -out5
+ punpcklqdq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x16_internal_10bpc).fast_end
+.pass2:
+ call m(iadst_8x16_internal_10bpc).pass2_main
+ movu m7, [permB+2]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ psrlq m4, m7, 8
+ vpermi2q m7, m3, m0 ; 3 2 1 0
+ psrlq m5, m4, 24
+ vpermi2q m4, m3, m0 ; 15 14 13 12
+ psrlq m6, m5, 8
+ vpermq m5, m5, m2 ; 11 10 9 8
+ vpermq m6, m6, m1 ; 7 6 5 4
+ jmp m(iadst_8x16_internal_10bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m7, [o(pw_2048)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ vpbroadcastd m6, [o(pixel_10bpc_max)]
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ pxor m5, m5
+ punpckhqdq m1, m0, m2 ; 1 5 9 13
+ punpcklqdq m0, m2 ; 0 4 8 12
+ punpcklqdq m2, m3, m4 ; 2 6 10 14
+ punpckhqdq m3, m4 ; 3 7 11 15
+ lea r6, [strideq*3]
+ pmulhrsw m0, m7
+ call .write_8x4_start
+ pmulhrsw m0, m7, m1
+ call .write_8x4
+ pmulhrsw m0, m7, m2
+ call .write_8x4
+ pmulhrsw m0, m7, m3
+.write_8x4:
+ add dstq, strideq
+ add cq, 64*2
+.write_8x4_start:
+ mova xm4, [dstq+strideq*0]
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ vinserti32x4 m4, [dstq+strideq*8], 2
+ vinserti32x4 m4, [dstq+r6*4 ], 3
+ mova [cq+64*0], m5
+ mova [cq+64*1], m5
+ paddw m4, m0
+ pmaxsw m4, m5
+ pminsw m4, m6
+ mova [dstq+strideq*0], xm4
+ vextracti32x4 [dstq+strideq*4], ym4, 1
+ vextracti32x4 [dstq+strideq*8], m4, 2
+ vextracti32x4 [dstq+r6*4 ], m4, 3
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd m2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m1, r6d
+ paddsw m1, m2
+.dconly_loop:
+ mova ym0, [dstq+strideq*0]
+ vinserti32x8 m0, [dstq+strideq*1], 1
+ paddsw m0, m1
+ psubusw m0, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, -21
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, adst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m4, m12, [cq+64*0] ; 0 1
+ pmulld m9, m12, [cq+64*1] ; 2 3
+ pmulld m8, m12, [cq+64*2] ; 4 5
+ pmulld m7, m12, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m2, m2
+ mova m15, [o(permB)]
+ REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
+ psrlq m0, m15, 32
+ REPX {paddd x, m13}, m4, m9, m8, m7
+ vpbroadcastd m14, [o(clip_18b_min)]
+ REPX {psrad x, 12 }, m4, m8, m9, m7
+ mova m1, m0
+ vpermi2q m0, m4, m8 ; 0 4
+ cmp eobd, 43
+ jl .fast
+ pmulld m5, m12, [cq+64*4] ; 8 9
+ pmulld m10, m12, [cq+64*5] ; 10 11
+ pmulld m11, m12, [cq+64*6] ; 12 13
+ pmulld m6, m12, [cq+64*7] ; 14 15
+ REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m5, m10, m11, m6
+ REPX {psrad x, 12 }, m10, m5, m11, m6
+ mova m2, m1
+ vpermi2q m1, m9, m10 ; 2 10
+ mova m3, m2
+ vpermi2q m2, m5, m11 ; 8 12
+ vpermi2q m3, m6, m7 ; 14 6
+ vpermt2q m4, m15, m11 ; 1 13
+ vpermt2q m6, m15, m9 ; 15 3
+ vpermt2q m5, m15, m8 ; 9 5
+ vpermt2q m7, m15, m10 ; 7 11
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main
+ call .main
+ jmp .pass1_end
+.fast:
+ vpermi2q m1, m9, m7 ; 2 6
+ vpermt2q m4, m15, m9 ; 1 3
+ vpermt2q m7, m15, m8 ; 7 5
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main_fast
+ call .main_fast
+.pass1_end:
+ call m(idct_8x16_internal_10bpc).main_end
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+.pass1_end2:
+ mova m10, m9
+ mova m11, m8
+ call .transpose_16x8
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x8_internal_8bpc).main
+ movshdup m4, [permC]
+ vpbroadcastd m11, [pw_2048]
+ psrlq m5, m4, 8
+.end:
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ vpermq m8, m4, m0
+ vpermq m9, m5, m1
+ lea r6, [strideq*3]
+ call .write_16x4
+ vpermq m8, m4, m2
+ vpermq m9, m5, m3
+.write_16x4:
+ pmulhrsw m8, m11
+ pmulhrsw m9, m11
+.write_16x4_noround:
+ mova ym10, [dstq+strideq*0]
+ vinserti32x8 m10, [dstq+strideq*1], 1
+ paddw m8, m10
+ mova ym10, [dstq+strideq*2]
+ vinserti32x8 m10, [dstq+r6 ], 1
+ paddw m9, m10
+ pmaxsw m8, m12
+ pmaxsw m9, m12
+ pminsw m8, m13
+ pminsw m9, m13
+ mova [dstq+strideq*0], ym8
+ vextracti32x8 [dstq+strideq*1], m8, 1
+ mova [dstq+strideq*2], ym9
+ vextracti32x8 [dstq+r6 ], m9, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ vbroadcasti32x4 m5, [o(pd_m2598_1931)]
+ vbroadcasti32x4 m9, [o(pd_3166_3612)]
+ pmulld m6, m4 ; t15a t12a
+ pmulld m4, m3 ; t8a t11a
+ pmulld m5, m7 ; t9a t10a
+ pmulld m7, m9 ; t14a t13a
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189
+ ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
+.main2:
+ REPX {paddd x, m13}, m4, m6, m5, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ paddd m9, m4, m5 ; t8 t11
+ psubd m4, m5 ; t9 t10
+ psubd m5, m6, m7 ; t14 t13
+ paddd m6, m7 ; t15 t12
+ REPX {pmaxsd x, m14}, m5, m4, m9, m6
+ REPX {pminsd x, m15}, m5, m4, m9, m6
+.main3:
+ psubd m3, m0, m1 ; dct8 out7 out6
+ paddd m0, m1 ; dct8 out0 out1
+ vbroadcasti32x4 m7, [o(pd_3784_m3784)]
+ pmulld m7, m5
+ vpmulld m5, [o(pd_1567)] {1to16}
+ paddd m1, m2, m8 ; dct8 out3 out2
+ psubd m2, m8 ; dct8 out4 out5
+ vbroadcasti32x4 m8, [o(pd_1567_m1567)]
+ pmulld m8, m4
+ vpmulld m4, [o(pd_3784)] {1to16}
+ REPX {pmaxsd x, m14}, m0, m1
+ REPX {pminsd x, m15}, m0, m1
+ paddd m7, m13
+ paddd m5, m13
+ paddd m7, m8
+ psubd m5, m4
+ psrad m7, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ punpckhqdq m4, m9, m7
+ punpcklqdq m8, m9, m5
+ punpckhqdq m5, m6, m5
+ punpcklqdq m6, m7
+ psubd m7, m8, m4 ; t11a t10
+ paddd m8, m4 ; t8a t9
+ psubd m4, m6, m5 ; t12a t13
+ paddd m6, m5 ; t15a t14
+ REPX {pmaxsd x, m14}, m4, m7
+ REPX {pminsd x, m15}, m4, m7
+ pmulld m4, m12
+ pmulld m7, m12
+ REPX {pmaxsd x, m14}, m2, m3, m6, m8
+ REPX {pminsd x, m15}, m2, m3, m6, m8
+ paddd m4, m13
+ paddd m5, m4, m7
+ psubd m4, m7
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ ret
+ALIGN function_align
+.transpose_16x8:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpermi2d m8, m0, m2
+ vpermt2d m0, m9, m2
+ vpermi2d m10, m1, m3
+ vpermi2d m11, m1, m3
+ punpckhwd m3, m8, m0
+ punpcklwd m1, m8, m0
+ punpckhwd m4, m10, m11
+ punpcklwd m2, m10, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, identity, -21
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, adst
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ call .main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp m(idct_16x8_internal_10bpc).pass1_end2
+.pass2:
+ call .main_pass2
+ vpermq m8, m11, m0
+ vpermq m9, m11, m1
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ vpermq m8, m11, m2
+ vpermq m9, m11, m3
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m2, m12, [cq+64*0]
+ pmulld m7, m12, [cq+64*1]
+ pmulld m1, m12, [cq+64*2]
+ pmulld m5, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m4, m4
+ mova m10, [o(permB)]
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ REPX {paddd x, m13}, m2, m7, m1, m5
+ psrlq m6, m10, 32
+ REPX {psrad x, 12 }, m2, m7, m1, m5
+ mova m0, m6
+ vpermi2q m0, m2, m7 ; 0 2
+ vpermt2q m7, m10, m2 ; 3 1
+ mova m2, m6
+ vpermi2q m2, m1, m5 ; 4 6
+ vpermt2q m5, m10, m1 ; 7 5
+ cmp eobd, 43
+ jl .main_fast
+ pmulld m8, m12, [cq+64*4]
+ pmulld m3, m12, [cq+64*5]
+ pmulld m9, m12, [cq+64*6]
+ pmulld m1, m12, [cq+64*7]
+ REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m8, m3, m9, m1
+ REPX {psrad x, 12 }, m8, m3, m9, m1
+ mova m4, m6
+ vpermi2q m4, m8, m3 ; 8 10
+ vpermt2q m3, m10, m8 ; 11 9
+ vpermi2q m6, m9, m1 ; 12 14
+ vpermt2q m1, m10, m9 ; 15 13
+.main:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601
+ jmp .main2
+.main_fast:
+ vbroadcasti32x4 m1, [o(pd_4091_3973)]
+ vbroadcasti32x4 m8, [o(pd_201_995)]
+ vbroadcasti32x4 m3, [o(pd_3703_3290)]
+ vbroadcasti32x4 m9, [o(pd_1751_2440)]
+ vbroadcasti32x4 m4, [o(pd_2751_2106)]
+ vbroadcasti32x4 m10, [o(pd_3035_3513)]
+ vbroadcasti32x4 m6, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m1, m0
+ pmulld m0, m8
+ pmulld m3, m2
+ pmulld m2, m9
+ pmulld m4, m5
+ pmulld m5, m10
+ pmulld m6, m7
+ pmulld m7, m11
+.main2:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {psubd x, m13, x}, m1, m3
+ REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m14}, m8, m4, m5, m6
+ REPX {pminsd x, m15}, m8, m4, m5, m6
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m2, m1, m3
+ REPX {pminsd x, m15}, m0, m2, m1, m3
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m14}, m7, m3, m2, m6
+ REPX {pminsd x, m15}, m7, m3, m2, m6
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ vpbroadcastd m11, [o(pd_1567)]
+ vpbroadcastd m10, [o(pd_3784)]
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m4, m1, m8
+ REPX {pminsd x, m15}, m0, m4, m1, m8
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m14}, m6, m5, m3, m4
+ mov r6d, 0x3333
+ REPX {pminsd x, m15}, m6, m5, m3, m4
+ kmovw k1, r6d
+ REPX {pmulld x, m12}, m6, m5, m3, m4
+ pxor m9, m9
+ REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
+ paddd m6, m13
+ paddd m4, m13
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
+ ret
+ALIGN function_align
+.main_pass2:
+ lea r5, [o_base_8bpc]
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m11, [permC]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ lea r6, [strideq*3]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, identity, -21
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_10bpc).main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_16x8_internal_10bpc).main_pass2
+ psrlq m11, 8
+ vpermq m8, m11, m3
+ vpermq m9, m11, m2
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ vpermq m8, m11, m1
+ vpermq m9, m11, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ vpbroadcastd m8, [o(pd_5793)]
+ vpbroadcastd m13, [o(pd_3072)]
+ pxor m10, m10
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).round
+ psrlq m8, [o(permA)], 16
+ psrlq m9, m8, 8
+ mova m10, m8
+ mova m11, m9
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ jmp tx2q
+.pass2:
+ movshdup m4, [o(permC)]
+ vpbroadcastd m11, [o(pw_4096)]
+ mova m5, m4
+ jmp m(idct_16x8_internal_10bpc).end
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, adst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+64* 1]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m22, [cq+64*13]
+ mova m23, [cq+64*15]
+ call .main
+ call .main_end
+.pass1_end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+.pass1_end2:
+ call .main_end3
+.pass1_end3:
+ mov r6d, 64*12
+ pxor m8, m8
+.zero_loop:
+ mova [cq+r6+64*3], m8
+ mova [cq+r6+64*2], m8
+ mova [cq+r6+64*1], m8
+ mova [cq+r6+64*0], m8
+ sub r6d, 64*4
+ jge .zero_loop
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x16_internal_8bpc).main
+ movshdup m12, [permC]
+ vpbroadcastd m11, [pw_2048]
+ psrlq m13, m12, 8
+ vpermq m8, m12, m0
+ vpermq m0, m13, m7
+ vpermq m7, m13, m1
+ vpermq m1, m12, m6
+ vpermq m6, m12, m2
+ vpermq m2, m13, m5
+ vpermq m5, m13, m3
+ vpermq m3, m12, m4
+.pass2_end:
+ lea r6, [strideq*3]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ pxor m12, m12
+ pmulhrsw m8, m11, m8
+ pmulhrsw m9, m11, m7
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m6
+ pmulhrsw m9, m11, m5
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m3
+ pmulhrsw m9, m11, m2
+ call m(idct_16x8_internal_10bpc).write_16x4_noround
+ pmulhrsw m8, m11, m1
+ pmulhrsw m9, m11, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4_noround
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym2, [cq+64*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+64*2]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*1]
+ mova ym5, [cq+64*3]
+ mova ym6, [cq+64*5]
+ mova ym7, [cq+64*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
+ALIGN function_align
+.main_fast_rect2:
+ REPX {paddd x, m13}, m16, m17, m18, m19
+ REPX {psrad x, 12 }, m16, m17, m18, m19
+.main_fast:
+ pmulld m23, m16, [o(pd_4076)] {1to16} ; t15a
+ pmulld m16, [o(pd_401)] {1to16} ; t8a
+ pmulld m20, m19, [o(pd_2598)] {1to16} ; t9a
+ pmulld m19, [o(pd_3166)] {1to16} ; t14a
+ pmulld m22, m17, [o(pd_1189)] {1to16} ; t11a
+ pmulld m17, [o(pd_3920)] {1to16} ; t12a
+ pmulld m21, m18, [o(pd_3612)] {1to16} ; t13a
+ pmulld m18, [o(pd_1931)] {1to16} ; t10a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ call .round2
+ jmp .main2
+.main_rect2:
+ call .round
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
+ call .round
+.main2:
+ paddd m9, m20, m16 ; t8
+ psubd m20, m16, m20 ; t9
+ psubd m16, m22, m18 ; t10
+ paddd m18, m22 ; t11
+ paddd m22, m23, m19 ; t15
+ psubd m23, m19 ; t14
+ psubd m19, m17, m21 ; t13
+ paddd m17, m21 ; t12
+ vpbroadcastd m11, [o(pd_3784)]
+ REPX {pmaxsd x, m14}, m20, m23, m16, m19
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m20, m23, m16, m19
+ ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
+ ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m9, m18, m22, m17
+ REPX {pminsd x, m15}, m9, m18, m22, m17
+ paddd m21, m20, m19 ; t14
+ psubd m20, m19 ; t13
+ psubd m19, m9, m18 ; t11a
+ paddd m9, m18 ; t8a
+ psubd m18, m23, m16 ; t10
+ paddd m16, m23 ; t9
+ psubd m23, m22, m17 ; t12a
+ paddd m22, m17 ; t15a
+ REPX {pmaxsd x, m14}, m20, m23, m18, m19
+ REPX {pminsd x, m15}, m20, m23, m18, m19
+ REPX {pmulld x, m12}, m20, m23, m18, m19
+ psubd m7, m0, m6 ; dct8 out7
+ paddd m0, m6 ; dct8 out0
+ psubd m6, m1, m5 ; dct8 out6
+ paddd m1, m5 ; dct8 out1
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1
+ psubd m5, m2, m4 ; dct8 out5
+ paddd m2, m4 ; dct8 out2
+ REPX {pminsd x, m15}, m7, m0, m6, m1
+ psubd m4, m3, m8 ; dct8 out4
+ paddd m3, m8 ; dct8 out3
+ REPX {pmaxsd x, m14}, m5, m2, m4, m3
+ paddd m20, m13
+ paddd m23, m13
+ REPX {pminsd x, m15}, m5, m2, m4, m3
+ psubd m17, m20, m18 ; t10a
+ paddd m20, m18 ; t13a
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ psubd m18, m23, m19 ; t11
+ paddd m19, m23 ; t12
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_2)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m23, m0, m22 ; out15
+ paddd m0, m22 ; out0
+ psubd m22, m1, m21 ; out14
+ paddd m1, m21 ; out1
+ psubd m21, m2, m20 ; out13
+ paddd m2, m20 ; out2
+ psubd m20, m3, m19 ; out12
+ paddd m3, m19 ; out3
+ psubd m19, m4, m18 ; out11
+ paddd m4, m18 ; out4
+ psubd m18, m5, m17 ; out10
+ paddd m5, m17 ; out5
+ psubd m17, m6, m16 ; out9
+ paddd m6, m16 ; out6
+ psubd m16, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
+ m4, m20, m5, m21, m6, m22, m7, m23
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m20
+ packssdw m5, m21
+ packssdw m6, m22
+ packssdw m7, m23
+ ret
+.main_end3:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ punpckhdq m7, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m3, m6
+ punpckldq m3, m6
+ vshufi32x4 m6, m0, m4, q3232
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m4, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m7, ym1, 1
+ vshufi32x4 m7, m1, q3232
+ vshufi32x4 m1, m2, m5, q3232
+ vinserti32x8 m2, ym5, 1
+ vshufi32x4 m5, m7, m1, q2020 ; 10 11
+ vshufi32x4 m7, m1, q3131 ; 14 15
+ vshufi32x4 m1, m3, m2, q2020 ; 2 3
+ vshufi32x4 m3, m2, q3131 ; 6 7
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ ret
+ALIGN function_align
+.round:
+ paddd m20, m13
+ paddd m22, m13
+.round2:
+ paddd m16, m13
+ paddd m18, m13
+.round3:
+ REPX {psrad x, 12 }, m16, m18, m20, m22
+ REPX {paddd x, m13}, m17, m19, m21, m23
+ REPX {psrad x, 12 }, m17, m19, m21, m23
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, flipadst
+INV_TXFM_16X16_FN adst, adst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call .main_pass1
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m5, m20
+ packssdw m5, m6, m21
+ packssdw m6, m7, m22
+ packssdw m7, m8, m23
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call .main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_fast_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+.pass1_fast_end2:
+ mova m10, m9
+ mova m11, m8
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m12, [permC]
+ mova m11, [pw_2048_m2048]
+ psrlq m13, m12, 8
+ vpermq m8, m13, m0
+ vpermq m0, m12, m7
+ vpermq m7, m13, m1
+ vpermq m1, m12, m6
+ vpermq m6, m13, m2
+ vpermq m2, m12, m5
+ vpermq m5, m13, m3
+ vpermq m3, m12, m4
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+ALIGN function_align
+.main_pass1:
+ mova m0, [cq+64* 0]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m23, [cq+64*15]
+ vpbroadcastd m13, [o(pd_2048)]
+ ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0
+ mova m7, [cq+64* 7]
+ mova m16, [cq+64* 8]
+ ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8
+ mova m2, [cq+64* 2]
+ mova m21, [cq+64*13]
+ ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2
+ mova m5, [cq+64* 5]
+ mova m18, [cq+64*10]
+ ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
+ mova m4, [cq+64* 4]
+ mova m19, [cq+64*11]
+ ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4
+ mova m3, [cq+64* 3]
+ mova m20, [cq+64*12]
+ ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
+ mova m6, [cq+64* 6]
+ mova m17, [cq+64* 9]
+ ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6
+ mova m1, [cq+64* 1]
+ mova m22, [cq+64*14]
+ ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psubd m9, m23, m7 ; t9a
+ paddd m23, m7 ; t1a
+ psubd m7, m2, m18 ; t10a
+ paddd m18, m2 ; t2a
+ REPX {pmaxsd x, m14}, m9, m23, m7, m18
+ psubd m2, m17, m1 ; t15a
+ paddd m17, m1 ; t7a
+ REPX {pminsd x, m15}, m9, m23, m7, m18
+ psubd m1, m21, m5 ; t11a
+ paddd m21, m5 ; t3a
+ REPX {pmaxsd x, m14}, m2, m17, m1, m21
+ psubd m5, m4, m20 ; t12a
+ paddd m4, m20 ; t4a
+ REPX {pminsd x, m15}, m2, m17, m1, m21
+ psubd m20, m19, m3 ; t13a
+ paddd m19, m3 ; t5a
+ REPX {pmaxsd x, m14}, m5, m4, m20, m19
+ psubd m8, m6, m22 ; t14a
+ paddd m6, m22 ; t6a
+ REPX {pminsd x, m15}, m5, m4, m20, m19
+ psubd m22, m0, m16 ; t8a
+ paddd m16, m0 ; t0a
+ REPX {pmaxsd x, m14}, m8, m6, m22, m16
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ REPX {pminsd x, m15}, m8, m6, m22, m16
+ ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8
+ ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10
+ ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15
+ paddd m0, m16, m4 ; t0
+ psubd m16, m4 ; t4
+ psubd m3, m23, m19 ; t5
+ paddd m23, m19 ; t1
+ REPX {pmaxsd x, m14}, m0, m16, m3, m23
+ psubd m19, m18, m6 ; t6
+ paddd m18, m6 ; t2
+ REPX {pminsd x, m15}, m0, m16, m3, m23
+ psubd m6, m21, m17 ; t7
+ paddd m21, m17 ; t3
+ REPX {pmaxsd x, m14}, m19, m18, m6, m21
+ paddd m17, m9, m20 ; t8a
+ psubd m9, m20 ; t12a
+ REPX {pminsd x, m15}, m19, m18, m6, m21
+ psubd m20, m22, m5 ; t13a
+ paddd m22, m5 ; t9a
+ REPX {pmaxsd x, m14}, m17, m9, m20, m22
+ psubd m5, m1, m2 ; t14a
+ paddd m1, m2 ; t10a
+ REPX {pminsd x, m15}, m17, m9, m20, m22
+ psubd m2, m7, m8 ; t15a
+ paddd m7, m8 ; t11a
+ REPX {pmaxsd x, m14}, m5, m1, m2, m7
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m5, m1, m2, m7
+ ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a
+ ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
+ ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
+ ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15
+ psubd m8, m0, m18 ; t2a
+ paddd m0, m18 ; out0
+ psubd m18, m23, m21 ; t3a
+ paddd m23, m21 ; -out15
+ paddd m21, m9, m5 ; -out13
+ psubd m9, m5 ; t15a
+ psubd m5, m3, m6 ; t6
+ paddd m3, m6 ; -out3
+ REPX {pmaxsd x, m14}, m8, m18, m9, m5
+ psubd m6, m20, m2 ; t14a
+ paddd m2, m20 ; out2
+ paddd m20, m16, m19 ; out12
+ psubd m16, m19 ; t7
+ REPX {pminsd x, m15}, m8, m18, m9, m5
+ psubd m19, m22, m7 ; t11
+ paddd m22, m7 ; out14
+ psubd m7, m17, m1 ; t10
+ paddd m1, m17 ; -out1
+ REPX {pmaxsd x, m14}, m6, m16, m19, m7
+ vpbroadcastd m12, [o(pd_1448)]
+ vpbroadcastd m4, [o(pd_2)]
+ vpbroadcastd m10, [o(pd_5120)]
+ vpbroadcastd m11, [o(pd_5119)]
+ REPX {pminsd x, m15}, m6, m16, m19, m7
+ psubd m17, m7, m19 ; -out9
+ paddd m7, m19 ; out6
+ psubd m19, m5, m16 ; -out11
+ paddd m5, m16 ; out4
+ REPX {pmulld x, m12}, m17, m7, m19, m5
+ psubd m16, m8, m18 ; out8
+ paddd m8, m18 ; -out7
+ psubd m18, m6, m9 ; out10
+ paddd m6, m9 ; -out5
+ REPX {pmulld x, m12}, m16, m8, m18, m6
+ REPX {paddd x, m4 }, m0, m2, m20, m22
+ REPX {psubd x, m4, x}, m1, m3, m21, m23
+ REPX {paddd x, m10 }, m7, m5, m16, m18
+ REPX {psubd x, m11, x}, m17, m19, m8, m6
+ REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
+ REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
+ ret
+ALIGN function_align
+.main_pass1_fast:
+ mova ym0, [cq+64*0]
+ mova ym1, [cq+64*2]
+ movshdup m8, [o(permB)]
+ mova ym6, [cq+64*1]
+ mova ym7, [cq+64*3]
+ mova ym2, [cq+64*4]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*5]
+ mova ym5, [cq+64*7]
+ vpermt2q m0, m8, m1 ; 0 2
+ vpermt2q m7, m8, m6 ; 3 1
+ vpermt2q m2, m8, m3 ; 4 6
+ vpermt2q m5, m8, m4 ; 7 5
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ jmp m(iadst_16x8_internal_10bpc).main_fast
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call m(iadst_16x16_internal_10bpc).main_pass1
+ packssdw m4, m19, m3
+ packssdw m3, m20, m5
+ packssdw m5, m18, m2
+ packssdw m2, m21, m6
+ packssdw m6, m17, m1
+ packssdw m1, m22, m7
+ packssdw m7, m16, m0
+ packssdw m0, m23, m8
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_16x16_internal_10bpc).main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m12, [permC]
+ movu m11, [pw_m2048_2048]
+ psrlq m13, m12, 8
+ vpermq m8, m13, m7
+ vpermq m7, m13, m6
+ vpermq m6, m13, m5
+ vpermq m5, m13, m4
+ vpermq m3, m12, m3
+ vpermq m2, m12, m2
+ vpermq m1, m12, m1
+ vpermq m0, m12, m0
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m10, [o(pd_5793)]
+ vpbroadcastd m11, [o(pd_5120)]
+ mov r6, cq
+ cmp eobd, 36
+ jl .fast
+ call .pass1_main
+ packssdw m0, m6, m8
+ packssdw m1, m7, m9
+ call .pass1_main
+ packssdw m2, m6, m8
+ packssdw m3, m7, m9
+ call .pass1_main
+ packssdw m4, m6, m8
+ packssdw m5, m7, m9
+ call .pass1_main
+ packssdw m6, m8
+ packssdw m7, m9
+ jmp m(idct_16x16_internal_10bpc).pass1_end2
+.fast:
+ call .pass1_main_fast
+ packssdw m0, m6, m7
+ call .pass1_main_fast
+ packssdw m1, m6, m7
+ call .pass1_main_fast
+ packssdw m2, m6, m7
+ call .pass1_main_fast
+ packssdw m3, m6, m7
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckldq m3, m4, m1
+ punpckhdq m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ pxor m7, m7
+ vshufi32x4 m2, m0, m3, q3131
+ vshufi32x4 m0, m3, q2020
+ vshufi32x4 m3, m1, m4, q3131
+ vshufi32x4 m1, m4, q2020
+ REPX {mova x, m7}, m4, m5, m6
+ jmp m(idct_16x16_internal_10bpc).pass1_end3
+.pass2:
+ movshdup m14, [o(permC)]
+ vpbroadcastd m15, [o(pw_1697x16)]
+ lea r6, [strideq*3]
+ vpbroadcastd m11, [o(pw_2048)]
+ pxor m12, m12
+ vpbroadcastd m13, [pixel_10bpc_max]
+ vpermq m8, m14, m0
+ vpermq m9, m14, m1
+ call .pass2_main
+ vpermq m8, m14, m2
+ vpermq m9, m14, m3
+ call .pass2_main
+ vpermq m8, m14, m4
+ vpermq m9, m14, m5
+ call .pass2_main
+ vpermq m8, m14, m6
+ vpermq m9, m14, m7
+.pass2_main:
+ pmulhrsw m0, m15, m8
+ pmulhrsw m1, m15, m9
+ paddsw m8, m8
+ paddsw m9, m9
+ paddsw m8, m0
+ paddsw m9, m1
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+ALIGN function_align
+.pass1_main:
+ pmulld m6, m10, [r6+64*0]
+ pmulld m7, m10, [r6+64*1]
+ pmulld m8, m10, [r6+64*8]
+ pmulld m9, m10, [r6+64*9]
+ add r6, 64*2
+ REPX {paddd x, m11}, m6, m7, m8, m9
+ REPX {psrad x, 13 }, m6, m8, m7, m9
+ ret
+ALIGN function_align
+.pass1_main_fast:
+ mova ym6, [r6+64* 0]
+ vinserti32x8 m6, [r6+64* 4], 1
+ mova ym7, [r6+64* 8]
+ vinserti32x8 m7, [r6+64*12], 1
+ add r6, 64
+ REPX {pmulld x, m10}, m6, m7
+ REPX {paddd x, m11}, m6, m7
+ REPX {psrad x, 13 }, m6, m7
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ vpbroadcastd m11, [o(pd_2)]
+ mova m20, [o(idct8x32p)]
+ pxor m21, m21
+ cmp eobd, 43
+ jl .fast
+ call .pass1_main
+ punpcklwd m16, m0, m1
+ punpcklwd m17, m2, m3
+ punpckhwd m18, m0, m1
+ punpckhwd m19, m2, m3
+ cmp eobd, 107
+ jge .full
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ call m(idct_8x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.full:
+ add cq, 64
+ call .pass1_main
+ punpcklwd m5, m0, m1
+ punpcklwd m6, m2, m3
+ punpckhwd m7, m0, m1
+ punpckhwd m8, m2, m3
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ punpckldq m4, m5, m6 ; 16 18
+ punpckhdq m5, m6 ; 20 22
+ punpckldq m6, m7, m8 ; 24 26
+ punpckhdq m7, m8 ; 28 30
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ vextracti32x8 ym18, m4, 1
+ vextracti32x8 ym19, m5, 1
+ vextracti32x8 ym20, m6, 1
+ vextracti32x8 ym21, m7, 1
+ call m(idct_8x16_internal_8bpc).main
+ REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .end
+.fast:
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*1]
+ mova ym5, [cq+128*5]
+ mova ym7, [cq+128*3]
+ mova ym3, [cq+128*7]
+ mova ym0, [cq+128*0]
+ mova ym4, [cq+128*2]
+ mova ym2, [cq+128*4]
+ mova ym6, [cq+128*6]
+ vpermt2q m1, m8, m5 ; 1 5
+ vpermt2q m3, m8, m7 ; 7 3
+ vpermt2q m0, m8, m4 ; 0 2
+ vpermt2q m2, m8, m6 ; 4 6
+ mova [cq+128*0], ym21
+ REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ packssdw m0, m2
+ packssdw m1, m3
+ vpermb m0, m20, m0
+ vprold m20, 16
+ vpermb m2, m20, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ call m(idct_8x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
+.end:
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
+ lea r3, [strideq*2]
+ vpbroadcastd m12, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m11, m11
+ lea r3, [dstq+r3*8]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ call .write_8x4x2
+ pmulhrsw m0, m10, m2
+ pmulhrsw m1, m10, m3
+ call .write_8x4x2
+ pmulhrsw m0, m10, m4
+ pmulhrsw m1, m10, m5
+ call .write_8x4x2
+ pmulhrsw m0, m10, m6
+ pmulhrsw m1, m10, m7
+.write_8x4x2:
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ vinserti32x4 m8, [dstq+strideq*2], 2
+ vinserti32x4 m8, [dstq+r6 ], 3
+ mova xm9, [r3 +r6 ]
+ vinserti32x4 ym9, [r3 +strideq*2], 1
+ vinserti32x4 m9, [r3 +strideq*1], 2
+ vinserti32x4 m9, [r3 +strideq*0], 3
+ paddw m8, m0
+ paddw m9, m1
+ pmaxsw m8, m11
+ pmaxsw m9, m11
+ pminsw m8, m12
+ pminsw m9, m12
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r6 ], m8, 3
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 [r3 +strideq*0], m9, 3
+ vextracti32x4 [r3 +strideq*1], m9, 2
+ vextracti32x4 [r3 +strideq*2], ym9, 1
+ mova [r3 +r6 ], xm9
+ lea r3, [r3+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+ALIGN function_align
+.pass1_main:
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_end2
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ REPX {vpermb x, m20, x}, m0, m1, m2, m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ pxor m10, m10
+ lea r5, [strideq*5]
+ vpbroadcastd m11, [pixel_10bpc_max]
+ sub eobd, 107
+ lea r6, [strideq+r4*2]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ lea r7, [dstq+strideq*8]
+ REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
+ REPX {paddsw x, m9}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ add cq, 64
+ mova xm4, [dstq+strideq*0]
+ mova xm5, [dstq+strideq*1]
+ mova xm6, [dstq+strideq*2]
+ mova xm7, [dstq+r4 *1]
+ punpckhwd m8, m0, m1
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ punpcklwd m0, m1
+ vinserti32x4 ym5, [dstq+r5 *1], 1
+ punpckhwd m1, m2, m3
+ vinserti32x4 ym6, [dstq+r4 *2], 1
+ punpcklwd m2, m3
+ vinserti32x4 ym7, [dstq+r6 *1], 1
+ punpckhwd m3, m0, m8
+ vinserti32x4 m4, [r7 +strideq*0], 2
+ punpcklwd m0, m8
+ vinserti32x4 m5, [r7 +strideq*1], 2
+ punpckhwd m8, m2, m1
+ vinserti32x4 m6, [r7 +strideq*2], 2
+ punpcklwd m2, m1
+ vinserti32x4 m7, [r7 +r4 *1], 2
+ punpckhqdq m1, m0, m2
+ vinserti32x4 m4, [r7 +strideq*4], 3
+ punpcklqdq m0, m2
+ vinserti32x4 m5, [r7 +r5 *1], 3
+ punpcklqdq m2, m3, m8
+ vinserti32x4 m6, [r7 +r4 *2], 3
+ punpckhqdq m3, m8
+ vinserti32x4 m7, [r7 +r6 *1], 3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ REPX {pmaxsw x, m10}, m0, m1, m2, m3
+ REPX {pminsw x, m11}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+r4 *1], xm3
+ vextracti32x4 [dstq+strideq*4], ym0, 1
+ vextracti32x4 [dstq+r5 *1], ym1, 1
+ vextracti32x4 [dstq+r4 *2], ym2, 1
+ vextracti32x4 [dstq+r6 *1], ym3, 1
+ lea dstq, [r7+strideq*8]
+ vextracti32x4 [r7 +strideq*0], m0, 2
+ vextracti32x4 [r7 +strideq*1], m1, 2
+ vextracti32x4 [r7 +strideq*2], m2, 2
+ vextracti32x4 [r7 +r4 *1], m3, 2
+ vextracti32x4 [r7 +strideq*4], m0, 3
+ vextracti32x4 [r7 +r5 *1], m1, 3
+ vextracti32x4 [r7 +r4 *2], m2, 3
+ vextracti32x4 [r7 +r6 *1], m3, 3
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m11, [o(permB)]
+ mova m0, [cq+64* 0] ; 0 1
+ mova m4, [cq+64* 1] ; 2 3
+ mova m1, [cq+64* 2] ; 4 5
+ mova m8, [cq+64* 3] ; 6 7
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m10, m11, 32
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m16, m11
+ vpermi2q m16, m0, m1 ; 1 5
+ mova m17, m11
+ vpermi2q m17, m8, m4 ; 7 3
+ cmp eobd, 43
+ jl .fast
+ mova m18, [cq+64* 4] ; 8 9
+ mova m20, [cq+64* 5] ; 10 11
+ mova m6, [cq+64* 6] ; 12 13
+ mova m7, [cq+64* 7] ; 14 15
+ vpermt2q m0, m10, m18 ; 0 8
+ vpermt2q m18, m11, m6 ; 9 13
+ mova m19, m11
+ vpermi2q m19, m7, m20 ; 15 11
+ cmp eobd, 107
+ jge .full
+ vpermt2q m1, m10, m6 ; 4 12
+ vpermt2q m4, m10, m8 ; 2 6
+ vpermt2q m7, m10, m20 ; 14 10
+ mov r6d, 64*1
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ call .main_fast
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.full:
+ mova m2, [cq+64* 8] ; 16 17
+ mova m5, [cq+64* 9] ; 18 19
+ mova m9, [cq+64*10] ; 20 21
+ mova m21, [cq+64*11] ; 22 23
+ vpermt2q m1, m10, m9 ; 4 20
+ vpermt2q m7, m10, m21 ; 14 22
+ vpermt2q m21, m11, m5 ; 23 19
+ vpermt2q m5, m10, m20 ; 18 10
+ mova m20, m11
+ vpermi2q m20, m2, m9 ; 17 21
+ mova m22, [cq+64*12] ; 24 25
+ mova m9, [cq+64*13] ; 26 27
+ mova m3, [cq+64*14] ; 28 29
+ mova m23, [cq+64*15] ; 30 31
+ vpermt2q m2, m10, m22 ; 16 24
+ vpermt2q m22, m11, m3 ; 25 29
+ vpermt2q m3, m10, m6 ; 28 12
+ vpermt2q m4, m10, m9 ; 2 26
+ mova m6, m10
+ vpermi2q m6, m23, m8 ; 30 6
+ vpermt2q m23, m11, m9 ; 31 27
+ mov r6d, 64*3
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_16x8_internal_10bpc).main
+ call .main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.fast:
+ vpermq m0, m10, m0 ; 0 0
+ vpermq m1, m10, m1 ; 4 4
+ vpermt2q m4, m10, m8 ; 2 6
+ xor r6d, r6d
+ call .main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+.end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+ call .transpose_8x32
+ pxor m14, m14
+.zero_loop:
+ mova [cq+r6*4+64*3], m14
+ mova [cq+r6*4+64*2], m14
+ mova [cq+r6*4+64*1], m14
+ mova [cq+r6*4+64*0], m14
+ sub r6d, 64
+ jge .zero_loop
+ lea r5, [o_base_8bpc]
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pxor m12, m12
+.write_32x8_start:
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r3, [strideq*3]
+.write_32x8:
+ pmulhrsw m0, m11
+ pmulhrsw m1, m11
+ pmulhrsw m2, m11
+ pmulhrsw m3, m11
+ call .write_32x4
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+.write_32x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3 ]
+ REPX {pmaxsw x, m12}, m0, m1, m2, m3
+ REPX {pminsw x, m13}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 8
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m8, [o(pd_799_4017)]
+ pmulld m8, m1 ; t4 t7
+ vpmulld m0, [o(pd_2896)] {1to16} ; t0 t1
+ REPX {paddd x, m13}, m8, m0
+ REPX {psrad x, 12 }, m8, m0
+ pmulld m3, m8, m12
+ mova m2, m0 ; t3 t2
+ call m(idct_8x8_internal_10bpc).main3
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ pmulld m6, m4 ; t15 t12
+ pmulld m4, m3 ; t9 t10
+ REPX {paddd x, m13}, m6, m4
+ REPX {psrad x, 12 }, m6, m4
+ mova m5, m6 ; t14 t13
+ mova m9, m4 ; t8 t11
+ call m(idct_16x8_internal_10bpc).main3
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m9, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16 t20
+ pmulld m16, m7 ; t31 t27
+ pmulld m22, m17 ; -t19 -t25
+ pmulld m17, m9 ; t28 t24
+ REPX {paddd x, m13}, m23, m16, m17
+ psubd m22, m13, m22
+ REPX {psrad x, 12 }, m23, m16, m22, m17
+ mova m20, m23 ; t30 t26
+ mova m9, m16 ; t17 t21
+ mova m19, m22 ; t18 t22
+ mova m18, m17 ; t29 t25
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m20, [o(pd_2751_2106)]
+ vbroadcasti32x4 m9, [o(pd_3035_3513)]
+ vbroadcasti32x4 m21, [o(pd_3703_3290)]
+ vbroadcasti32x4 m10, [o(pd_1751_2440)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16a t20a
+ pmulld m16, m7 ; t31a t27a
+ pmulld m20, m19 ; -t17a -t21a
+ pmulld m19, m9 ; t30a t26a
+ pmulld m21, m18 ; t18a t22a
+ pmulld m18, m10 ; t29a t25a
+ pmulld m22, m17 ; -t19a -t25a
+ pmulld m17, m11 ; t28a t24a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601
+ paddd m20, m13
+ paddd m22, m13
+.main2:
+ REPX {paddd x, m13}, m16, m23, m19
+ REPX {psrad x, 12 }, m16, m20, m23, m19
+ psubd m9, m16, m20 ; t17 t21
+ paddd m16, m20 ; t16 t20
+ psubd m20, m23, m19 ; t30 t26
+ paddd m23, m19 ; t31 t27
+ REPX {pmaxsd x, m14}, m9, m16, m20, m23
+ REPX {paddd x, m13}, m21, m18, m17
+ REPX {psrad x, 12 }, m18, m22, m21, m17
+ psubd m19, m22, m18 ; t18 t22
+ paddd m22, m18 ; t19 t23
+ psubd m18, m17, m21 ; t29 t25
+ paddd m17, m21 ; t28 t24
+ REPX {pmaxsd x, m14}, m19, m22, m18, m17
+ REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
+.main3:
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ psubd m7, m0, m6 ; dct16 out15 out14
+ paddd m0, m6 ; dct16 out0 out1
+ psubd m6, m1, m5 ; dct16 out12 out13
+ paddd m1, m5 ; dct16 out3 out2
+ psubd m5, m2, m4 ; dct16 out11 out10
+ paddd m2, m4 ; dct16 out4 out5
+ psubd m4, m3, m8 ; dct16 out8 out9
+ paddd m3, m8 ; dct16 out7 out6
+ ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11
+ ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
+ punpckhqdq m21, m16, m20 ; t20 t21a
+ punpcklqdq m16, m20 ; t16 t17a
+ punpcklqdq m20, m22, m19 ; t19 t18a
+ punpckhqdq m22, m19 ; t23 t22a
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpcklqdq m19, m23, m9 ; t31 t30a
+ punpckhqdq m23, m9 ; t27 t26a
+ punpckhqdq m9, m17, m18 ; t24 t25a
+ punpcklqdq m17, m18 ; t28 t29a
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ psubd m18, m16, m20 ; t19a t18
+ paddd m20, m16 ; t16a t17
+ psubd m16, m19, m17 ; t28a t29
+ paddd m19, m17 ; t31a t30
+ psubd m17, m22, m21 ; t20a t21
+ paddd m22, m21 ; t23a t22
+ psubd m21, m9, m23 ; t27a t26
+ paddd m23, m9 ; t24a t25
+ REPX {pmaxsd x, m14}, m18, m16, m17, m21
+ REPX {pminsd x, m15}, m16, m18, m21, m17
+ ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m20, m22, m19, m23
+ REPX {pminsd x, m15}, m20, m22, m19, m23
+ paddd m9, m20, m22 ; t16 t17a
+ psubd m20, m22 ; t23 t22a
+ paddd m22, m19, m23 ; t31 t30a
+ psubd m19, m23 ; t24 t25a
+ psubd m23, m16, m17 ; t20a t21
+ paddd m16, m17 ; t19a t18
+ psubd m17, m18, m21 ; t27a t26
+ paddd m21, m18 ; t28a t29
+ REPX {pmaxsd x, m14}, m20, m19, m23, m17
+ REPX {pminsd x, m15}, m19, m20, m17, m23
+ REPX {pmulld x, m12}, m19, m20, m17, m23
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ paddd m19, m13
+ paddd m17, m13
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ psubd m18, m19, m20 ; t23a t22
+ paddd m19, m20 ; t24a t25
+ paddd m20, m17, m23 ; t27 t26a
+ psubd m17, m23 ; t20 t21a
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.transpose_8x32:
+ mova m10, [o(idct32x8p)]
+ psrlw m8, m10, 8
+ mova m9, m8
+ vpermi2w m8, m1, m5
+ vpermt2w m1, m10, m5
+ vprold m5, m9, 16
+ vpermi2w m9, m3, m7
+ vpermt2w m3, m10, m7
+ vprold m10, 16
+ mova m7, m5
+ vpermi2w m5, m0, m4
+ vpermt2w m0, m10, m4
+ vpermi2w m7, m2, m6
+ vpermt2w m2, m10, m6
+ punpckhdq m6, m5, m8
+ punpckldq m5, m8
+ punpckhdq m8, m7, m9
+ punpckldq m7, m9
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpckhdq m3, m0, m1
+ punpckldq m0, m1
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
+ vpbroadcastd m5, [pw_4096]
+ lea r4, [strideq*3]
+ mova m6, [idtx32x8p]
+ lea r5, [strideq*5]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r6, [strideq+r4*2]
+ pxor m8, m8
+ sub eobd, 107
+ psrlw m7, m6, 8
+.loop:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1] ; 02 13
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3] ; 46 57
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5] ; 8a 9b
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7] ; ce df
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
+ mova m4, m6
+ vpermi2w m4, m1, m3
+ vpermt2w m1, m7, m3
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ mova m3, m7
+ vpermi2w m3, m0, m2
+ vpermt2w m0, m6, m2
+ add cq, 64*8
+ punpcklqdq m2, m3, m1 ; 4 5
+ punpckhqdq m3, m1 ; 6 7
+ punpckhqdq m1, m0, m4 ; 2 3
+ punpcklqdq m0, m4 ; 0 1
+ mova ym4, [dstq+strideq*0]
+ vinserti32x8 m4, [dstq+strideq*1], 1
+ paddw m0, m4
+ mova ym4, [dstq+strideq*2]
+ vinserti32x8 m4, [dstq+r4 *1], 1
+ paddw m1, m4
+ mova ym4, [dstq+strideq*4]
+ vinserti32x8 m4, [dstq+r5 *1], 1
+ paddw m2, m4
+ mova ym4, [dstq+r4 *2]
+ vinserti32x8 m4, [dstq+r6 *1], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+r4 *1], m1, 1
+ mova [dstq+strideq*4], ym2
+ vextracti32x8 [dstq+r5 *1], m2, 1
+ mova [dstq+r4 *2], ym3
+ vextracti32x8 [dstq+r6 *1], m3, 1
+ add dstq, 32
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ cmp eobd, 36
+ jl .fast
+ call .pass1
+ cmp eobd, 151
+ jge .full
+ lea r5, [o_base_8bpc]
+ pxor m9, m9
+ punpcklwd m8, m1, m1 ; 2
+ punpckhwd m14, m1, m1 ; 3
+ punpcklwd m1, m3, m3 ; 6
+ punpckhwd m15, m3, m3 ; 7
+ punpcklwd m3, m6, m6 ; 12
+ punpckhwd m19, m6, m6 ; 13
+ punpcklwd m6, m9, m4 ; __ 8
+ punpckhwd m20, m4, m4 ; 9
+ punpckhwd m16, m5, m5 ; 11
+ punpcklwd m5, m5 ; 10
+ punpcklwd m9, m0 ; __ 0
+ punpckhwd m21, m0, m0 ; 1
+ punpcklwd m0, m7, m7 ; 14
+ punpckhwd m17, m7, m7 ; 15
+ punpcklwd m7, m2, m2 ; 4
+ punpckhwd m18, m2, m2 ; 5
+ call m(idct_16x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mov r6d, 64*3
+ pxor m8, m8
+.zero_loop:
+ REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
+ sub r6d, 64
+ jge .zero_loop
+ jmp .pass2_end
+.full:
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 64
+ call .pass1
+ mova m9, [cq-64* 1] ; 0 1
+ mova m14, [cq+64* 1] ; 2 3
+ mova m18, [cq+64* 3] ; 4 5
+ mova m15, [cq+64* 5] ; 6 7
+ mova m20, [cq+64* 7] ; 8 9
+ mova m16, [cq+64* 9] ; 10 11
+ mova m22, [cq+64*11] ; 12 13
+ mova m19, [cq+64*13] ; 14 15
+ lea r5, [o_base_8bpc]
+ punpcklwd m8, m7, m14 ; 30 2
+ punpckhwd m21, m7, m9 ; 31 1
+ punpcklwd m7, m6, m18 ; 28 4
+ punpckhwd m14, m6 ; 3 29
+ punpcklwd m9, m0, m9 ; 16 0
+ punpckhwd m17, m19, m0 ; 15 17
+ punpcklwd m0, m19, m1 ; 14 18
+ punpckhwd m19, m1, m22 ; 19 13
+ punpcklwd m1, m15, m5 ; 6 26
+ punpckhwd m18, m5, m18 ; 27 5
+ punpcklwd m6, m4, m20 ; 24 8
+ punpckhwd m15, m4 ; 7 25
+ punpcklwd m5, m3, m16 ; 22 10
+ punpckhwd m20, m3, m20 ; 23 9
+ punpcklwd m3, m22, m2 ; 12 20
+ punpckhwd m16, m2 ; 11 21
+ call m(idct_16x16_internal_8bpc).main2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ mov r6d, 32*7
+ pxor m8, m8
+.full_zero_loop:
+ REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
+ sub r6d, 32
+ jge .full_zero_loop
+ jmp .pass2_end
+.fast:
+ mova ym0, [cq+128*0]
+ mova ym2, [cq+128*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*2]
+ mova ym3, [cq+128*6]
+ mova ym4, [cq+128*1]
+ mova ym5, [cq+128*3]
+ mova ym6, [cq+128*5]
+ mova ym7, [cq+128*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ REPX {pmulld x, m12}, m0, m1, m4, m7
+ pxor ym16, ym16
+ mova [cq+128*0], ym16
+ REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
+ REPX {paddd x, m13}, m0, m1, m4, m7
+ REPX {psrad x, 12 }, m0, m1, m4, m7
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(idct8x32p)]
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m6, [dup16_perm]
+ vpermb m0, m8, m0
+ vpermb m2, m8, m2
+ vprold m8, 16
+ vpermb m1, m8, m1
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m21, m4, m2
+ punpckhdq m14, m4, m2
+ punpckldq m18, m0, m1
+ punpckhdq m15, m0, m1
+ vpermb m8, m6, m14 ; 2
+ vpermb m1, m6, m15 ; 6
+ vpermb m7, m6, m18 ; 4
+ pmovzxwd m9, ym21 ; 0
+ vpord m6, [o(pb_32)] {1to16}
+ lea r5, [o_base_8bpc]
+ vpermb m21, m6, m21 ; 1
+ vpermb m15, m6, m15 ; 7
+ vpermb m18, m6, m18 ; 5
+ vpermb m14, m6, m14 ; 3
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+.pass2_end:
+ movshdup m22, [permC]
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m12, m12
+ psrlq m23, m22, 8
+ vpermq m8, m22, m0
+ vpermq m9, m23, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m2
+ vpermq m9, m23, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m4
+ vpermq m9, m23, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m6
+ vpermq m9, m23, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m14
+ vpermq m9, m23, m15
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m16
+ vpermq m9, m23, m17
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m18
+ vpermq m9, m23, m19
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m22, m20
+ vpermq m9, m23, m21
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ vzeroupper
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+.pass1:
+ pmulld m0, m12, [cq+128* 0]
+ pmulld m1, m12, [cq+128* 2]
+ pmulld m2, m12, [cq+128* 4]
+ pmulld m3, m12, [cq+128* 6]
+ pmulld m4, m12, [cq+128* 8]
+ pmulld m5, m12, [cq+128*10]
+ pmulld m6, m12, [cq+128*12]
+ pmulld m7, m12, [cq+128*14]
+ call m(idct_8x16_internal_10bpc).main_rect2
+ pmulld m16, m12, [cq+128* 1]
+ pmulld m17, m12, [cq+128* 3]
+ pmulld m18, m12, [cq+128* 5]
+ pmulld m19, m12, [cq+128* 7]
+ pmulld m20, m12, [cq+128* 9]
+ pmulld m21, m12, [cq+128*11]
+ pmulld m22, m12, [cq+128*13]
+ pmulld m23, m12, [cq+128*15]
+ call m(idct_16x16_internal_10bpc).main_rect2
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_16x16_internal_10bpc).main_end2
+ jmp m(idct_16x16_internal_10bpc).main_end3
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
+
+cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m10, [pw_2896x8]
+ vpbroadcastd m11, [pw_1697x16]
+ vpbroadcastd m13, [pw_8192]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ lea r6, [strideq*9]
+ pxor m14, m14
+ paddw m12, m13, m13 ; pw_16384
+ cmp eobd, 151
+ jl .main
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+.main:
+ call .main_internal
+ add cq, 128*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+.main2:
+ pmulhrsw m2, m13
+ pmulhrsw m4, m13
+ pmulhrsw m6, m13
+ pmulhrsw m8, m13
+ punpcklqdq m0, m1, m2 ; 0 8
+ punpckhqdq m1, m2 ; 1 9
+ call .write_16x2x2
+ punpcklqdq m0, m3, m4 ; 2 10
+ punpckhqdq m1, m3, m4 ; 3 11
+ call .write_16x2x2
+ punpcklqdq m0, m5, m6 ; 4 12
+ punpckhqdq m1, m5, m6 ; 5 13
+ call .write_16x2x2
+ punpcklqdq m0, m7, m8 ; 6 14
+ punpckhqdq m1, m7, m8 ; 7 15
+.write_16x2x2:
+ mova ym2, [dstq+strideq*0]
+ vinserti32x8 m2, [dstq+strideq*8], 1
+ mova ym9, [dstq+strideq*1]
+ vinserti32x8 m9, [dstq+r6 ], 1
+ paddw m0, m2
+ paddw m1, m9
+ pmaxsw m0, m14
+ pmaxsw m1, m14
+ pminsw m0, m15
+ pminsw m1, m15
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*8], m0, 1
+ mova [dstq+strideq*1], ym1
+ vextracti32x8 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+strideq*2]
+ ret
+.main_internal:
+ mova m8, [cq+128* 0]
+ packssdw m8, [cq+128* 8]
+ mova m6, [cq+128* 1]
+ packssdw m6, [cq+128* 9]
+ mova m0, [cq+128* 2]
+ packssdw m0, [cq+128*10]
+ mova m2, [cq+128* 3]
+ packssdw m2, [cq+128*11]
+ REPX {pmulhrsw x, m10}, m8, m6, m0, m2
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ pmulhrsw m4, m11, m8
+ pmulhrsw m9, m11, m6
+ REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
+ pmulhrsw m4, m12
+ pmulhrsw m9, m12
+ paddsw m8, m4
+ paddsw m6, m9
+ pmulhrsw m4, m11, m0
+ pmulhrsw m9, m11, m2
+ REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
+ pmulhrsw m4, m12
+ pmulhrsw m9, m12
+ paddsw m0, m4
+ paddsw m2, m9
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ mov r6d, 8*12
+ cmp eobd, 36
+ jl .fast
+ pmulld m0, m12, [cq+64* 0]
+ pmulld m1, m12, [cq+64* 4]
+ pmulld m2, m12, [cq+64* 8]
+ pmulld m3, m12, [cq+64*12]
+ pmulld m16, m12, [cq+64* 2]
+ pmulld m17, m12, [cq+64* 6]
+ pmulld m18, m12, [cq+64*10]
+ pmulld m19, m12, [cq+64*14]
+ cmp eobd, 151
+ jge .full
+ call m(idct_8x16_internal_10bpc).main_fast_rect2
+ call m(idct_16x16_internal_10bpc).main_fast_rect2
+ call .idct16_sumsub
+ call .pass1_load_spill
+ call .main_fast_rect2
+ jmp .pass1_end
+.full:
+ pmulld m4, m12, [cq+64*16]
+ pmulld m5, m12, [cq+64*20]
+ pmulld m6, m12, [cq+64*24]
+ pmulld m7, m12, [cq+64*28]
+ pmulld m20, m12, [cq+64*18]
+ pmulld m21, m12, [cq+64*22]
+ pmulld m22, m12, [cq+64*26]
+ pmulld m23, m12, [cq+64*30]
+ add r6d, 8*16
+ call m(idct_8x16_internal_10bpc).main_rect2
+ call m(idct_16x16_internal_10bpc).main_rect2
+ call .idct16_sumsub
+ call .pass1_load_spill
+ pmulld m16, m12, [cq+64*17]
+ pmulld m17, m12, [cq+64*19]
+ pmulld m18, m12, [cq+64*21]
+ pmulld m19, m12, [cq+64*23]
+ pmulld m20, m12, [cq+64*25]
+ pmulld m21, m12, [cq+64*27]
+ pmulld m22, m12, [cq+64*29]
+ pmulld m23, m12, [cq+64*31]
+ call .main_rect2
+.pass1_end:
+ vpbroadcastd m11, [o(pd_1)]
+ lea r4, [cq+64]
+ call .idct32_pass1_end
+ lea r5, [o_base_8bpc]
+ punpckhqdq m19, m5, m16 ; 11
+ punpcklqdq m5, m16 ; 10
+ punpckhqdq m16, m2, m1 ; 5
+ punpcklqdq m2, m1 ; 4
+ punpcklqdq m1, m15, m4 ; 2
+ punpckhqdq m15, m4 ; 3
+ punpcklqdq m4, m14, m18 ; 8
+ punpckhqdq m18, m14, m18 ; 9
+ punpckhqdq m14, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m20, m6, m17 ; 13
+ punpcklqdq m6, m17 ; 12
+ punpckhqdq m17, m3, m21 ; 7
+ punpcklqdq m3, m21 ; 6
+ punpckhqdq m21, m7, m8 ; 15
+ punpcklqdq m7, m8 ; 14
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ jmp .end
+.fast:
+ pmulld ym0, ym12, [cq+64*0]
+ pmulld ym1, ym12, [cq+64*4]
+ movshdup m7, [o(permB)]
+ mova ym4, [cq+64*2]
+ mova ym5, [cq+64*6]
+ mova ym16, [cq+64*1]
+ mova ym2, [cq+64*5]
+ mova ym3, [cq+64*3]
+ mova ym17, [cq+64*7]
+ vpermt2q m4, m7, m5 ; 2 6
+ vpermt2q m16, m7, m2 ; 1 5
+ vpermt2q m17, m7, m3 ; 7 3
+ paddd ym0, ym13
+ paddd ym1, ym13
+ psrad ym0, 12
+ psrad ym1, 12
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m1, m7, m1 ; 4 4
+ REPX {pmulld x, m12}, m4, m16, m17
+ REPX {paddd x, m13}, m4, m16, m17
+ REPX {psrad x, 12 }, m4, m16, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
+ vpbroadcastd m11, [o(pd_1)]
+ call m(idct_16x16_internal_10bpc).main_end2
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ lea r5, [o_base_8bpc]
+ punpckhqdq m14, m0, m2 ; 1
+ punpcklqdq m0, m2 ; 0
+ punpcklqdq m1, m3, m4 ; 2
+ punpckhqdq m15, m3, m4 ; 3
+ punpcklqdq m2, m5, m7 ; 4
+ punpckhqdq m16, m5, m7 ; 5
+ punpcklqdq m3, m6, m8 ; 6
+ punpckhqdq m17, m6, m8 ; 7
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+.end:
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ pxor m12, m12
+.zero_loop:
+ mova [cq+r6*8+64*3], m12
+ mova [cq+r6*8+64*2], m12
+ mova [cq+r6*8+64*1], m12
+ mova [cq+r6*8+64*0], m12
+ sub r6d, 8*4
+ jge .zero_loop
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
+ pmulhrsw m0, m11, m14
+ pmulhrsw m1, m11, m15
+ pmulhrsw m2, m11, m16
+ pmulhrsw m3, m11, m17
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m18
+ pmulhrsw m1, m11, m19
+ pmulhrsw m2, m11, m20
+ pmulhrsw m3, m11, m21
+ vzeroupper
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd m3, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m2, r6d
+ paddsw m2, m3
+.dconly_loop:
+ paddsw m0, m2, [dstq+strideq*0]
+ paddsw m1, m2, [dstq+strideq*1]
+ psubusw m0, m3
+ psubusw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+.idct16_sumsub:
+ psubd m23, m0, m22 ; t15
+ paddd m0, m22 ; t0
+ psubd m22, m1, m21 ; t14
+ paddd m1, m21 ; t1
+ REPX {pmaxsd x, m14}, m23, m0, m22, m1
+ psubd m21, m2, m20 ; t13
+ paddd m2, m20 ; t2
+ REPX {pminsd x, m15}, m23, m0, m22, m1
+ psubd m20, m3, m19 ; t12
+ paddd m3, m19 ; t3
+ REPX {pmaxsd x, m14}, m21, m2, m20, m3
+ psubd m19, m4, m18 ; t11
+ paddd m4, m18 ; t4
+ REPX {pminsd x, m15}, m21, m2, m20, m3
+ psubd m18, m5, m17 ; t10
+ paddd m5, m17 ; t5
+ REPX {pmaxsd x, m14}, m19, m4, m18, m5
+ psubd m17, m6, m16 ; t9
+ paddd m6, m16 ; t6
+ REPX {pminsd x, m15}, m19, m4, m18, m5
+ psubd m16, m7, m9 ; t8
+ paddd m7, m9 ; t7
+ REPX {pmaxsd x, m14}, m17, m6, m16, m7
+ REPX {pminsd x, m15}, m17, m6, m16, m7
+ ret
+.idct32_pass1_end:
+ psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11
+ psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
+%macro IDCT32_PASS1_END 2 ; low, high
+ paddd m8, m11, [r4+128*%1]
+ paddd m9, m11, [cq+128*%1]
+ psubd m10, m8, m%1 ; out 16+n
+ paddd m8, m%1 ; out 15-n
+ paddd m%1, m9, m%2 ; out 0+n
+ psubd m9, m%2 ; out 31-n
+ REPX {vpsravd x, m11}, m10, m%1, m8, m9
+ packssdw m%1, m10 ; 0+n 16+n
+ packssdw m%2, m8, m9 ; 15-n 31-n
+%endmacro
+ IDCT32_PASS1_END 0, 23 ; 0 16, 15 31
+ IDCT32_PASS1_END 7, 16 ; 7 23, 8 24
+ mova m14, m13
+ vpermi2q m14, m0, m16
+ vpermt2q m0, m12, m16
+ IDCT32_PASS1_END 1, 22 ; 1 17, 14 30
+ IDCT32_PASS1_END 6, 17 ; 6 22, 9 25
+ mova m15, m13
+ vpermi2q m15, m1, m17
+ vpermt2q m1, m12, m17
+ IDCT32_PASS1_END 2, 21 ; 2 18, 13 29
+ IDCT32_PASS1_END 5, 18 ; 5 21, 10 26
+ mova m16, m13
+ vpermi2q m16, m2, m18
+ vpermt2q m2, m12, m18
+ IDCT32_PASS1_END 3, 20 ; 3 19, 12 28
+ IDCT32_PASS1_END 4, 19 ; 4 20, 11 27
+ mova m17, m13
+ vpermi2q m17, m3, m19
+ vpermt2q m3, m12, m19
+ mova m18, m13
+ vpermi2q m18, m4, m20
+ vpermt2q m4, m12, m20
+ mova m19, m13
+ vpermi2q m19, m5, m21
+ vpermt2q m5, m12, m21
+ mova m20, m13
+ vpermi2q m20, m6, m22
+ vpermt2q m6, m12, m22
+ mova m21, m13
+ vpermi2q m21, m7, m23
+ vpermt2q m7, m12, m23
+ punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07
+ punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03
+ punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07
+ punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03
+ punpckhwd m1, m4, m5 ; e04 f04 e05 f05 e06 f06 e07 f07
+ punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03
+ punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07
+ punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03
+ punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15
+ punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11
+ punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15
+ punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11
+ punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15
+ punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11
+ punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15
+ punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11
+ punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07
+ punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05
+ punpckhdq m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11
+ punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09
+ punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11
+ punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09
+ punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01
+ punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03
+ punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13
+ punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15
+ punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03
+ punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01
+ punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05
+ punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07
+ punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15
+ punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13
+ ret
+.pass1_load_spill:
+ mova [cq+64* 0], m0
+ mova [cq+64* 2], m1
+ mova [cq+64* 4], m2
+ mova [cq+64* 6], m3
+ mova [cq+64* 8], m4
+ mova [cq+64*10], m5
+ mova [cq+64*12], m6
+ mova [cq+64*14], m7
+ pmulld m0, m12, [cq+64* 1]
+ pmulld m1, m12, [cq+64* 3]
+ pmulld m2, m12, [cq+64* 5]
+ pmulld m3, m12, [cq+64* 7]
+ pmulld m4, m12, [cq+64* 9]
+ pmulld m5, m12, [cq+64*11]
+ pmulld m6, m12, [cq+64*13]
+ pmulld m7, m12, [cq+64*15]
+ mova [cq+64* 1], m23
+ mova [cq+64* 3], m22
+ mova [cq+64* 5], m21
+ mova [cq+64* 7], m20
+ mova [cq+64* 9], m19
+ mova [cq+64*11], m18
+ mova [cq+64*13], m17
+ mova [cq+64*15], m16
+ ret
+.main_fast_rect2:
+ call m(idct_8x16_internal_10bpc).round
+.main_fast: ; bottom half is zero
+ pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a
+ pmulld m0, [o(pd_201)] {1to16} ; t16a
+ pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a
+ pmulld m7, [o(pd_3035)] {1to16} ; t30a
+ pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a
+ pmulld m4, [o(pd_1751)] {1to16} ; t18a
+ pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a
+ pmulld m3, [o(pd_3857)] {1to16} ; t28a
+ pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a
+ pmulld m2, [o(pd_995)] {1to16} ; t20a
+ pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a
+ pmulld m5, [o(pd_3513)] {1to16} ; t26a
+ pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a
+ pmulld m6, [o(pd_2440)] {1to16} ; t22a
+ pmulld m22, m1, [o(pd_601)] {1to16} ; t23a
+ pmulld m1, [o(pd_4052)] {1to16} ; t24a
+ REPX {psubd x, m13, x}, m16, m20, m18, m22
+ call m(idct_16x16_internal_10bpc).round3
+ jmp .main2
+.main_rect2:
+ call m(idct_8x16_internal_10bpc).round
+ call m(idct_16x16_internal_10bpc).round
+.main:
+ ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+ ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ call m(idct_16x16_internal_10bpc).round
+.main2:
+ call m(idct_8x16_internal_10bpc).round
+ psubd m8, m0, m16 ; t17
+ paddd m0, m16 ; t16
+ psubd m16, m23, m7 ; t30
+ paddd m23, m7 ; t31
+ REPX {pmaxsd x, m14}, m8, m0, m16, m23
+ paddd m7, m20, m4 ; t19
+ psubd m20, m4 ; t18
+ REPX {pminsd x, m15}, m8, m0, m16, m23
+ paddd m4, m3, m19 ; t28
+ psubd m3, m19 ; t29
+ REPX {pmaxsd x, m14}, m7, m20, m4, m3
+ psubd m19, m2, m18 ; t21
+ paddd m2, m18 ; t20
+ REPX {pminsd x, m15}, m7, m20, m4, m3
+ psubd m18, m21, m5 ; t26
+ paddd m21, m5 ; t27
+ REPX {pmaxsd x, m14}, m19, m2, m18, m21
+ psubd m5, m22, m6 ; t22
+ paddd m6, m22 ; t23
+ REPX {pminsd x, m15}, m19, m2, m18, m21
+ psubd m22, m1, m17 ; t25
+ paddd m17, m1 ; t24
+ REPX {pmaxsd x, m14}, m5, m6, m22, m17
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ REPX {pminsd x, m15}, m5, m6, m22, m17
+ ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a
+ ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a
+ ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a
+ paddd m1, m6, m2 ; t23a
+ psubd m6, m2 ; t20a
+ psubd m2, m17, m21 ; t27a
+ paddd m17, m21 ; t24a
+ REPX {pmaxsd x, m14}, m1, m6, m2, m17
+ psubd m21, m23, m4 ; t28a
+ paddd m23, m4 ; t31a
+ REPX {pminsd x, m15}, m1, m6, m2, m17
+ psubd m4, m16, m20 ; t18
+ paddd m16, m20 ; t17
+ REPX {pmaxsd x, m14}, m21, m23, m4, m16
+ psubd m20, m0, m7 ; t19a
+ paddd m0, m7 ; t16a
+ REPX {pminsd x, m15}, m21, m23, m4, m16
+ psubd m7, m8, m3 ; t29
+ paddd m3, m8 ; t30
+ REPX {pmaxsd x, m14}, m20, m0, m7, m3
+ paddd m8, m5, m18 ; t22
+ psubd m5, m18 ; t21
+ REPX {pminsd x, m15}, m20, m0, m7, m3
+ psubd m18, m22, m19 ; t26
+ paddd m22, m19 ; t25
+ REPX {pmaxsd x, m14}, m8, m5, m18, m22
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m8, m5, m18, m22
+ ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28
+ ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20
+ ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a
+ ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
+ psubd m19, m0, m1 ; t23
+ paddd m0, m1 ; t16
+ paddd m1, m8, m16 ; t17a
+ psubd m8, m16, m8 ; t22a
+ REPX {pmaxsd x, m14}, m19, m0, m1, m8
+ psubd m16, m23, m17 ; t24
+ paddd m23, m17 ; t31
+ REPX {pminsd x, m15}, m19, m0, m1, m8
+ psubd m17, m3, m22 ; t25a
+ paddd m22, m3 ; t30a
+ REPX {pmaxsd x, m14}, m16, m23, m17, m22
+ paddd m3, m6, m21 ; t19a
+ psubd m6, m21, m6 ; t20a
+ REPX {pminsd x, m15}, m16, m23, m17, m22
+ paddd m21, m18, m4 ; t29
+ psubd m18, m4, m18 ; t26
+ REPX {pmaxsd x, m14}, m3, m6, m21, m18
+ psubd m4, m20, m2 ; t27a
+ paddd m20, m2 ; t28a
+ REPX {pminsd x, m15}, m3, m6, m21, m18
+ paddd m2, m7, m5 ; t18
+ psubd m7, m5 ; t21
+ REPX {pmaxsd x, m14}, m4, m20, m2, m7
+ REPX {pminsd x, m15}, m4, m20, m2, m7
+ REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
+ REPX {paddd x, m13}, m18, m16, m4, m17
+ psubd m5, m18, m7 ; t21a
+ paddd m18, m7 ; t26a
+ psubd m7, m16, m19 ; t23a
+ paddd m16, m19 ; t24a
+ REPX {psrad x, 12 }, m5, m18, m7, m16
+ paddd m19, m4, m6 ; t27
+ psubd m4, m6 ; t20
+ psubd m6, m17, m8 ; t22
+ paddd m17, m8 ; t25
+ REPX {psrad x, 12 }, m19, m4, m6, m17
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m10, [pw_2896x8]
+ vpbroadcastd m11, [pw_1697x16]
+ vpbroadcastd m13, [pw_2048]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ lea r6, [strideq*9]
+ pxor m14, m14
+ cmp eobd, 151
+ jl .main
+ mov r4, dstq
+ call .main
+ add cq, 64*12
+ lea dstq, [r4+32]
+.main:
+ call .main_internal
+ add cq, 64*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+.main_internal:
+ mova m8, [cq+64* 0]
+ packssdw m8, [cq+64* 8]
+ mova m6, [cq+64* 1]
+ packssdw m6, [cq+64* 9]
+ mova m0, [cq+64* 2]
+ packssdw m0, [cq+64*10]
+ mova m2, [cq+64* 3]
+ packssdw m2, [cq+64*11]
+ REPX {pmulhrsw x, m10}, m8, m6, m0, m2
+ REPX {paddsw x, x }, m8, m6, m0, m2
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ pmulhrsw m4, m11, m8
+ pmulhrsw m9, m11, m6
+ paddsw m8, m8
+ paddsw m6, m6
+ REPX {mova [cq+64*x], m14}, 0, 1, 2, 3
+ paddsw m8, m4
+ paddsw m6, m9
+ pmulhrsw m4, m11, m0
+ pmulhrsw m9, m11, m2
+ paddsw m0, m0
+ paddsw m2, m2
+ REPX {mova [cq+64*x], m14}, 8, 9, 10, 11
+ paddsw m0, m4
+ paddsw m2, m9
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ WIN64_SPILL_XMM 30
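+ ; Editorial note: the eob checks below pick a pass-1 variant based on how
+ ; much of the 32x32 coefficient block is populated: eob < 36 takes .fast2
+ ; (only the top-left 8x8 is nonzero), eob < 136 takes .fast (top-left 16x16
+ ; nonzero), eob < 543 takes .pass1_fast (bottom-right 16x16 zero), and
+ ; larger eobs take the .full path.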
+ cmp eobd, 136
+ jl .fast
+ add cq, 64
+ cmp eobd, 543
+ jge .full
+ call .pass1_fast ; bottomright 16x16 zero
+ mov r6d, 16*12
+ jmp .lefthalf
+.full:
+ call .pass1
+ mov r6d, 16*28
+.lefthalf:
+ mova [cq+128* 0], m0
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m14
+ mova [cq+128* 5], m15
+ mova [cq+128* 6], m16
+ mova [cq+128* 7], m17
+ mova [cq+128* 8], m22
+ mova [cq+128* 9], m23
+ mova [cq+128*10], m24
+ mova [cq+128*11], m25
+ mova [cq+128*12], m26
+ mova [cq+128*13], m27
+ mova [cq+128*14], m28
+ mova [cq+128*15], m29
+ sub cq, 64
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call .pass1
+ lea r5, [o_base_8bpc]
+ mova m4, [cq+64+128* 0]
+ mova m5, [cq+64+128* 1]
+ mova m6, [cq+64+128* 2]
+ mova m7, [cq+64+128* 3]
+ mova m18, [cq+64+128* 4]
+ mova m19, [cq+64+128* 5]
+ mova m20, [cq+64+128* 6]
+ mova m21, [cq+64+128* 7]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ mova m14, [cq+64+128* 8]
+ mova m15, [cq+64+128* 9]
+ mova m16, [cq+64+128*10]
+ mova m17, [cq+64+128*11]
+ mova m18, [cq+64+128*12]
+ mova m19, [cq+64+128*13]
+ mova m20, [cq+64+128*14]
+ mova m21, [cq+64+128*15]
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ pxor m12, m12
+.right_zero_loop:
+ mova [cq+r6*8+64+128*3], m12
+ mova [cq+r6*8+64+128*2], m12
+ mova [cq+r6*8+64+128*1], m12
+ mova [cq+r6*8+64+128*0], m12
+ sub r6d, 16*4
+ jge .right_zero_loop
+ mov r6d, 16*28
+ jmp .end2
+.fast: ; topleft 16x16 nonzero
+ cmp eobd, 36
+ jl .fast2
+ call .pass1_fast
+ lea r5, [o_base_8bpc]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ jmp .end
+.fast2: ; topleft 8x8 nonzero
+ movshdup m7, [o(permB)]
+ mova ym0, [cq+128*0]
+ mova ym1, [cq+128*4]
+ mova ym4, [cq+128*2]
+ mova ym5, [cq+128*6]
+ mova ym16, [cq+128*1]
+ mova ym2, [cq+128*5]
+ mova ym3, [cq+128*3]
+ mova ym17, [cq+128*7]
+ mov r6d, 16*4
+ vpermq m0, m7, m0 ; 0 0
+ vpermq m1, m7, m1 ; 4 4
+ vpermt2q m4, m7, m5 ; 2 6
+ vpermt2q m16, m7, m2 ; 1 5
+ vpermt2q m17, m7, m3 ; 7 3
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
+ lea r5, [o_base_8bpc]
+ punpckhqdq m22, m0, m2 ; 1
+ punpcklqdq m0, m2 ; 0
+ punpcklqdq m1, m5, m7 ; 4
+ punpckhqdq m24, m5, m7 ; 5
+ punpcklqdq m14, m3, m4 ; 2
+ punpckhqdq m23, m3, m4 ; 3
+ punpcklqdq m15, m6, m8 ; 6
+ punpckhqdq m25, m6, m8 ; 7
+ mova m10, m13
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [cq+128*0], m14
+ mova [cq+128*1], m15
+ mova [cq+128*2], m16
+ mova [cq+128*3], m17
+ mova [cq+128*4], m18
+ mova [cq+128*5], m19
+ mova [cq+128*6], m20
+ mova [cq+128*7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.end:
+ pxor m12, m12
+.end2:
+ psubsw m9, m0, m29 ; out31
+ paddsw m0, m29 ; out0
+ psubsw m29, m1, m28 ; out30
+ paddsw m1, m28 ; out1
+ psubsw m28, m2, m27 ; out29
+ paddsw m2, m27 ; out2
+ psubsw m27, m3, m26 ; out28
+ paddsw m3, m26 ; out3
+ psubsw m26, m4, m25 ; out27
+ paddsw m4, m25 ; out4
+ psubsw m25, m5, m24 ; out26
+ paddsw m5, m24 ; out5
+ psubsw m24, m6, m23 ; out25
+ paddsw m6, m23 ; out6
+ psubsw m23, m7, m22 ; out24
+ paddsw m7, m22 ; out7
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ psubsw m22, m0, m21 ; out23
+ paddsw m0, m21 ; out8
+ psubsw m21, m1, m20 ; out22
+ paddsw m1, m20 ; out9
+ psubsw m20, m2, m19 ; out21
+ paddsw m2, m19 ; out10
+ psubsw m19, m3, m18 ; out20
+ paddsw m3, m18 ; out11
+ psubsw m18, m4, m17 ; out19
+ paddsw m4, m17 ; out12
+ psubsw m17, m5, m16 ; out18
+ paddsw m5, m16 ; out13
+ psubsw m16, m6, m15 ; out17
+ paddsw m6, m15 ; out14
+ psubsw m15, m7, m14 ; out16
+ paddsw m7, m14 ; out15
+.zero_loop:
+ mova [cq+r6*8+128*3], m12
+ mova [cq+r6*8+128*2], m12
+ mova [cq+r6*8+128*1], m12
+ mova [cq+r6*8+128*0], m12
+ sub r6d, 16*4
+ jge .zero_loop
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
+ pmulhrsw m0, m11, m15
+ pmulhrsw m1, m11, m16
+ pmulhrsw m2, m11, m17
+ pmulhrsw m3, m11, m18
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m19
+ pmulhrsw m1, m11, m20
+ pmulhrsw m2, m11, m21
+ pmulhrsw m3, m11, m22
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m23
+ pmulhrsw m1, m11, m24
+ pmulhrsw m2, m11, m25
+ pmulhrsw m3, m11, m26
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+ pmulhrsw m0, m11, m27
+ pmulhrsw m1, m11, m28
+ pmulhrsw m2, m11, m29
+ pmulhrsw m3, m11, m9
+ WIN64_RESTORE_XMM
+ vzeroupper
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
+.pass1_fast:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mov r6d, 16*12
+ call m(idct_8x16_internal_10bpc).main_fast
+ mova m16, [cq+128* 2]
+ mova m17, [cq+128* 6]
+ mova m18, [cq+128*10]
+ mova m19, [cq+128*14]
+ call m(idct_16x16_internal_10bpc).main_fast
+ call .pass1_load_spill
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
+ jmp .pass1_end
+.pass1:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mova m4, [cq+128*16]
+ mova m5, [cq+128*20]
+ mova m6, [cq+128*24]
+ mova m7, [cq+128*28]
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+128* 2]
+ mova m17, [cq+128* 6]
+ mova m18, [cq+128*10]
+ mova m19, [cq+128*14]
+ mova m20, [cq+128*18]
+ mova m21, [cq+128*22]
+ mova m22, [cq+128*26]
+ mova m23, [cq+128*30]
+ call m(idct_16x16_internal_10bpc).main
+ call .pass1_load_spill
+ mova m16, [cq+128*17]
+ mova m17, [cq+128*19]
+ mova m18, [cq+128*21]
+ mova m19, [cq+128*23]
+ mova m20, [cq+128*25]
+ mova m21, [cq+128*27]
+ mova m22, [cq+128*29]
+ mova m23, [cq+128*31]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).main
+.pass1_end:
+ vpbroadcastd m11, [o(pd_2)]
+ lea r4, [cq+128*8]
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
+ punpckhqdq m22, m0, m20 ; 1
+ punpcklqdq m0, m20 ; 0
+ punpckhqdq m24, m2, m1 ; 5
+ punpcklqdq m1, m2, m1 ; 4
+ punpcklqdq m2, m14, m18 ; 8
+ punpckhqdq m26, m14, m18 ; 9
+ punpcklqdq m14, m15, m4 ; 2
+ punpckhqdq m23, m15, m4 ; 3
+ punpckhqdq m25, m3, m21 ; 7
+ punpcklqdq m15, m3, m21 ; 6
+ punpckhqdq m28, m6, m17 ; 13
+ punpcklqdq m3, m6, m17 ; 12
+ punpckhqdq m27, m5, m16 ; 11
+ punpcklqdq m16, m5, m16 ; 10
+ punpckhqdq m29, m7, m8 ; 15
+ punpcklqdq m17, m7, m8 ; 14
+ ret
+.pass1_load_spill:
+ call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
+ mova [cq+128* 0], m0
+ mova m0, [cq+128* 1]
+ mova [cq+128* 1], m1
+ mova [cq+128* 2], m2
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova [cq+128* 3], m3
+ mova [cq+128* 4], m4
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova [cq+128* 5], m5
+ mova [cq+128* 6], m6
+ mova [cq+128* 7], m7
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ mova [cq+128* 8], m23
+ mova [cq+128* 9], m22
+ mova [cq+128*10], m21
+ mova [cq+128*11], m20
+ mova [cq+128*12], m19
+ mova [cq+128*13], m18
+ mova [cq+128*14], m17
+ mova [cq+128*15], m16
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m13, [pw_8192]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ lea r6, [strideq*9]
+ cmp eobd, 136
+ jl .main
+ mov r4, dstq
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+ call .main
+ add cq, 128*12-64
+ lea dstq, [r4+32]
+ cmp eobd, 543
+ jl .main
+ call .main
+ add cq, 64-128*4
+ lea dstq, [dstq+strideq*8]
+.main:
+ call .main_internal
+ add cq, 128*4
+ pmulhrsw m1, m13, m2
+ pmulhrsw m3, m13, m4
+ pmulhrsw m5, m13, m6
+ pmulhrsw m7, m13, m8
+ call .main_internal
+ jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+.main_internal:
+ mova m8, [cq+128* 0]
+ packssdw m8, [cq+128* 8]
+ mova m6, [cq+128* 1]
+ packssdw m6, [cq+128* 9]
+ mova m0, [cq+128* 2]
+ packssdw m0, [cq+128*10]
+ mova m2, [cq+128* 3]
+ packssdw m2, [cq+128*11]
+ REPX {vpermq x, x, q3120}, m8, m6, m0, m2
+ REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
+ punpcklwd m4, m8, m6
+ punpckhwd m8, m6
+ punpcklwd m6, m0, m2
+ punpckhwd m0, m2
+ REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
+ punpckldq m2, m4, m6 ; 0 1
+ punpckhdq m4, m6 ; 2 3
+ punpckldq m6, m8, m0 ; 4 5
+ punpckhdq m8, m0 ; 6 7
+ ret
+
+cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+
+ PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ call .pass1
+ cmp eobd, 151
+ jge .full
+ lea r5, [o_base_8bpc]
+
+ punpckhwd m22, m0, m0
+ punpckhwd m23, m1, m1
+ punpckhwd m24, m2, m2
+ punpckhwd m25, m3, m3
+ punpckhwd m26, m4, m4
+ punpckhwd m27, m5, m5
+ punpckhwd m28, m6, m6
+ punpckhwd m29, m7, m7
+ punpcklwd m21, m1, m1
+ punpcklwd m14, m3, m3
+ punpcklwd m18, m5, m5
+ punpcklwd m15, m7, m7
+ pxor m9, m9
+ punpcklwd m9, m9, m0
+ punpcklwd m8, m2, m2
+ punpcklwd m7, m4, m4
+ punpcklwd m1, m6, m6
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+
+ pxor m12, m12
+ mov r3d, 64*3
+.zero_loop:
+ REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
+ sub r3d, 64
+ jge .zero_loop
+
+ jmp .pass2_end
+.full:
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 64
+ call .pass1
+ sub cq, 64
+ mova m22, [cq+128*0] ; 0 1
+ mova m23, [cq+128*1] ; 2 3
+ mova m24, [cq+128*2] ; 4 5
+ mova m25, [cq+128*3] ; 6 7
+ mova m26, [cq+128*4] ; 8 9
+ mova m27, [cq+128*5] ; 10 11
+ mova m28, [cq+128*6] ; 12 13
+ mova m29, [cq+128*7] ; 14 15
+ mova [cq+64* 8], m0
+ mova [cq+64* 9], m1
+ mova [cq+64*10], m2
+ mova [cq+64*11], m3
+ mova [cq+64*12], m4
+ mova [cq+64*13], m5
+ mova [cq+64*14], m6
+ mova [cq+64*15], m7
+ lea r5, [o_base_8bpc]
+
+ punpcklwd m20, m1, m1
+ punpcklwd m16, m3, m3
+ punpcklwd m19, m5, m5
+ punpcklwd m17, m7, m7
+ punpcklwd m8, m24, m24 ; 4
+ punpcklwd m5, m2, m2 ; 20
+ punpcklwd m1, m28, m28 ; 12
+ punpcklwd m7, m26, m26 ; 8
+ punpcklwd m3, m4, m4 ; 24
+ punpcklwd m4, m6, m6 ; 28
+ pxor m9, m9
+ punpcklwd m6, m9, m0 ; __ 16
+ mova m0, m4
+ punpcklwd m9, m9, m22 ; __ 0
+ call m(idct_16x16_internal_8bpc).main_fast
+ punpcklwd m21, m23, m23 ; 2
+ punpcklwd m15, m29, m29 ; 14
+ punpcklwd m18, m27, m27 ; 10
+ punpcklwd m14, m25, m25 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+ mova m21, [cq+64*15]
+ mova m14, [cq+64* 8]
+ mova m17, [cq+64*11]
+ mova m18, [cq+64*12]
+ mova m19, [cq+64*13]
+ mova m16, [cq+64*10]
+ mova m15, [cq+64* 9]
+ mova m20, [cq+64*14]
+ REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+ m24, m19, m16, m27, m28, m15, m20, m23
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+
+ pxor m12, m12
+ mov r3d, 32*7
+.full_zero_loop:
+ REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
+ sub r3d, 32
+ jge .full_zero_loop
+
+ jmp .pass2_end
+.fast:
+ mova ym0, [cq+128*0]
+ mova ym2, [cq+128*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*2]
+ mova ym3, [cq+128*6]
+ mova ym4, [cq+128*1]
+ mova ym5, [cq+128*3]
+ mova ym6, [cq+128*5]
+ mova ym7, [cq+128*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(idct8x32p)]
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ mova m6, [dup16_perm]
+ vpermb m0, m8, m0
+ vpermb m2, m8, m2
+ vprold m8, 16
+ vpermb m1, m8, m1
+ vpermb m3, m8, m3
+ punpckldq m4, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m21, m4, m2
+ punpckhdq m14, m4, m2
+ punpckldq m18, m0, m1
+ punpckhdq m15, m0, m1
+ vpord m7, m6, [o(pb_32)] {1to16}
+ vpermb m22, m7, m21 ; 1
+ pmovzxwd m9, ym21 ; 0
+ vpermb m8, m6, m18 ; 4
+ vpermb m24, m7, m18 ; 5
+ vpermb m21, m6, m14 ; 2
+ vpermb m23, m7, m14 ; 3
+ vpermb m14, m6, m15 ; 6
+ vpermb m25, m7, m15 ; 7
+ lea r5, [o_base_8bpc]
+ pslld m9, 16
+
+ pxor m7, m7
+ REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
+
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [rsp+mmsize*0], m14
+ mova [rsp+mmsize*1], m15
+ mova [rsp+mmsize*2], m16
+ mova [rsp+mmsize*3], m17
+ mova [rsp+mmsize*4], m18
+ mova [rsp+mmsize*5], m19
+ mova [rsp+mmsize*6], m20
+ mova [rsp+mmsize*7], m21
+
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+
+ pxor m12, m12
+ REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
+.pass2_end:
+ movshdup m30, [permC]
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m13, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ psrlq m31, m30, 8
+ vpermq m8, m30, m0
+ vpermq m9, m31, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m2
+ vpermq m9, m31, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m4
+ vpermq m9, m31, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m6
+ vpermq m9, m31, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ mova m1, [rsp+mmsize*0]
+ mova m2, [rsp+mmsize*1]
+ mova m3, [rsp+mmsize*2]
+ mova m4, [rsp+mmsize*3]
+ mova m5, [rsp+mmsize*4]
+ mova m6, [rsp+mmsize*5]
+ mova m7, [rsp+mmsize*6]
+ mova m8, [rsp+mmsize*7]
+
+ paddsw m0, m1, m21
+ psubsw m21, m1, m21
+ paddsw m1, m2, m20
+ psubsw m20, m2, m20
+ paddsw m2, m3, m19
+ psubsw m19, m3, m19
+ paddsw m3, m4, m18
+ psubsw m18, m4, m18
+ paddsw m4, m5, m17
+ psubsw m17, m5, m17
+ paddsw m5, m6, m16
+ psubsw m16, m6, m16
+ paddsw m6, m7, m15
+ psubsw m15, m7, m15
+ paddsw m7, m8, m14
+ psubsw m14, m8, m14
+
+ vpermq m8, m30, m0
+ vpermq m9, m31, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m2
+ vpermq m9, m31, m3
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m4
+ vpermq m9, m31, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m6
+ vpermq m9, m31, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ vpermq m8, m30, m14
+ vpermq m9, m31, m15
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m16
+ vpermq m9, m31, m17
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m18
+ vpermq m9, m31, m19
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m20
+ vpermq m9, m31, m21
+ call m(idct_16x8_internal_10bpc).write_16x4
+
+ vpermq m8, m30, m22
+ vpermq m9, m31, m23
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m24
+ vpermq m9, m31, m25
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m26
+ vpermq m9, m31, m27
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m30, m28
+ vpermq m9, m31, m29
+ call m(idct_16x8_internal_10bpc).write_16x4
+ RET
+.pass1:
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+128* 1]
+ mova m17, [cq+128* 3]
+ mova m18, [cq+128* 5]
+ mova m19, [cq+128* 7]
+ mova m20, [cq+128* 9]
+ mova m21, [cq+128*11]
+ mova m22, [cq+128*13]
+ mova m23, [cq+128*15]
+ call m(idct_16x16_internal_10bpc).main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp m(idct_16x16_internal_10bpc).main_end3
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm
new file mode 100644
index 0000000000..3833e17c99
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@@ -0,0 +1,8135 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2017-2021, The rav1e contributors
+; Copyright © 2020, Nathan Egge
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+%macro COEF 1-2
+pd_%1: times 4 dd %1
+%if %0 == 2
+pd_m%1: times 4 dd -%1
+%endif
+%endmacro
+
+COEF 201
+COEF 401
+COEF 601, 1
+COEF 799
+COEF 995
+COEF 1189, 1
+COEF 1380, 1
+COEF 1567
+COEF 1751
+COEF 1931
+COEF 2106, 1
+COEF 2276, 1
+COEF 2440
+COEF 2598, 1
+COEF 2751, 1
+COEF 2896
+COEF 3035
+COEF 3166
+COEF 3290
+COEF 3406
+COEF 3513
+COEF 3612
+COEF 3703
+COEF 3784
+COEF 3857
+COEF 3920
+COEF 3973
+COEF 4017
+COEF 4052
+COEF 4076
+COEF 4091
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%if ARCH_X86_32
+pd_1: times 4 dd 1
+%endif
+pd_2: times 4 dd 2
+pw_5: times 8 dw 5
+pd_1321: times 4 dd 1321
+pd_2482: times 4 dd 2482
+pd_m3344: times 4 dd -3344
+pd_2048: times 4 dd 2048
+pw_4x2048_4xm2048: times 4 dw 2048
+ times 4 dw -2048
+pw_4xm2048_4x2048: times 4 dw -2048
+ times 4 dw 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pd_3803: times 4 dd 3803
+pw_4096: times 8 dw 4096
+pd_5793: times 4 dd 5793
+pd_6144: times 4 dd 6144
+pw_8192: times 8 dw 8192
+pd_10240: times 4 dd 10240
+pd_11586: times 4 dd 11586
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_1697x16: times 8 dw 1697*16
+pw_16384: times 8 dw 16384
+pixel_10bpc_max: times 8 dw 0x03ff
+
+pw_1567_3784: times 4 dw 1567, 3784
+pw_m3784_1567: times 4 dw -3784, 1567
+pw_2896_2896: times 4 dw 2896, 2896
+pw_m2896_2896: times 4 dw -2896, 2896
+
+clip_18b_min: times 4 dd -0x20000
+clip_18b_max: times 4 dd 0x1ffff
+
+idct64_mul_16bpc:
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
+cextern iadst_4x4_internal_8bpc_ssse3.main
+cextern idct_4x8_internal_8bpc_ssse3.main
+cextern iadst_4x8_internal_8bpc_ssse3.main
+cextern idct_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x4_internal_8bpc_ssse3.main
+cextern iadst_8x4_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
+cextern iadst_8x8_internal_8bpc_ssse3.main
+cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x32_internal_8bpc_ssse3.main
+cextern idct_8x32_internal_8bpc_ssse3.main_fast
+cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
+cextern idct_16x64_internal_8bpc_ssse3.main
+cextern idct_16x64_internal_8bpc_ssse3.main_fast
+
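+; Editorial note: the tbl_4x16/8x16/16x16/8x32/16x32/32x16/32x32 tables below
+; hold eob thresholds; the pass-1 code compares the block's eob against them
+; (see the .zero_loop labels further down) to decide how many coefficient
+; groups actually need to be transformed.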
+tbl_4x16_2d: db 0, 13, 29, 45
+tbl_4x16_h: db 0, 16, 32, 48
+tbl_4x16_v: db 0, 4, 8, 12
+
+tbl_8x16_2d: db 0, 14, 30, 46
+tbl_8x16_v: db 0, 4, 8, 12
+tbl_8x16_h: db 0, 32, 64, 96
+
+tbl_16x16_2d: db 0, 10, 36, 78
+tbl_16x16_v: db 0, 4, 8, 12
+tbl_16x16_h: db 0, 64, 128, 192
+
+tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
+
+tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
+
+tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
+tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
+
+tbl_Nx32_odd_offset: db 2*16, 2*23
+ db 2*20, 2*19
+ db 2*18, 2*21
+ db 2*22, 2*17
+ db 2*30, 2*25
+ db 2*26, 2*29
+ db 2*28, 2*27
+ db 2*24, 2*31
+
+tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
+ db 2* 8, 2*40, 2*23, 2*38
+ db 2* 1, 2*36, 2*20, 2*42
+ db 2* 9, 2*44, 2*19, 2*34
+ db 2* 2, 2*60, 2*18, 2*50
+ db 2*10, 2*52, 2*21, 2*58
+ db 2* 3, 2*56, 2*22, 2*54
+ db 2*11, 2*48, 2*17, 2*62
+
+SECTION .text
+
+%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
+%define m(x) m_suffix(x, SUFFIX)
+
+; This refers to the first function in itx_sse, i.e. the start of the text
+; section, which is needed as a base pointer for constants.
+%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r6-$$+x ; PIC
+%endif
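+; Editorial note: on x86-64, constants are addressed directly; on x86-32,
+; o(x) rewrites the address as r6-$$+x, so r6 must hold a PIC base (loaded
+; with "LEA r6, $$" in INV_TXFM_FN below). Before tail calls into the 8bpc
+; ssse3 routines, r5 is likewise pointed at itx8_start, which those routines
+; presumably use as their own base pointer for constants.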
+
+%macro IWHT4_1D 0
+ ; m0 = in0, m1 = in1, m2 = in2, m3 = in3
+ paddd m0, m1 ; in0 += in1
+ psubd m4, m2, m3 ; tmp0 = in2 - in3
+ psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1
+ psrad m5, 1
+ psubd m2, m5, m1 ; in2 = tmp1 - in1
+ psubd m5, m3 ; in1 = tmp1 - in3
+ psubd m0, m5 ; in0 -= in1
+ paddd m4, m2 ; in3 = tmp0 + in2
+ ; m0 = out0, m1 = in1, m2 = out2, m3 = in3
+ ; m4 = out3, m5 = out1
+%endmacro
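+; Editorial sketch (not part of the original source) of the scalar lifting
+; steps performed by IWHT4_1D above:
+;   t0 = in0 + in1
+;   t1 = in2 - in3
+;   e  = (t0 - t1) >> 1
+;   out2 = e - in1
+;   out1 = e - in3
+;   out0 = t0 - out1
+;   out3 = t1 + out2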
+
+INIT_XMM sse2
+cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ REPX {psrad x, 2}, m0, m1, m2, m3
+ IWHT4_1D
+ punpckldq m1, m0, m5
+ punpckhdq m3, m0, m5
+ punpckldq m5, m2, m4
+ punpckhdq m2, m4
+ punpcklqdq m0, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m4, m3, m2
+ punpckhqdq m3, m2
+ mova m2, m4
+ IWHT4_1D
+ packssdw m0, m4 ; low: out3, high: out0
+ packssdw m2, m5 ; low: out2, high: out1
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ lea r2, [dstq+strideq*2]
+ movq m1, [dstq+strideq*0]
+ movhps m1, [r2 +strideq*1]
+ movq m3, [r2 +strideq*0]
+ movhps m3, [dstq+strideq*1]
+ movd m5, bdmaxm
+ pshuflw m5, m5, q0000 ; broadcast
+ punpcklqdq m5, m5 ; broadcast
+ paddsw m0, m1
+ paddsw m2, m3
+ pmaxsw m0, m4
+ pmaxsw m2, m4
+ pminsw m0, m5
+ pminsw m2, m5
+ movhps [r2 +strideq*1], m0 ; write out0
+ movhps [dstq+strideq*1], m2 ; write out1
+ movq [r2 +strideq*0], m2 ; write out2
+ movq [dstq+strideq*0], m0 ; write out3
+ RET
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 2 = inv_dst1, 4 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+; %1 dst/src[1]
+; %2 dst/src[2]
+; %3 tmp[1]
+; %4 tmp[2]
+; %5 tmp[3]
+; %6 rnd
+; %7 coef[1]
+; %8 coef[2]
+; %9 flags
+%ifnidn %7,%8 ; optimize when coef1 == coef2
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+ mova m%3, [o(pd_%8)]
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+ mova m%5, [o(pd_%7)]
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 4 ; invert dst2
+ paddd m%4, m%2
+ psubd m%2, m%6, m%4
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%4, m%6
+%else
+ paddd m%1, m%6
+%endif
+%endif
+%ifnidn %7,%8
+ paddd m%2, m%4
+%else
+ mova m%3, m%2
+ paddd m%2, m%1
+%endif
+%endif
+%if %9 & 2 ; invert dst1
+ psubd m%3, m%1
+ paddd m%1, m%3, m%6
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%1, m%6
+%endif
+%endif
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
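+; Editorial example: "ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017" (as used in
+; the 8x4 code below) leaves m1 = m1*799 - m7*4017 and m7 = m1*4017 + m7*799
+; (both computed from the original register values, with m8-m10 as scratch);
+; since rnd is '_', the +rnd and >>12 steps are left to the caller.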
+
+%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
+cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_16bpc)
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+ call %%p1
+ RET
+%%end:
+%else
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 0, 4x4
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ pxor m3, m3
+ punpcklqdq m0, m0
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
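+; Editorial note: the dct_dct branch above is the DC-only (eob == 0) shortcut:
+; the single DC coefficient is scaled and rounded into a 16-bit pixel offset,
+; broadcast across a register, and added by .dconly_loop to every row of the
+; block before clamping the result to [0, pixel_10bpc_max].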
+
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ; butterfly rotation
+ ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
+ ; Hadamard rotation
+ psubd m%5, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%3, m%4
+ psubd m%3, m%4
+ ; %1 (src1) = out0
+ ; %2 (src2) = out1
+ ; %3 (src3) = out3
+ ; %5 (tmp1) = out2
+%endmacro
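+; Editorial sketch of IDCT4_1D above (with a numeric rnd argument, as in the
+; 4x4 pass 1 below):
+;   t0 = (in0*2896 + in2*2896 + rnd) >> 12
+;   t1 = (in0*2896 - in2*2896 + rnd) >> 12
+;   t2 = (in1*1567 - in3*3784 + rnd) >> 12
+;   t3 = (in1*3784 + in3*1567 + rnd) >> 12
+;   out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3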
+
+INIT_XMM sse4
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ call .pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass1_main:
+ IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
+ ret
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pmaddwd m4, m2, [o(pw_m3784_1567)]
+ pmaddwd m2, [o(pw_1567_3784)]
+ pmaddwd m0, m1, [o(pw_m2896_2896)]
+ pmaddwd m1, [o(pw_2896_2896)]
+ REPX {paddd x, m5}, m4, m2, m0, m1
+ packssdw m5, m5 ; pw_2048
+ REPX {psrad x, 12}, m4, m2, m0, m1
+ packssdw m2, m4 ; t3 t2
+ packssdw m1, m0 ; t0 t1
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*1]
+ movhps m3, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movhps [r5 +strideq*0], m1
+ movq [r5 +strideq*1], m1
+ RET
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+.end:
+ mova m4, [o(pw_2048)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+ALIGN function_align
+.main:
+ mova m1, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [cq+16*0]
+ lea r3, [cq+16*1]
+.main2:
+ mova m0, [o(pd_1321)] ; SINPI_1_9
+ mova m2, [o(pd_2482)] ; SINPI_2_9
+ mova m6, [o(pd_3803)] ; SINPI_4_9
+ pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2]
+ pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3]
+ pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2]
+ pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0]
+ psubd m1, m3 ; T[2] - T[3]
+ pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3]
+ pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0]
+ paddd m0, m6 ; s[0] += s[3]
+ paddd m0, m3 ; s[0] += s[5]
+ mova m3, [o(pd_m3344)] ; -SINPI_3_9
+ psubd m2, m4 ; s[1] -= s[4]
+ psubd m2, m7 ; s[1] -= s[6]
+ psubd m1, m5 ; -b7 = (T[2] - T[3]) - T[0]
+ pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7
+ pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048
+ paddd m4, m0, m2 ; x[3] = s[0] + s[1]
+ psubd m2, m3 ; x[1] = s[1] + s[3]
+ psubd m0, m3 ; x[0] = s[0] + s[3]
+ paddd m4, m3 ; x[3] -= s[3]
+ paddd m2, m5 ; x[1] + 2048
+ REPX {psrad x, 12}, m0, m2, m1, m4
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_16bpc).main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_2048)]
+ movq m3, [dstq+strideq*1]
+ movhps m3, [dstq+strideq*0]
+ lea r5, [dstq+strideq*2]
+ movq m2, [r5 +strideq*1]
+ movhps m2, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movhps [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m1
+ movhps [r5 +strideq*0], m0
+ movq [r5 +strideq*1], m0
+ RET
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m3, [o(pd_5793)]
+ pmulld m0, m3, [cq+16*0]
+ pmulld m1, m3, [cq+16*1]
+ pmulld m2, m3, [cq+16*2]
+ pmulld m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ ; transpose
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ mova m4, [o(pw_1697x8)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ pmulhrsw m3, m4, m0
+ pmulhrsw m4, m1
+ paddsw m0, m3
+ paddsw m1, m4
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m4, [o(pixel_10bpc_max)]
+ packssdw m5, m5 ; pw_2048
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pxor m5, m5
+ mova [cq+16*0], m5
+ mova [cq+16*1], m5
+ mova [cq+16*2], m5
+ mova [cq+16*3], m5
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m5
+ pmaxsw m1, m5
+ pminsw m0, m4
+ pminsw m1, m4
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+
+%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 4x8
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity, 9
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ mova m3, [o(pd_2896)]
+ pmulld m0, m3, [cq+32*0+r5]
+ pmulld m1, m3, [cq+32*1+r5]
+ pmulld m2, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ call m(idct_4x4_internal_16bpc).pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova [cq+32*1+16], m4
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*0+16]
+ mova m6, [cq+32*1+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
+ ; m0-3 is now out0/1,3/2,4/5,7/6
+ mova m4, [o(pw_2048)]
+ shufps m1, m1, q1032
+ shufps m3, m3, q1032
+.end:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ mova m7, [o(pixel_10bpc_max)]
+ lea r2, [strideq*3]
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r2]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r2]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r2 ], m3
+ RET
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity, 9
+
+cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .pass1_main
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*2+16]
+ mova m6, [cq+32*3+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass1_main:
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+ lea r3, [cq+32*1+16]
+.loop_pass1:
+ mova m0, [o(pd_2048)]
+ mova m3, [o(pd_2896)]
+ pmulld m5, m3, [cq+32*0+r5]
+ pmulld m2, m3, [cq+32*1+r5]
+ pmulld m1, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m0}, m5, m2, m1, m3
+ REPX {psrad x, 12}, m5, m2, m1, m3
+ mova [r3], m2
+ call m(iadst_4x4_internal_16bpc).main2
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*2+16], m0
+ mova [cq+32*3+16], m1
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ ret
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_4x2048_4xm2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity, 9
+
+cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_16bpc).pass1_main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ mova m6, [cq+32*2+16]
+ mova m2, [cq+32*3+16]
+ punpcklwd m4, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ pshufd m2, m5, q1032
+ pshufd m3, m4, q1032
+ mova m4, [o(pw_4xm2048_4x2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity, 3
+
+cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+ mova m4, [o(pd_2896)]
+ mova m6, [o(pd_5793)]
+ ; clear m7 in case we skip the bottom square
+ pxor m7, m7
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 16
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 16
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ pmulld m0, m4, [cq+32*0+r5]
+ pmulld m1, m4, [cq+32*1+r5]
+ pmulld m2, m4, [cq+32*2+r5]
+ pmulld m3, m4, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {pmulld x, m6}, m0, m1, m2, m3
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova m7, m2
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m4, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4
+ mova m2, [cq+32*0+16]
+ punpckhwd m4, m2, m7
+ punpcklwd m2, m7
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ mova m4, [o(pw_4096)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+ INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity, v
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_2048)]
+.loop_pass1:
+ mova m0, [cq+64*0+r5]
+ mova m1, [cq+64*1+r5]
+ mova m2, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(idct_4x4_internal_16bpc).pass1_main
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m1, m4, m2
+ REPX {psrad x, 1}, m0, m1, m4, m2
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m2, [cq+64*0+16]
+ mova m3, [cq+64*1+16]
+ mova m4, [cq+64*0+32]
+ mova m5, [cq+64*1+32]
+ mova m6, [cq+64*0+48]
+ mova m7, [cq+64*1+48]
+ ; m0-7 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
+ ; m0-m6 hold out0-out13 [odd registers have their two outputs in swapped order]
+ ; [coeffq+16*7] has out15/14
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [cq+16*7]
+ REPX {shufps x, x, q1032}, m1, m3, m5, m7
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova [cq+16*2], m6
+ mova [cq+16*3], m7
+.end:
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ mova m7, [o(pixel_10bpc_max)]
+ mov r5d, 2
+ lea r3, [strideq*3]
+.loop:
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r3]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r3]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r3 ], m3
+ dec r5d
+ jz .end2
+ lea dstq, [dstq+strideq*8]
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
+ jmp .loop
+.end2:
+ RET
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity, v
+
+cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r6+r5]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
+ ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out4/11,5/10,6/9,7/8
+ ; m0/3/6/1 = out0/15,3/12,1/14,2/13
+ ; outputs 0-7 are expected in m0-m3, outputs 8-15 at cq+16*0 .. cq+16*3
+ movhps [cq+0*8], m4
+ movhps [cq+1*8], m2
+ movhps [cq+2*8], m5
+ movhps [cq+3*8], m7
+ movhps [cq+4*8], m3
+ movhps [cq+5*8], m1
+ movhps [cq+6*8], m6
+ movhps [cq+7*8], m0
+ punpcklqdq m0, m6
+ punpcklqdq m1, m3
+ punpcklqdq m3, m2, m4
+ punpcklqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity, v
+
+cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out3 out2
+ packssdw m1, m4 ; out1 out0
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
+ ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out11/4,10/5,9/6,8/7
+ ; m0/3/6/1 = out15/0,12/3,14/1,13/2
+ ; outputs 0-7 are expected in m0-m3, outputs 8-15 at cq+16*0 .. cq+16*3
+ movq [cq+0*8], m4
+ movq [cq+1*8], m2
+ movq [cq+2*8], m5
+ movq [cq+3*8], m7
+ movq [cq+4*8], m3
+ movq [cq+5*8], m1
+ movq [cq+6*8], m6
+ movq [cq+7*8], m0
+ punpckhqdq m0, m6
+ punpckhqdq m1, m3
+ punpckhqdq m3, m2, m4
+ punpckhqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN identity, dct, h
+INV_TXFM_4X16_FN identity, adst, h
+INV_TXFM_4X16_FN identity, flipadst, h
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_6144)]
+ mova m4, [o(pd_5793)]
+.loop_pass1:
+ pmulld m0, m4, [cq+64*0+r5]
+ pmulld m1, m4, [cq+64*1+r5]
+ pmulld m2, m4, [cq+64*2+r5]
+ pmulld m3, m4, [cq+64*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+ mova [cq+16*4], m0
+ mova [cq+16*5], m1
+ mova [cq+16*6], m2
+ mova [cq+16*7], m7
+ mova m0, [o(pw_1697x16)]
+ mova m7, [o(pw_2048)]
+ pmulhrsw m1, m0, m4
+ pmulhrsw m2, m0, m5
+ REPX {paddsw x, x}, m4, m5
+ paddsw m4, m1
+ paddsw m5, m2
+ REPX {pmulhrsw x, m7}, m4, m5
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova m4, [cq+16*7]
+ pmulhrsw m1, m0, m6
+ pmulhrsw m2, m0, m4
+ REPX {paddsw x, x}, m6, m4
+ paddsw m6, m1
+ paddsw m4, m2
+ REPX {pmulhrsw x, m7}, m6, m4
+ mova [cq+16*2], m6
+ mova [cq+16*3], m4
+ mova m4, [cq+16*4]
+ mova m1, [cq+16*5]
+ mova m2, [cq+16*6]
+ pmulhrsw m5, m0, m2
+ pmulhrsw m6, m0, m3
+ REPX {paddsw x, x}, m2, m3
+ paddsw m2, m5
+ paddsw m3, m6
+ pmulhrsw m6, m0, m1
+ pmulhrsw m0, m4
+ REPX {paddsw x, x}, m1, m4
+ paddsw m1, m6
+ paddsw m0, m4
+ REPX {pmulhrsw x, m7}, m2, m3, m1, m0
+ jmp m(idct_4x16_internal_16bpc).end
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 8x4, 15
+%else
+ INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+.pass1_entry:
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%else
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ mova m4, [cq+4*16]
+ mova m5, [cq+5*16]
+ mova m6, [cq+6*16]
+ mova m7, [cq+7*16]
+ call .rect2_mul
+ call r5
+ call .transpose4x8packed
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.transpose4x8packed:
+ ; transpose
+ punpcklwd m1, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m6, m0, m4
+ punpcklwd m0, m4
+
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m4, m6, m2
+ punpcklwd m6, m2
+
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ ret
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
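+; Editorial note: .rect2_mul pre-scales the coefficients by 2896/4096
+; (~1/sqrt(2)), the extra scaling step AV1 applies to rectangular transforms
+; whose width and height differ by a factor of 2 (e.g. the 8x4 blocks here).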
+.rect2_mul:
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [r3], m7
+ mova m7, [o(pd_2896)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+%endif
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+%if ARCH_X86_64
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ pmulld m0, m14
+ pxor m4, m4
+ jmp .main_pass1_fast2
+.main_pass1:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
+ REPX {pmulld x, m14}, m0, m4
+.main_pass1_fast2:
+ REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7
+ REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m9, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ REPX {pmaxsd x, m12}, m1, m8, m7, m9
+ REPX {pminsd x, m13}, m1, m8, m7, m9
+ REPX {pmulld x, m14}, m7, m1
+ paddd m0, m11
+ paddd m7, m11
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+ REPX {pmaxsd x, m12}, m0, m6, m5, m3
+ REPX {pminsd x, m13}, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+%else
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ mova m4, [o(pd_2048)]
+ mova [r3+0*16], m2
+ REPX {paddd x, m4}, m5, m3, m7, m1
+ REPX {psrad x, 12}, m5, m3, m7, m1
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ pmulld m5, m0, [o(pd_2896)]
+ mova m0, m4
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m1, m2, m7, m4
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3 }, m1, m2, m7, m4
+ mova [r3+3*16], m2
+ mova [r3+1*16], m4
+ pxor m4, m4
+ mova m2, [r3+0*16]
+ mova m3, [o(pd_2896)]
+ jmp .main_pass1_fast2
+.main_pass1:
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m4
+ mova [r3+3*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6 }, m1, m2, m7, m4
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6 }, m1, m2, m7, m4
+ mova m6, [r3+3*16]
+ mova [r3+3*16], m2
+ mova m2, [r3+1*16]
+ mova [r3+1*16], m4
+
+ ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3
+ mova m3, [o(pd_2896)]
+ mova m5, [r3+0*16]
+ mova m4, [r3+2*16]
+ REPX {pmulld x, m3 }, m5, m4
+.main_pass1_fast2:
+ REPX {paddd x, m0 }, m2, m6
+ REPX {psrad x, 12 }, m2, m6
+ REPX {pmulld x, m3 }, m7, m1
+ paddd m7, m0
+ paddd m0, m5
+
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1 }, m0, m6, m5, m3
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1 }, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ mova [r3+0*16], m6
+ mova m6, [r3+1*16]
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ mova m6, [r3+3*16]
+ psubd m4, m3, m6 ; out4
+ paddd m3, m6 ; out3
+ mova m6, [r3+0*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
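+ ; pass 2 works on the 16-bit packed output of pass 1 and reuses the
+ ; 8bpc SSSE3 kernel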
+ call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
+.end:
+ lea r3, [strideq*3]
+ call .round2_and_write_8x4
+ REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ RET
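+ ; .round2 loads the zero/max/rounding constants, .round1 applies the
+ ; pw_2048 rounding, .write_8x4 adds to dst and clips to [0, max]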
+.round2_and_write_8x4:
+ pxor m6, m6
+ mova m5, [o(pixel_10bpc_max)]
+ mova m4, [o(pw_2048)]
+.round1_and_write_8x4:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+.write_8x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ REPX {pminsw x, m5}, m0, m1, m2, m3
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.main_pass1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
+ ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ mova m8, [o(pd_2896)]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m12}, m5, m3, m2, m9
+ REPX {pminsd x, m13}, m5, m3, m2, m9
+ REPX {pmulld x, m14}, m5, m3, m2, m9
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m2, m9 ; (t6 - t7) * 2896
+ paddd m2, m9 ; (t6 + t7) * 2896
+ ret
+.round:
+
+ ; m0=out0,m1=-out1,m6=out6,m7=-out7
+
+ pcmpeqd m8, m8
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psubd x, m8 }, m1, m7
+ REPX {paddd x, m11}, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m5, [o(pd_2048)]
+
+ ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova m1, [r3+3*16]
+ mova [r3+3*16], m7
+ ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ psubd m7, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ mova [r3+0*16], m7
+ mova m5, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m4, m1, m5 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
+ mova [r3+1*16], m7
+ mova m7, [o(clip_18b_max)]
+ pmaxsd m3, [r3+0*16]
+ REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
+ pminsd m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m6, m3 ; t7
+ paddd m6, m3 ; out6
+ mova [r3+3*16], m6
+ mova m0, [r3+0*16]
+ mova m6, [r3+1*16]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m6 ; t2
+ paddd m0, m6 ; out0
+ psubd m6, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ mova m4, [o(clip_18b_min)]
+ REPX {pmaxsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(clip_18b_max)]
+ REPX {pminsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(pd_2896)]
+ REPX {pmulld x, m4 }, m5, m3, m6, m2
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m6, m2 ; (t6 - t7) * 2896
+ paddd m2, m6 ; (t6 + t7) * 2896
+ ret
+.round:
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0
+ mova m6, [o(pd_2048)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+
+ mova m6, [r3+3*16]
+ mova m0, [r3+2*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x4_internal_16bpc).round
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ ret
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+ add dstq, r3
+ neg strideq
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.pass2:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x4_internal_16bpc).end
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16
+%else
+ INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 2
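+ ; 2 iterations of the 4-row loop below cover the 8 rows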
+.end:
+ add r5d, 384
+ sar r5d, 9
+.end2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+.loop:
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ lea dstq, [dstq+strideq*4]
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity, 6
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ mov [rsp+4*16+1*gprsize], r1
+%else
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [o(.pass1_main)]
+
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
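+ ; r5 = 16 if eob >= 10, else 0; pass 1 processes the coefficients at
+ ; offset 16 first (when present) and then offset 0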
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*32+r5]
+ mova m1, [cq+1*32+r5]
+ mova m2, [cq+2*32+r5]
+ mova m3, [cq+3*32+r5]
+ mova m4, [cq+4*32+r5]
+ mova m5, [cq+5*32+r5]
+ mova m6, [cq+6*32+r5]
+ mova m7, [cq+7*32+r5]
+ call t0
+
+ test r5d, r5d
+ jz .end_pass1
+
+ mova [cq+0*32+16], m0
+ mova [cq+1*32+16], m1
+ mova [cq+2*32+16], m2
+ mova [cq+3*32+16], m3
+
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ mov r1, [rsp+4*16+1*gprsize]
+%endif
+ jmp tx2q
+.pass1_main:
+ call m(idct_8x4_internal_16bpc).main_pass1
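+ ; pcmpeqd yields -1; subtracting it from one operand of each butterfly
+ ; adds a +1 rounding bias to all eight outputs before the >>1 below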
+ pcmpeqd m1, m1
+ REPX {psubd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+.pack_and_transpose:
+ packssdw m2, m3
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw m4, m5
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+.zero:
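+ ; clear the consumed coefficients (cq is expected to be zeroed on return)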
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%undef mzero
+ RET
+
+ ; round (rounded right-shift by 5) before writing
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+ ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+ ; .round4 is x86-32-only; it is like .round2 but with the constant already in m7
+%if ARCH_X86_32
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+%if ARCH_X86_32
+.round4_and_write_8x8:
+%endif
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize*2]
+%if ARCH_X86_64
+ jmp .write_8x8
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+%endif
+
+ ; m0-7 have to-be-written data [pre-rounded]
+ ; on x86-64, m9-10 contain a zero/pixel_max
+ ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
+ ; r0,1,3 contain dstq/strideq/stride3q
+ ; r5 is a scratch register
+.write_8x8:
+ lea r5, [dstq+strideq*4]
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ paddw m4, [r5 +strideq*0]
+ paddw m5, [r5 +strideq*1]
+ paddw m6, [r5 +strideq*2]
+ paddw m7, [r5 +r3]
+%if ARCH_X86_64
+ REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [rsp+gprsize*2], m7
+ pxor m7, m7
+ REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmaxsw m7, [rsp+gprsize*2]
+ mova [rsp+gprsize*2], m7
+ mova m7, [o(pixel_10bpc_max)]
+ REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsw m7, [rsp+gprsize*2]
+%endif
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ mova [r5 +strideq*0], m4
+ mova [r5 +strideq*1], m5
+ mova [r5 +strideq*2], m6
+ mova [r5 +r3 ], m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity, 6
+
+cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call .round
+ jmp m(idct_8x8_internal_16bpc).pack_and_transpose
+.round:
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ REPX {psubd x, m8 }, m0, m6
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m0, m1, m6, m7
+ REPX {psubd x, m8 }, m1, m7
+ mova m8, [o(pd_6144)]
+ REPX {paddd x, m8 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+%else
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0 ; -1
+ mova m6, [o(pd_6144)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m1, m7
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+
+ mova m0, [r3+2*16]
+ psrld m6, 12 ; +1
+ paddd m0, m6
+ paddd m6, [r3+3*16]
+ REPX {psrad x, 1 }, m0, m6
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+ jmp m(idct_8x8_internal_16bpc).zero
+
+ ; round (rounded right-shift by 5) before writing; odd registers are negated
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+ ; .round3 is the same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+%if ARCH_X86_64
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8 }, m0, m2, m4, m6
+ REPX {pmulhrsw x, m11}, m1, m3, m5, m7
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+%else
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova m7, [o(pw_m2048)]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize*2]
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity, 6
+
+cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x8_internal_16bpc).round
+ ; invert registers
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+ lea dstq, [dstq+strideq*8]
+ sub dstq, strideq
+ neg strideq
+ jmp m(iadst_8x8_internal_16bpc).pass2
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+0*32]
+ mova m1, [cq+1*32]
+ mova m2, [cq+2*32]
+ mova m3, [cq+3*32]
+ mova m4, [cq+4*32]
+ mova m5, [cq+5*32]
+ mova m6, [cq+6*32]
+ mova m7, [cq+7*32]
+ packssdw m0, [cq+0*32+16]
+ packssdw m1, [cq+1*32+16]
+ packssdw m2, [cq+2*32+16]
+ packssdw m3, [cq+3*32+16]
+ packssdw m4, [cq+4*32+16]
+ packssdw m5, [cq+5*32+16]
+ packssdw m6, [cq+6*32+16]
+ packssdw m7, [cq+7*32+16]
+ mova [rsp+gprsize+16*1], m6
+ jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mova m8, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+ jmp m(idct_8x8_internal_16bpc).zero
+
+%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
+%else
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ mov r3d, 4
+%if stack_size_padded > 0
+ ; adjust to caller's stack allocation
+ add rsp, (12+ARCH_X86_64)*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, v
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%endif
+
+cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+ mov r6d, 4
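+ ; scan the eob threshold table at r5 backwards to find the last group
+ ; of coefficients that needs processing in pass 1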
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*64+r5]
+ mova m1, [cq+1*64+r5]
+ mova m2, [cq+2*64+r5]
+ mova m3, [cq+3*64+r5]
+ mova m4, [cq+4*64+r5]
+ mova m5, [cq+5*64+r5]
+ mova m6, [cq+6*64+r5]
+ mova m7, [cq+7*64+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call t0
+
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ sub r5d, 16
+ jge .loop_pass1
+%if WIN64
+ POP r7
+%elif ARCH_X86_32
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+
+ ; input is in cq+N*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
+ ; some are still pre-loaded from the final loop iteration of pass 1
+
+ mova m1, m2
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+11*16]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+ 4*16]
+ mova m1, [cq+12*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+13*16]
+ mova m4, [cq+ 6*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 7*16]
+ mova m7, [cq+15*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+ ; out0-7 is in rsp+gprsize+3-10*mmsize
+ ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, v
+
+cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+13*16]
+ mova [rsp+gprsize+7*16], m0
+ mova [rsp+gprsize+8*16], m1
+ mova [rsp+gprsize+5*16], m4
+ mova [rsp+gprsize+6*16], m5
+ mova m0, m2
+ mova m1, m3
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 5*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+ 6*16]
+ mova m6, [cq+11*16]
+ mova m7, [cq+15*16]
+ mova [rsp+gprsize+ 3*16], m4
+ mova [rsp+gprsize+ 4*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m4, [cq+10*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+ 7*16]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+%if ARCH_X86_64
+ mova m11, [o(pw_m2048)]
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, v
+
+cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_8x16_internal_16bpc).pass2
+
+INV_TXFM_8X16_FN identity, dct, h
+INV_TXFM_8X16_FN identity, adst, h
+INV_TXFM_8X16_FN identity, flipadst, h
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ call .main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
+ dec r5d
+ jle .end
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 4*16]
+ mova m2, [cq+ 8*16]
+ mova m3, [cq+12*16]
+ jmp .pass2_loop
+.end:
+ RET
+.main:
+ ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
+%if ARCH_X86_32
+ mova m7, [o(pw_1697x16)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+%else
+ pmulhrsw m8, m7, m0
+ pmulhrsw m9, m7, m1
+ pmulhrsw m10, m7, m2
+ pmulhrsw m11, m7, m3
+%endif
+ REPX {paddsw x, x}, m0, m1, m2, m3
+%if ARCH_X86_64
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m2, m10
+ paddsw m3, m11
+%else
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+%endif
+ ret
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly:
+ add r5d, 384
+ sar r5d, 9
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m3, [o(pixel_10bpc_max)]
+ pxor m4, m4
+.loop:
+ mova m1, [dstq+ 0]
+ mova m2, [dstq+16]
+ REPX {paddw x, m0}, m1, m2
+ REPX {pminsw x, m3}, m1, m2
+ REPX {pmaxsw x, m4}, m1, m2
+ mova [dstq+ 0], m1
+ mova [dstq+16], m2
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+
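+ ; 16-point DCT pass 1: odd-indexed inputs go through .main_oddhalf
+ ; (results kept on the stack at r3), even-indexed inputs through the
+ ; 8-point DCT, then .round combines both halves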
+ mova m0, [cq+ 1*16]
+ mova m1, [cq+ 3*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+ 7*16]
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+11*16]
+ mova m6, [cq+13*16]
+ mova m7, [cq+15*16]
+ call .main_oddhalf
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 2*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+ 6*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+14*16]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ ; t0-7 is in m0-7
+
+ call .round
+
+%if ARCH_X86_64
+.pack_transpose:
+ ; transpose in two parts
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+.transpose:
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call .transpose4x8packed_hi
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ jmp tx2q
+%if ARCH_X86_64
+.transpose4x8packed_hi:
+ punpcklwd m9, m10, m14
+ punpckhwd m10, m14
+ punpckhwd m14, m8, m12
+ punpcklwd m8, m12
+
+ punpckhwd m11, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m12, m14, m10
+ punpcklwd m14, m10
+
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ punpckhwd m9, m8, m14
+ punpcklwd m8, m14
+ ret
+%endif
+.main_oddhalf_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_4076)]
+ pmulld m0, [o(pd_401)]
+ pmulld m6, m1, [o(pd_m1189)]
+ pmulld m1, [o(pd_3920)]
+%if ARCH_X86_32
+ mova m4, [o(pd_2048)]
+ REPX {paddd x, m4}, m1, m6
+ REPX {psrad x, 12}, m1, m6
+ mova [r3+1*16], m1
+%endif
+ pmulld m5, m2, [o(pd_3612)]
+ pmulld m2, [o(pd_1931)]
+%if ARCH_X86_32
+ pmulld m1, m3, [o(pd_m2598)]
+%else
+ pmulld m4, m3, [o(pd_m2598)]
+%endif
+ pmulld m3, [o(pd_3166)]
+ jmp .main_oddhalf_fast2
+.main_oddhalf:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t9
+ paddd m0, m4 ; t8
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
+ psubd m3, m1, m4 ; t10
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ psubd m2, m8, m6 ; t13
+ paddd m6, m8 ; t14
+ psubd m8, m7, m5 ; t12a
+ paddd m7, m5 ; t15a
+ REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pmulld x, m14}, m2, m8, m3, m4
+ paddd m2, m11
+ paddd m8, m11
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m8, m4 ; t11
+ paddd m4, m8 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m4, [o(pd_2048)]
+
+ ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
+
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova m1, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+
+ ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a
+
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+.main_oddhalf_fast2:
+ REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3
+ REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3
+ psubd m4, m0, m1 ; t9
+ paddd m0, m1 ; t8
+ mova m1, [r3+1*16]
+ mova [r3+0*16], m4
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784
+ ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ psubd m5, m1, m4 ; t10
+ mova [r3+1*16], m5
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m3, m6 ; t13
+ paddd m6, m3 ; t14
+ paddd m3, m7, m5 ; t15a
+ psubd m7, m5 ; t12a
+ mova [r3+0*16], m3
+ mova m3, [r3+1*16]
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pminsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(pd_2896)]
+ REPX {pmulld x, m5}, m2, m7, m3, m4
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m2, m7
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m7, m4 ; t11
+ paddd m4, m7 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova m7, [r3+0*16]
+ mova [r3+11*16], m0
+ mova [r3+10*16], m1
+ mova [r3+9*16], m2
+ mova [r3+8*16], m3
+ mova [r3+7*16], m4
+ mova [r3+6*16], m5
+ mova [r3+5*16], m6
+ mova [r3+4*16], m7
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ pcmpeqd m8, m8
+ REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; and out0-15 is now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ pcmpeqd m0, m0
+ REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+ 0*16]
+ psubd m1, m0
+ mova [r3+ 0*16], m1
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 1}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 1}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
+ lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
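+ ; r4 = 8bpc SSSE3 transform for pass 2; the 16-wide block is handled
+ ; as two 8x4 halves, writing the left half and then the right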
+.pass2_loop:
+ lea r3, [strideq*3]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%if ARCH_X86_64
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+%else
+ mova m0, [rsp+gprsize+0*16]
+ mova m1, [rsp+gprsize+1*16]
+ mova m2, [rsp+gprsize+2*16]
+ mova m3, [rsp+gprsize+3*16]
+%endif
+ add dstq, 16
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ RET
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+ call .main
+%if ARCH_X86_64
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.main:
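+ ; 16-point ADST pass 1 is split in two: .main_part1 stores its half
+ ; on the stack and .main_part2 combines it with the other half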
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*16]
+ mova m1, [cq+13*16]
+ mova m2, [cq+ 6*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+10*16]
+ mova m5, [cq+ 5*16]
+ mova m6, [cq+14*16]
+ mova m7, [cq+ 1*16]
+ call .main_part1
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+15*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+11*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+ 7*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+ 3*16]
+ call .main_part2
+.round:
+%if ARCH_X86_64
+ mova m15, [o(pd_6144)]
+ psrld m14, 11 ; pd_1
+ pcmpeqd m8, m8 ; -1
+ psubd m13, m15, m14 ; pd_6143
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m15}, m4, m6
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {psrad x, 1 }, m1, m3
+ REPX {paddd x, m15}, m5, m7
+ REPX {psubd x, m8 }, m1, m3
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_6144)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 13}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {pxor x, m1}, m5, m7
+ REPX {psubd x, m1}, m4, m6
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 13}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ REPX {psubd x, m1}, m4, m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.main_part2:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
+ psubd m8, m0, m4 ; t8a
+ paddd m0, m4 ; t0a
+ psubd m4, m1, m5 ; t9a
+ paddd m1, m5 ; t1a
+ psubd m5, m2, m6 ; t12a
+ paddd m2, m6 ; t4a
+ psubd m6, m3, m7 ; t13a
+ paddd m7, m3 ; t5a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t4
+ paddd m0, m2 ; t0
+ psubd m2, m1, m7 ; t5
+ paddd m1, m7 ; t1
+ psubd m7, m4, m6 ; t12a
+ paddd m4, m6 ; t8a
+ psubd m6, m8, m5 ; t13a
+ paddd m5, m8 ; t9a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
+ mova m10, [r3+0*16] ; t2
+ mova m8, [r3+1*16] ; t3
+ psubd m9, m0, m10 ; t2a
+ paddd m0, m10 ; out0
+ psubd m10, m1, m8 ; t3a
+ paddd m1, m8 ; -out15
+ mova [r3+0*16], m1
+ mova m15, [r3+3*16] ; t7a
+ mova m1, [r3+2*16] ; t6a
+ psubd m8, m3, m15 ; t7
+ paddd m15, m3 ; out12
+ paddd m3, m2, m1 ; -out3
+ psubd m2, m1 ; t6
+ mova [r3+3*16], m15
+ mova [r3+1*16], m2
+ mova m1, [r3+7*16] ; t15
+ mova m2, [r3+6*16] ; t14
+ paddd m15, m7, m1 ; -out13
+ psubd m7, m1 ; t15a
+ psubd m11, m6, m2 ; t14a
+ paddd m2, m6 ; out2
+ mova [r3+2*16], m15
+ mova m1, [r3+4*16] ; t10a
+ mova m15, [r3+5*16] ; t11a
+ psubd m6, m4, m1 ; t10
+ paddd m1, m4 ; -out1
+ psubd m4, m5, m15 ; t11
+ paddd m5, m15 ; out14
+ REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
+ pmaxsd m12, [r3+1*16] ; t6
+ mova [r3+1*16], m5
+ REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
+ REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
+ paddd m5, m11, m7 ; -out5 (unshifted)
+ psubd m11, m7 ; out10 (unshifted)
+ paddd m7, m9, m10 ; -out7 (unshifted)
+ psubd m9, m10 ; out8 (unshifted)
+ psubd m10, m6, m4 ; -out9 (unshifted)
+ paddd m6, m4 ; out6 (unshifted)
+ paddd m4, m12, m8 ; out4 (unshifted)
+ psubd m12, m8 ; -out11 (unshifted)
+%else
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova [r3+8*16], m4
+ mova m4, [r3+10*16]
+ mova [r3+9*16], m5
+ mova [r3+10*16], m6
+ mova m5, [r3+11*16]
+ mova [r3+11*16], m7
+ ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
+ ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
+ mova m2, [r3+8*16]
+ mova m6, [r3+9*16]
+ psubd m3, m0, m2 ; t8a
+ paddd m0, m2 ; t0a
+ mova [r3+8*16], m3
+ psubd m2, m1, m6 ; t9a
+ paddd m1, m6 ; t1a
+ mova m3, [r3+10*16]
+ psubd m6, m4, m3 ; t12a
+ paddd m4, m3 ; t4a
+ mova m3, [r3+11*16]
+ psubd m7, m5, m3 ; t13a
+ paddd m5, m3 ; t5a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pmaxsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ psubd m3, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m1, m5 ; t5
+ paddd m1, m5 ; t1
+ mova m5, [o(pd_2048)]
+ mova [r3+9*16], m1
+ mova [r3+10*16], m4
+ mova [r3+11*16], m3
+ mova m3, [r3+8*16]
+ mova [r3+8*16], m0
+ ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
+ ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
+ psubd m5, m2, m7 ; t12a
+ paddd m2, m7 ; t8a
+ psubd m7, m3, m6 ; t13a
+ paddd m6, m3 ; t9a
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova m4, [r3+10*16]
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pmaxsd m3, [r3+11*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
+ ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
+ mova m0, [r3+7*16] ; t7a
+ mova m2, [r3+6*16] ; t6a
+ psubd m1, m3, m0 ; t7
+ paddd m0, m3 ; out12
+ paddd m3, m4, m2 ; -out3
+ psubd m4, m2 ; t6
+ mova [r3+7*16], m3
+ mova m3, [r3+3*16] ; t15
+ mova m2, [r3+2*16] ; t14
+ paddd m6, m5, m3 ; -out13
+ psubd m5, m3 ; t15a
+ psubd m3, m7, m2 ; t14a
+ paddd m2, m7 ; out2
+ mova [r3+6*16], m2
+ mova m7, [r3+0*16] ; t10a
+ mova m2, [r3+1*16] ; t11a
+ mova [r3+0*16], m0
+ mova [r3+1*16], m6
+ mova m6, [r3+11*16]
+ psubd m0, m6, m2 ; t11
+ paddd m6, m2 ; out14
+ mova [r3+2*16], m6
+ mova m2, [r3+10*16]
+ psubd m6, m2, m7 ; t10
+ paddd m2, m7 ; -out1
+ mova m7, [r3+5*16] ; t3
+ mova [r3+5*16], m2
+ mova [r3+10*16], m1
+ mova m1, [r3+9*16]
+ psubd m2, m1, m7 ; t3a
+ paddd m1, m7 ; -out15
+ mova [r3+3*16], m1
+ mova m1, [r3+4*16] ; t2
+ mova m7, [r3+8*16]
+ psubd m7, m1 ; t2a
+ paddd m1, [r3+8*16] ; out0
+ mova [r3+4*16], m1
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmaxsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pminsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(pd_2896)]
+ REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmulld m1, [r3+10*16]
+ mova [r3+11*16], m3
+ psubd m3, m4, m1 ; -out11 (unshifted)
+ paddd m4, m1 ; out4 (unshifted)
+ psubd m1, m6, m0 ; -out9 (unshifted)
+ paddd m6, m0 ; out6 (unshifted)
+ psubd m0, m7, m2 ; out8 (unshifted)
+ paddd m7, m2 ; -out7 (unshifted)
+ mova m2, [r3+11*16]
+ mova [r3+11*16], m5
+ paddd m5, m2 ; -out5 (unshifted)
+ psubd m2, [r3+11*16] ; out10 (unshifted)
+ ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
+ ; r[-4,3] contain out0-3 and out12-15
+%endif
+ ret
+.main_part1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
+ psubd m8, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m3, m7 ; t15a
+ paddd m7, m3 ; t7a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m8, m5 ; t15a
+ paddd m5, m8 ; t11a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_1567)]
+ mova m10, [o(pd_3784)]
+ ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [r3+4*16]
+ mova m1, [r3+5*16]
+ mova m2, [r3+6*16]
+ mova m7, [r3+7*16]
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
+ ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ psubd m6, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ mova [r3+4*16], m6
+ mova m6, [r3+2*16]
+ mova m3, [r3+3*16]
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m7, m3 ; t15a
+ paddd m7, m3 ; t7a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pmaxsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pminsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ mova [r3+5*16], m1
+ mova [r3+6*16], m3
+ mova [r3+7*16], m2
+ mova m1, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
+ ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m1, m5 ; t15a
+ paddd m5, m1 ; t11a
+ mova m1, [r3+5*16]
+ mova m3, [r3+6*16]
+ mova m2, [r3+7*16]
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pmaxsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pminsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
+ ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r3, [rsp+gprsize]
+ call m(iadst_16x4_internal_16bpc).main
+%if ARCH_X86_64
+ packssdw m1, m0
+ packssdw m3, m2
+ packssdw m5, m4
+ packssdw m7, m6
+ packssdw m9, m8
+ packssdw m11, m10
+ packssdw m13, m12
+ packssdw m15, m14
+ mova m0, m15
+ mova m2, m13
+ mova m4, m11
+ mova m6, m9
+ mova m8, m7
+ mova m10, m5
+ mova m12, m3
+ mova m14, m1
+ jmp m(idct_16x4_internal_16bpc).transpose
+%else
+ mova [rsp+gprsize+4*16], m0
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m4
+ mova [rsp+gprsize+7*16], m6
+ pshufd m6, [rsp+gprsize+ 8*16], q1032
+ pshufd m4, [rsp+gprsize+ 9*16], q1032
+ pshufd m2, [rsp+gprsize+10*16], q1032
+ pshufd m0, [rsp+gprsize+11*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ pshufd m6, [rsp+gprsize+ 4*16], q1032
+ pshufd m4, [rsp+gprsize+ 5*16], q1032
+ pshufd m2, [rsp+gprsize+ 6*16], q1032
+ pshufd m0, [rsp+gprsize+ 7*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.pass2:
+ lea r3, [strideq*3]
+ lea dstq, [dstq+r3]
+ neg strideq
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*16]
+ pmulld m1, m15, [cq+ 1*16]
+ pmulld m2, m15, [cq+ 2*16]
+ pmulld m3, m15, [cq+ 3*16]
+ pmulld m4, m15, [cq+ 4*16]
+ pmulld m5, m15, [cq+ 5*16]
+ pmulld m6, m15, [cq+ 6*16]
+ pmulld m7, m15, [cq+ 7*16]
+ pmulld m8, m15, [cq+ 8*16]
+ pmulld m9, m15, [cq+ 9*16]
+ pmulld m10, m15, [cq+10*16]
+ pmulld m11, m15, [cq+11*16]
+ pmulld m12, m15, [cq+12*16]
+ pmulld m13, m15, [cq+13*16]
+ pmulld m14, m15, [cq+14*16]
+ pmulld m15, [cq+15*16]
+ mova [cq+ 0*16], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq+ 0*16]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ add cq, 8*16
+ mov r5d, 2
+.loop_pass1:
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+0*16]
+ pmulld m1, m7, [cq+1*16]
+ pmulld m2, m7, [cq+2*16]
+ pmulld m3, m7, [cq+3*16]
+ pmulld m4, m7, [cq+4*16]
+ pmulld m5, m7, [cq+5*16]
+ pmulld m6, m7, [cq+6*16]
+ pmulld m7, [cq+7*16]
+ mova [cq+7*16], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [cq+7*16]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ dec r5d
+ jz .end_pass1
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ sub cq, 8*16
+ jmp .loop_pass1
+.end_pass1:
+ jmp tx2q
+%endif
+
+.pass2:
+%if ARCH_X86_64
+ mova m12, [o(pw_1697x8)]
+%endif
+ lea r4, [o(.main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+.main:
+%if ARCH_X86_64
+ pmulhrsw m4, m0, m12
+ pmulhrsw m5, m1, m12
+ pmulhrsw m6, m2, m12
+ pmulhrsw m7, m3, m12
+%else
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m0, m7
+ pmulhrsw m5, m1, m7
+ pmulhrsw m6, m2, m7
+ pmulhrsw m7, m3
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+%if ARCH_X86_32
+ add rsp, 1*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, 6
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 4, 6
+%else
+ mov [rsp+gprsize+12*16], r1
+ DECLARE_REG_TMP 1, 4, 3
+%endif
+ lea t0, [o(.main)]
+.loop_main:
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*32+r5], m8
+ mova [cq+5*32+r5], m9
+ mova [cq+6*32+r5], m10
+ mova [cq+7*32+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*32+r5], m0
+ mova [cq+5*32+r5], m1
+ mova [cq+6*32+r5], m2
+ mova [cq+7*32+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ pxor m7, m7
+ REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
+ test r5d, r5d
+ jz .end
+ mova [cq+0*32+r5], m0
+ mova [cq+1*32+r5], m1
+ mova [cq+2*32+r5], m2
+ mova [cq+3*32+r5], m3
+ xor r5d, r5d
+ jmp .loop_pass1
+.end:
+
+ jmp tx2q
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*32+r5]
+ mova m1, [cq+ 3*32+r5]
+ mova m2, [cq+ 5*32+r5]
+ mova m3, [cq+ 7*32+r5]
+ mova m4, [cq+ 9*32+r5]
+ mova m5, [cq+11*32+r5]
+ mova m6, [cq+13*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 2*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+ 6*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+10*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+14*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+.pass2_main:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity, 6
+
+cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*32+r5]
+ mova m1, [cq+13*32+r5]
+ mova m2, [cq+ 6*32+r5]
+ mova m3, [cq+ 9*32+r5]
+ mova m4, [cq+10*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+ 1*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+15*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+ 7*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+ 3*32+r5]
+%if ARCH_X86_32
+ add r3, 8*16
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 8*16
+%endif
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call m(iadst_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mova m11, [o(pw_m2048)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity, 6
+
+cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+ call m(iadst_16x8_internal_16bpc).main
+%if ARCH_X86_64
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ mova m14, m1
+ mova m12, m3
+ mova m10, m5
+ mova m8, m7
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+8*16], m7
+ mova [r3+9*16], m5
+ mova [r3+10*16], m3
+ mova [r3+11*16], m1
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
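+ ; flip vertically: point dst at the last row and negate the stride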
+ lea dstq, [dstq+strideq*8]
+ neg strideq
+ add dstq, strideq
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], strideq
+%endif
+ jmp m(iadst_16x8_internal_16bpc).pass2
+
+INV_TXFM_16X8_FN identity, dct, -54
+INV_TXFM_16X8_FN identity, adst, -54
+INV_TXFM_16X8_FN identity, flipadst, -54
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+%if ARCH_X86_64
+ mova m15, [o(pd_2896)]
+ pmulld m0, m15, [cq+ 0*32+r5]
+ pmulld m1, m15, [cq+ 1*32+r5]
+ pmulld m2, m15, [cq+ 2*32+r5]
+ pmulld m3, m15, [cq+ 3*32+r5]
+ pmulld m4, m15, [cq+ 4*32+r5]
+ pmulld m5, m15, [cq+ 5*32+r5]
+ pmulld m6, m15, [cq+ 6*32+r5]
+ pmulld m7, m15, [cq+ 7*32+r5]
+ pmulld m8, m15, [cq+ 8*32+r5]
+ pmulld m9, m15, [cq+ 9*32+r5]
+ pmulld m10, m15, [cq+10*32+r5]
+ pmulld m11, m15, [cq+11*32+r5]
+ pmulld m12, m15, [cq+12*32+r5]
+ pmulld m13, m15, [cq+13*32+r5]
+ pmulld m14, m15, [cq+14*32+r5]
+ pmulld m15, [cq+15*32+r5]
+ mova [r3], m15
+ mova m15, [o(pd_2048)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ mova [r3], m15
+ mova m15, [o(pd_11586)]
+ REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pmulld m15, [r3]
+ mova [r3], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 1*32+r5]
+ mova m2, [cq+ 2*32+r5]
+ mova m3, [cq+ 3*32+r5]
+ mova m4, [cq+ 4*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+ 6*32+r5]
+ mova m7, [cq+ 7*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m0, [cq+ 8*32+r5]
+ mova m1, [cq+ 9*32+r5]
+ mova m2, [cq+10*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+12*32+r5]
+ mova m5, [cq+13*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_4096)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_64
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
+%else
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (5+ARCH_X86_64*3+WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, v
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 7
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+%elif ARCH_X86_32
+ DECLARE_REG_TMP 1, 6
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+.pass1_full:
+%undef cmp
+ mov t1d, 4
+.zero_loop:
+ dec t1d
+ cmp eobb, byte [r5+t1]
+ jb .zero_loop
+ mov r5d, t1d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+%endif
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*64+r5], m8
+ mova [cq+5*64+r5], m9
+ mova [cq+6*64+r5], m10
+ mova [cq+7*64+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*64+r5], m0
+ mova [cq+5*64+r5], m1
+ mova [cq+6*64+r5], m2
+ mova [cq+7*64+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 16
+ jge .loop_pass1
+
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mova m0, [cq+ 1*64+r5]
+ mova m1, [cq+ 3*64+r5]
+ mova m2, [cq+ 5*64+r5]
+ mova m3, [cq+ 7*64+r5]
+ mova m4, [cq+ 9*64+r5]
+ mova m5, [cq+11*64+r5]
+ mova m6, [cq+13*64+r5]
+ mova m7, [cq+15*64+r5]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+ 2*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+ 6*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+10*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+14*64+r5]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ psrld m8, m11, 10 ; 2
+ REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; and out0-15 are now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova m0, [o(pd_2)]
+ REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ paddd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 2}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 2}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+ 0]
+ mova m1, [cq+2*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+2*64+16]
+ mova m4, [cq+0*64+32]
+ mova m5, [cq+2*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+2*64+48]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+1*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+1*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+1*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+1*64+48]
+ mova m7, [cq+3*64+48]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+ ; out0-7 are in rsp+gprsize+3-10*mmsize
+ ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*64+r5]
+ mova m1, [cq+13*64+r5]
+ mova m2, [cq+ 6*64+r5]
+ mova m3, [cq+ 9*64+r5]
+ mova m4, [cq+10*64+r5]
+ mova m5, [cq+ 5*64+r5]
+ mova m6, [cq+14*64+r5]
+ mova m7, [cq+ 1*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+15*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+11*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+ 7*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+ 3*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ mova m15, [o(pd_10240)]
+ psrld m14, 10 ; +2
+ psubd m13, m14, m8 ; +3
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m13}, m1, m3
+ REPX {paddd x, m15}, m4, m5, m6, m7
+ paddd m13, m15, m8 ; +10239
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_10240)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 14}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ mova m3, [o(pd_10240)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 14}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m11, [o(pw_m2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+32]
+ mova m1, [cq+1*64+32]
+ mova m2, [cq+2*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+0*64+ 0]
+ mova m5, [cq+1*64+ 0]
+ mova m6, [cq+2*64+48]
+ mova m7, [cq+3*64+48]
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [cq+2*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+1*64+16]
+ mova m4, [cq+2*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+1*64+48]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+ ; out0-7 are in rsp+gprsize+3-10*mmsize
+ ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+ call m(iadst_16x16_internal_16bpc).main
+%if ARCH_X86_64
+ mova m1, m0
+ mova m3, m2
+ mova m5, m4
+ mova m7, m6
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ pshufd m8, m7, q1032
+ pshufd m10, m5, q1032
+ pshufd m12, m3, q1032
+ pshufd m14, m1, q1032
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+11*16], m1
+ mova [r3+10*16], m3
+ mova [r3+ 9*16], m5
+ mova [r3+ 8*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_16x16_internal_16bpc).pass2
+
+INV_TXFM_16X16_FN identity, dct, h
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*64+r5]
+ pmulld m1, m15, [cq+ 1*64+r5]
+ pmulld m2, m15, [cq+ 2*64+r5]
+ pmulld m3, m15, [cq+ 3*64+r5]
+ pmulld m4, m15, [cq+ 4*64+r5]
+ pmulld m5, m15, [cq+ 5*64+r5]
+ pmulld m6, m15, [cq+ 6*64+r5]
+ pmulld m7, m15, [cq+ 7*64+r5]
+ pmulld m8, m15, [cq+ 8*64+r5]
+ pmulld m9, m15, [cq+ 9*64+r5]
+ pmulld m10, m15, [cq+10*64+r5]
+ pmulld m11, m15, [cq+11*64+r5]
+ pmulld m12, m15, [cq+12*64+r5]
+ pmulld m13, m15, [cq+13*64+r5]
+ pmulld m14, m15, [cq+14*64+r5]
+ pmulld m15, [cq+15*64+r5]
+ mova [r3], m15
+ mova m15, [o(pd_10240)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 0*64+r5]
+ pmulld m1, m7, [cq+ 1*64+r5]
+ pmulld m2, m7, [cq+ 2*64+r5]
+ pmulld m3, m7, [cq+ 3*64+r5]
+ pmulld m4, m7, [cq+ 4*64+r5]
+ pmulld m5, m7, [cq+ 5*64+r5]
+ pmulld m6, m7, [cq+ 6*64+r5]
+ pmulld m7, [cq+ 7*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+8*16], m0
+ mova [r3+9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 8*64+r5]
+ pmulld m1, m7, [cq+ 9*64+r5]
+ pmulld m2, m7, [cq+10*64+r5]
+ pmulld m3, m7, [cq+11*64+r5]
+ pmulld m4, m7, [cq+12*64+r5]
+ pmulld m5, m7, [cq+13*64+r5]
+ pmulld m6, m7, [cq+14*64+r5]
+ pmulld m7, [cq+15*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ mova m0, [cq+0*64+0]
+ mova m1, [cq+1*64+0]
+ mova m2, [cq+2*64+0]
+ mova m3, [cq+3*64+0]
+ call m(iidentity_8x16_internal_16bpc).main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ dec r5w
+ jg .pass2_loop
+ add cq, 64*3
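+ ; bit 16 of r5d tracks which 8-pixel column half of the 16x16 block
+ ; is being written: the left half first, then the right half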
+ btc r5d, 16
+ jc .end
+%if ARCH_X86_64
+ lea dstq, [r7+16]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ add dstq, 16
+%endif
+ add r5d, 4
+ jmp .pass2_loop
+.end:
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_5)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r5d, eobd
+ add eobb, 21
+ cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192
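+ ; i.e. round eob up to the next multiple of 64 (one 8x8 coefficient
+ ; group); if the byte addition wrapped, keep the original eob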
+ lea r4, [strideq*3]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {paddsw x, m5}, m0, m1, m2, m3
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ call .main_zero
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ btc eobd, 16
+ jnc .loop
+ sub eobd, 64
+ jge .loop
+ RET
+ALIGN function_align
+.main_zero:
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m4, m2, m1
+ punpcklwd m2, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r4 ]
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r4 ], m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_4096)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r4d, eobd
+ add eobb, 21
+ cmovc eobd, r4d
+ lea r4, [strideq*3]
+ mov r5, dstq
+.loop:
+ mova m0, [cq+32*0]
+ packssdw m0, [cq+32*1]
+ mova m1, [cq+32*2]
+ packssdw m1, [cq+32*3]
+ mova m2, [cq+32*4]
+ packssdw m2, [cq+32*5]
+ mova m3, [cq+32*6]
+ packssdw m3, [cq+32*7]
+ REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .loop
+ add cq, 32*8-32
+ add r5, 16
+ mov dstq, r5
+ sub eobd, 64
+ jge .loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m11, [o(pw_8192)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+%if ARCH_X86_64
+ paddw m10, m11, m11 ; pw_16384
+%endif
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m0
+ pmulhrsw m5, m1
+ mova m6, [o(pw_16384)]
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m2
+ pmulhrsw m5, m3
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m2, m4
+ paddsw m3, m5
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+%else
+ psrlw m6, 1 ; pw_8192
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m10, [o(pw_2048)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*1]
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*2]
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*3]
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1]
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3]
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5]
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+%endif
+ REPX {paddsw x, x }, m0, m1, m2, m3
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+%else
+ mova m6, [o(pw_1697x16)]
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+%endif
+ REPX {paddsw x, x }, m0, m1
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+%else
+ pmulhrsw m4, m6, m2
+ pmulhrsw m6, m3
+%endif
+ REPX {paddsw x, x }, m2, m3
+ paddsw m2, m4
+%if ARCH_X86_64
+ paddsw m3, m5
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+%else
+ paddsw m3, m6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%undef cmp
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_8192)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ lea r4, [strideq*3]
+ mov r5, dstq
+ call .main ; 0
+ cmp eobd, 36
+ jl .ret
+ add cq, 128*8-32 ; 0 1
+ lea dstq, [r5+16] ; 1
+ call .main
+ call .main2
+ cmp eobd, 136
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2
+ lea dstq, [r5+16*2] ; 1 2
+ call .main ; 2
+ call .main2
+ call .main2
+ cmp eobd, 300
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ add r5, 16*3 ; 1 2 3
+ mov dstq, r5 ; 2 3
+ call .main ; 3
+ call .main2
+ call .main2
+ call .main2
+ cmp eobd, 535
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4
+ call .main ; 3 4
+ call .main2
+ call .main2
+ cmp eobd, 755
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4 5
+ call .main ; 3 4 5
+ call .main2
+ cmp eobd, 911
+ jl .ret
+ add cq, 128*8-32 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+ RET
+ALIGN function_align
+.main2:
+ sub cq, 128*8
+ sub dstq, 16
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%define base $$
+ DECLARE_REG_TMP 0, 4
+%else
+ lea r6, [tbl_Nx32_odd_offset]
+%define base tbl_Nx32_odd_offset
+ DECLARE_REG_TMP 4, 7
+%if WIN64
+ mov [rsp+gprsize*1+35*16], r7
+%endif
+%endif
+%define o2(x) r6-base+x
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+35*16], r0
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
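+ ; each tbl_Nx32_odd_offset word packs two byte offsets: the low byte
+ ; (t1) and high byte (t0) give the scratch slots of the odd output rows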
+ mova [rsp+ 3*16+r5*8], m0
+ mova [rsp+11*16+r5*8], m0
+ mova [rsp+ 3*16+t0*8], m0
+ mova [rsp+ 3*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+35*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*128+r5*8]
+ mova m1, [cq+1*128+r5*8]
+ mova m2, [cq+2*128+r5*8]
+ mova m3, [cq+3*128+r5*8]
+ mova m4, [cq+4*128+r5*8]
+ mova m5, [cq+5*128+r5*8]
+ mova m6, [cq+6*128+r5*8]
+ mova m7, [cq+7*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ mova m1, [o(pd_2)]
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [r3+ 3*16+r5*8], m0
+ mova [r3+11*16+r5*8], m2
+ mova [r3+ 3*16+t1*8], m1
+ mova [r3+ 3*16+t0*8], m3
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass 2 code starts here
+ ; m0 is already loaded from the last iteration of the first pass
+%if ARCH_X86_32
+ mov r0, [rsp+gprsize*1+35*16]
+%endif
+ mov eobd, [rsp+gprsize*0+35*16]
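+ ; select the 8bpc idct_8x32 second-pass routine from eob:
+ ; veryfast for eob < 43, fast for eob < 107, full otherwise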
+ cmp eobd, 43
+ jl .load_veryfast
+ cmp eobd, 107
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+ call .pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+35*16]
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ mova m4, [rsp+gprsize+16* 7]
+ mova m5, [rsp+gprsize+16* 8]
+ mova m6, [rsp+gprsize+16* 9]
+ mova m7, [rsp+gprsize+16*10]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+ call r4
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+19*16]
+ mova m1, [rsp+gprsize+20*16]
+ mova m2, [rsp+gprsize+21*16]
+ mova m3, [rsp+gprsize+22*16]
+ mova m4, [rsp+gprsize+23*16]
+ mova m5, [rsp+gprsize+24*16]
+ mova m6, [rsp+gprsize+25*16]
+ mova m7, [rsp+gprsize+26*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+27*16]
+ mova m1, [rsp+gprsize+28*16]
+ mova m2, [rsp+gprsize+29*16]
+ mova m3, [rsp+gprsize+30*16]
+ mova m4, [rsp+gprsize+31*16]
+ mova m5, [rsp+gprsize+32*16]
+ mova m6, [rsp+gprsize+33*16]
+ mova m7, [rsp+gprsize+34*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ ret
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 640
+ sar r5d, 10
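+ ; release this function's extra stack space before tail-jumping
+ ; into the 8x8 dconly code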
+ add rsp, (31+2*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
+
+cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+76*16], r0
+%elif WIN64
+ mov [rsp+gprsize*1+76*16], r7
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m0
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m0
+ mova [rsp+44*16+t0*8], m0
+ mova [rsp+44*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+76*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+44*16+r5*8], m8
+ mova [rsp+52*16+r5*8], m10
+ mova [rsp+44*16+t1*8], m9
+ mova [rsp+44*16+t0*8], m11
+%else
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m2
+ mova [rsp+44*16+t1*8], m1
+ mova [rsp+44*16+t0*8], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ add rsp, 9*16
+%if ARCH_X86_64
+ mov r6, dstq
+%else
+ mov dstq, [rsp+gprsize*1+67*16]
+%endif
+ mov eobd, [rsp+gprsize*0+67*16]
+ cmp eobd, 44
+ jl .load_veryfast
+ cmp eobd, 151
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+67*16]
+ mov dword [r2+0*gprsize], 2
+%endif
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [rsp+16* 3]
+.loop_pass2_entry:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
+ add rsp, 32*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+3*16]
+%endif
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
+%assign stack_size (stack_size-73*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-73*16)
+%assign stack_offset (stack_offset-73*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+ RET
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add rsp, (65+4*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+
+cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ add r5d, r5d
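+ ; r5d = 2 if eob >= 10, so pass 1 runs two iterations; otherwise
+ ; r5d = 0 and a single iteration is enough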
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+ mova m0, [cq+32* 1+r5*8]
+ mova m1, [cq+32* 7+r5*8]
+ mova m2, [cq+32* 9+r5*8]
+ mova m3, [cq+32*15+r5*8]
+ mova m4, [cq+32*17+r5*8]
+ mova m5, [cq+32*23+r5*8]
+ mova m6, [cq+32*25+r5*8]
+ mova m7, [cq+32*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call .main_oddhalf_part1
+ mova m0, [cq+32* 3+r5*8]
+ mova m1, [cq+32* 5+r5*8]
+ mova m2, [cq+32*11+r5*8]
+ mova m3, [cq+32*13+r5*8]
+ mova m4, [cq+32*19+r5*8]
+ mova m5, [cq+32*21+r5*8]
+ mova m6, [cq+32*27+r5*8]
+ mova m7, [cq+32*29+r5*8]
+ call .main_oddhalf_part2
+ mova m0, [cq+32* 2+r5*8]
+ mova m1, [cq+32* 6+r5*8]
+ mova m2, [cq+32*10+r5*8]
+ mova m3, [cq+32*14+r5*8]
+ mova m4, [cq+32*18+r5*8]
+ mova m5, [cq+32*22+r5*8]
+ mova m6, [cq+32*26+r5*8]
+ mova m7, [cq+32*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+32* 0+r5*8]
+ mova m1, [cq+32* 4+r5*8]
+ mova m2, [cq+32* 8+r5*8]
+ mova m3, [cq+32*12+r5*8]
+ mova m4, [cq+32*16+r5*8]
+ mova m5, [cq+32*20+r5*8]
+ mova m6, [cq+32*24+r5*8]
+ mova m7, [cq+32*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 8+r5*8], m8
+ mova [cq+32* 9+r5*8], m9
+ mova [cq+32*10+r5*8], m10
+ mova [cq+32*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 4+r5*8], m8
+ mova [cq+32* 5+r5*8], m9
+ mova [cq+32* 6+r5*8], m10
+ mova [cq+32* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32*12+r5*8], m8
+ mova [cq+32*13+r5*8], m9
+ mova [cq+32*14+r5*8], m10
+ mova [cq+32*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 4+r5*8], m0
+ mova [cq+32* 5+r5*8], m1
+ mova [cq+32* 6+r5*8], m2
+ mova [cq+32* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 8+r5*8], m0
+ mova [cq+32* 9+r5*8], m1
+ mova [cq+32*10+r5*8], m2
+ mova [cq+32*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32*12+r5*8], m0
+ mova [cq+32*13+r5*8], m1
+ mova [cq+32*14+r5*8], m2
+ mova [cq+32*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32* 0+r5*8], m0
+ mova [cq+32* 1+r5*8], m1
+ mova [cq+32* 2+r5*8], m2
+ mova [cq+32* 3+r5*8], m3
+ sub r5d, 2
+ jmp .loop_pass1
+.end_pass1:
+
+ ; pass=2; this must be a call (not a jump) so that the stack pointer
+ ; has the offset the 8-bit code expects
+ mov r4d, 4
+ call m(idct_16x8_internal_16bpc).pass2_main
+ RET
+
+.main_oddhalf_part1_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_4091)]
+ pmulld m0, [o(pd_201)]
+ pmulld m4, m3, [o(pd_m2751)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_3035)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+3*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_3035)]
+%endif
+ pmulld m6, m1, [o(pd_m1380)]
+ pmulld m1, [o(pd_3857)]
+ pmulld m5, m2, [o(pd_3703)]
+ pmulld m2, [o(pd_1751)]
+ jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t17
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m7, m3 ; t30
+ paddd m7, m3 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m1 ; t28a
+ paddd m7, m1 ; t31a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m8, m2 ; t29
+ paddd m8, m2 ; t30
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+ mova [r3+16*6], m8
+ mova [r3+16*7], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
+ mova m4, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m2, [r3+0*16]
+ mova m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m6, [r3+2*16]
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7
+ REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7
+ psubd m3, m0, m4 ; t17
+ mova [r3+0*16], m3
+ mova m3, [r3+3*16]
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m3, m7 ; t30
+ paddd m7, m3 ; t31
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m3, m2 ; t29
+ paddd m3, m2 ; t30
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova [r3+0*16], m3
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m2 ; t28a
+ paddd m7, m2 ; t31a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+0*16]
+ mova [r3+0*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*6], m2
+ mova [r3+16*7], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+%endif
+ ret
+.main_oddhalf_part2_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_m601)]
+ pmulld m0, [o(pd_4052)]
+ pmulld m4, m3, [o(pd_3290)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_2440)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+11*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_2440)]
+%endif
+ pmulld m6, m1, [o(pd_3973)]
+ pmulld m1, [o(pd_995)]
+ pmulld m5, m2, [o(pd_m2106)]
+ pmulld m2, [o(pd_3513)]
+ jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t25
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m7, m3 ; t22
+ paddd m7, m3 ; t23
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m1 ; t20a
+ paddd m7, m1 ; t23a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m8, m2 ; t26
+ paddd m8, m2 ; t25
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
+ mova m9, [r3+16*0] ; t16a
+ mova m10, [r3+16*1] ; t17
+ psubd m2, m9, m7 ; t23
+ paddd m9, m7 ; t16
+ psubd m7, m10, m5 ; t22a
+ paddd m10, m5 ; t17a
+ REPX {pmaxsd x, m12}, m9, m10, m2, m7
+ REPX {pminsd x, m13}, m9, m10, m2, m7
+ mova [r3+16*0], m9
+ mova [r3+16*1], m10
+ mova m9, [r3+16*2] ; t18a
+ mova m10, [r3+16*3] ; t19
+ psubd m5, m9, m1 ; t21
+ paddd m9, m1 ; t18
+ psubd m1, m10, m6 ; t20a
+ paddd m10, m6 ; t19a
+ REPX {pmaxsd x, m12}, m9, m10, m5, m1
+ REPX {pminsd x, m13}, m9, m10, m5, m1
+ mova [r3+16*2], m9
+ mova [r3+16*3], m10
+ mova m9, [r3+16*4] ; t28
+ mova m10, [r3+16*5] ; t29a
+ psubd m6, m9, m3 ; t27a
+ paddd m9, m3 ; t28a
+ psubd m3, m10, m4 ; t26
+ paddd m10, m4 ; t29
+ REPX {pmaxsd x, m12}, m9, m10, m6, m3
+ REPX {pminsd x, m13}, m9, m10, m6, m3
+ REPX {pmulld x, m14}, m6, m3, m1, m5
+ paddd m6, m11
+ paddd m3, m11
+ psubd m4, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m5 ; t21a
+ paddd m3, m5 ; t26a
+ REPX {psrad x, 12 }, m4, m1, m3, m6
+ mova [r3+16*4], m4
+ mova [r3+16*5], m1
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m5, m4, m8 ; t25a
+ paddd m4, m8 ; t30a
+ psubd m8, m1, m0 ; t24
+ paddd m1, m0 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m1
+ REPX {pminsd x, m13}, m8, m5, m4, m1
+ REPX {pmulld x, m14}, m5, m8, m7, m2
+ paddd m5, m11
+ paddd m8, m11
+ psubd m0, m5, m7 ; t22
+ paddd m5, m7 ; t25
+ psubd m7, m8, m2 ; t23a
+ paddd m2, m8 ; t24a
+ REPX {psrad x, 12 }, m0, m7, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m7
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+ mova [r3+16*12], m9
+ mova [r3+16*13], m10
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+%else
+ mova [r3+ 8*16], m2
+ mova [r3+ 9*16], m3
+ mova [r3+10*16], m4
+ mova [r3+11*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a
+ mova m2, [r3+ 8*16]
+ mova m4, [r3+10*16]
+ mova m5, [r3+11*16]
+ mova [r3+ 8*16], m0
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [r3+ 9*16]
+ mova [r3+ 9*16], m1
+ ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
+ mova m0, [r3+ 8*16]
+ mova m1, [r3+ 9*16]
+ mova m6, [r3+10*16]
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6
+ REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6
+ psubd m3, m0, m4 ; t25
+ mova [r3+ 8*16], m3
+ mova m3, [r3+11*16]
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m3, m7 ; t22
+ paddd m7, m3 ; t23
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m1
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m3, m2 ; t26
+ paddd m3, m2 ; t25
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m6, [r3+10*16]
+ mova m7, [r3+11*16]
+ mova [r3+ 8*16], m3
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m2 ; t20a
+ paddd m7, m2 ; t23a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+14*16], m5
+ mova [r3+15*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20
+ mova [r3+10*16], m3
+ mova m0, [o(clip_18b_min)]
+ mova m2, [o(clip_18b_max)]
+ mova m5, [r3+16*2] ; t18a
+ mova m7, [r3+16*3] ; t19
+ psubd m3, m5, m1 ; t21
+ paddd m5, m1 ; t18
+ psubd m1, m7, m6 ; t20a
+ paddd m7, m6 ; t19a
+ REPX {pmaxsd x, m0}, m5, m7, m3, m1
+ REPX {pminsd x, m2}, m5, m7, m3, m1
+ mova [r3+16*2], m5
+ mova [r3+16*3], m7
+ mova [r3+11*16], m3
+ mova m3, [r3+10*16]
+ mova m5, [r3+16*4] ; t28
+ mova m7, [r3+16*5] ; t29a
+ psubd m6, m5, m3 ; t27a
+ paddd m5, m3 ; t28a
+ psubd m3, m7, m4 ; t26
+ paddd m7, m4 ; t29
+ REPX {pmaxsd x, m0}, m5, m7, m6, m3
+ REPX {pminsd x, m2}, m5, m7, m6, m3
+ mova [r3+16*12], m5
+ mova [r3+16*13], m7
+ mova m5, [o(pd_2048)]
+ mova m7, [o(pd_2896)]
+ mova m4, [r3+11*16]
+ REPX {pmulld x, m7}, m6, m3, m1, m4
+ paddd m6, m5
+ paddd m3, m5
+ psubd m5, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m4 ; t21a
+ paddd m3, m4 ; t26a
+ REPX {psrad x, 12}, m5, m1, m3, m6
+ mova [r3+16*4], m5
+ mova [r3+16*5], m1
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+
+ mova m5, [r3+14*16]
+ mova m6, [r3+15*16]
+ mova m3, [r3+16*0] ; t16a
+ mova m4, [r3+16*1] ; t17
+ psubd m1, m3, m6 ; t23
+ paddd m3, m6 ; t16
+ psubd m6, m4, m5 ; t22a
+ paddd m4, m5 ; t17a
+ REPX {pmaxsd x, m0}, m3, m4, m1, m6
+ REPX {pminsd x, m2}, m3, m4, m1, m6
+ mova [r3+16*0], m3
+ mova [r3+16*1], m4
+ mova m5, [r3+ 8*16]
+ mova m3, [r3+ 9*16]
+ mova [r3+ 8*16], m1
+ mova [r3+ 9*16], m6
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m6, m1, m5 ; t24
+ paddd m1, m5 ; t31
+ psubd m5, m4, m3 ; t25a
+ paddd m4, m3 ; t30a
+ REPX {pmaxsd x, m0}, m6, m5, m4, m1
+ REPX {pminsd x, m2}, m6, m5, m4, m1
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+ mova m4, [o(pd_2048)]
+ mova m1, [r3+ 9*16]
+ mova m2, [r3+ 8*16]
+ REPX {pmulld x, m7}, m5, m6, m1, m2
+ paddd m5, m4
+ paddd m6, m4
+ psubd m0, m5, m1 ; t22
+ paddd m5, m1 ; t25
+ psubd m1, m6, m2 ; t23a
+ paddd m2, m6 ; t24a
+ REPX {psrad x, 12}, m0, m1, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m1
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+%endif
+ ret
+
+ ; final sumsub for idct16 as well as idct32, plus final downshift
+%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift
+ mova m%4, [r3+16*(23-%1)]
+ pmaxsd m%1, m12
+ pminsd m%1, m13
+ psubd m%3, m%1, m%4 ; idct16 out15 - n
+ paddd m%1, m%4 ; idct16 out0 + n
+ pmaxsd m%1, m12
+ pmaxsd m%3, m12
+ pminsd m%1, m13
+ pminsd m%3, m13
+ paddd m%1, m11
+ paddd m%3, m11
+ mova m%5, [r3+16*( 0+%1)]
+ mova m%2, [r3+16*(15-%1)]
+ psubd m%4, m%1, m%2 ; out31 - n
+ paddd m%1, m%2 ; out0 + n
+ paddd m%2, m%3, m%5 ; out15 - n
+ psubd m%3, m%5 ; out16 + n
+ REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%endmacro
+
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ mova m1, [o(pd_2)]
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ pmaxsd m0, m2
+ pminsd m0, m3
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ paddd m0, m1
+ paddd m5, m1
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 2}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+.dconly_loop:
+ mova m1, [dstq+16*0]
+ mova m2, [dstq+16*1]
+ mova m3, [dstq+16*2]
+ mova m4, [dstq+16*3]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
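+    ; (tbl_32x16_2d gives an eob threshold per iteration: while eob is below
+    ; the threshold, that iteration is known to be all zero, so this scan
+    ; leaves r5d at the last iteration that still has nonzero coefficients)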
+%undef cmp
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64* 7+r5*8]
+ mova m2, [cq+64* 9+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ mova m4, [cq+64*17+r5*8]
+ mova m5, [cq+64*23+r5*8]
+ mova m6, [cq+64*25+r5*8]
+ mova m7, [cq+64*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64* 5+r5*8]
+ mova m2, [cq+64*11+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ mova m4, [cq+64*19+r5*8]
+ mova m5, [cq+64*21+r5*8]
+ mova m6, [cq+64*27+r5*8]
+ mova m7, [cq+64*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64* 6+r5*8]
+ mova m2, [cq+64*10+r5*8]
+ mova m3, [cq+64*14+r5*8]
+ mova m4, [cq+64*18+r5*8]
+ mova m5, [cq+64*22+r5*8]
+ mova m6, [cq+64*26+r5*8]
+ mova m7, [cq+64*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 4+r5*8]
+ mova m2, [cq+64* 8+r5*8]
+ mova m3, [cq+64*12+r5*8]
+ mova m4, [cq+64*16+r5*8]
+ mova m5, [cq+64*20+r5*8]
+ mova m6, [cq+64*24+r5*8]
+ mova m7, [cq+64*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 8+r5*8], m8
+ mova [cq+64* 9+r5*8], m9
+ mova [cq+64*10+r5*8], m10
+ mova [cq+64*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 4+r5*8], m8
+ mova [cq+64* 5+r5*8], m9
+ mova [cq+64* 6+r5*8], m10
+ mova [cq+64* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64*12+r5*8], m8
+ mova [cq+64*13+r5*8], m9
+ mova [cq+64*14+r5*8], m10
+ mova [cq+64*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 4+r5*8], m0
+ mova [cq+64* 5+r5*8], m1
+ mova [cq+64* 6+r5*8], m2
+ mova [cq+64* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 8+r5*8], m0
+ mova [cq+64* 9+r5*8], m1
+ mova [cq+64*10+r5*8], m2
+ mova [cq+64*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64*12+r5*8], m0
+ mova [cq+64*13+r5*8], m1
+ mova [cq+64*14+r5*8], m2
+ mova [cq+64*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [cq+64* 0+r5*8], m0
+ mova [cq+64* 1+r5*8], m1
+ mova [cq+64* 2+r5*8], m2
+ mova [cq+64* 3+r5*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ sub r5d, 2
+ jge .loop_pass1
+
+    ; pass=2: we need to call this, otherwise the stack pointer has
+    ; the wrong offset in the 8-bit code
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 4
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 11 ; pd_1
+ IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ pcmpeqd m1, m1 ; -1
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ psubd m0, m1
+ psubd m5, m1
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 1}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%if ARCH_X86_32
+ mov [rsp+5*32*16+1*gprsize], dstq
+%elif WIN64
+ mov [rsp+5*32*16+1*gprsize], r7
+%endif
+%undef cmp
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m0
+ mova [rsp+32*16+t0*8+0*32*16], m0
+ mova [rsp+32*16+t1*8+0*32*16], m0
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m0
+ mova [rsp+32*16+t0*8+1*32*16], m0
+ mova [rsp+32*16+t1*8+1*32*16], m0
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m0
+ mova [rsp+32*16+t0*8+2*32*16], m0
+ mova [rsp+32*16+t1*8+2*32*16], m0
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m0
+ mova [rsp+32*16+t0*8+3*32*16], m0
+ mova [rsp+32*16+t1*8+3*32*16], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+5*32*16], eobd
+.loop_pass1:
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+2*32*16], m8
+ mova [rsp+40*16+r5*8+2*32*16], m10
+ mova [rsp+32*16+t1*8+2*32*16], m9
+ mova [rsp+32*16+t0*8+2*32*16], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+1*32*16], m8
+ mova [rsp+40*16+r5*8+1*32*16], m10
+ mova [rsp+32*16+t1*8+1*32*16], m9
+ mova [rsp+32*16+t0*8+1*32*16], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+3*32*16], m8
+ mova [rsp+40*16+r5*8+3*32*16], m10
+ mova [rsp+32*16+t1*8+3*32*16], m9
+ mova [rsp+32*16+t0*8+3*32*16], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m2
+ mova [rsp+32*16+t1*8+1*32*16], m1
+ mova [rsp+32*16+t0*8+1*32*16], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m2
+ mova [rsp+32*16+t1*8+2*32*16], m1
+ mova [rsp+32*16+t0*8+2*32*16], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m2
+ mova [rsp+32*16+t1*8+3*32*16], m1
+ mova [rsp+32*16+t0*8+3*32*16], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m2
+ mova [rsp+32*16+t1*8+0*32*16], m1
+ mova [rsp+32*16+t0*8+0*32*16], m3
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+5*32*16]
+ add rsp, 29*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(4*32+3)*16]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
+
+cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
+ 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*2+12)*16], r0
+ mov [rsp+gprsize*2+(64*2+12)*16], r1
+ mov [rsp+gprsize*3+(64*2+12)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*2+12)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*2+12)*16], r7
+ mov [rsp+gprsize*3+(64*2+12)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+12*16+t2*8], m0
+ mova [rsp+12*16+t3*8], m0
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m0
+ mova [rsp+76*16+t2*8], m0
+ mova [rsp+76*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+(64*2+12)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*2+12)*16]
+ mov [rsp+gprsize*3+(64*2+12)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x16_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+76*16+t0*8], m8
+ mova [rsp+76*16+t1*8], m9
+ mova [rsp+76*16+t2*8], m10
+ mova [rsp+76*16+t3*8], m11
+%else
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m1
+ mova [rsp+76*16+t2*8], m2
+ mova [rsp+76*16+t3*8], m3
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m4, [rsp+10*16]
+ mova m6, [rsp+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t2*8], m2
+ mova [rsp+12*16+t3*8], m3
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*2+12)*16]
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ mov eobd, [rsp+gprsize*0+(64*2+12)*16]
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 9*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+(64*2+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 2
+%endif
+.loop_pass2:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call .pass2
+ add rsp, 64*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
+%assign stack_size (stack_size-(64*2+9)*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
+%assign stack_offset (stack_offset-(64*2+9)*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+%if ARCH_X86_64
+ mov r9, [rsp+gprsize*1+3*16]
+%if WIN64
+ mov r7, [rsp+gprsize*2+3*16]
+ mov r8, [rsp+gprsize*3+3*16]
+%endif
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [rsp+gprsize+16* 3]
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [rsp+gprsize+16*11]
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+%if ARCH_X86_64
+ call r8
+%else
+ call [r2+4*gprsize]
+%endif
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+10*16], m7
+%if ARCH_X86_64
+ call r9
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%else
+ call [r2+5*gprsize]
+%endif
+ lea r3, [strideq*3]
+ lea r4, [rsp+gprsize+ 3*16]
+%if ARCH_X86_64
+ mov r6d, 8
+%else
+ mov dword [r2+2*gprsize], 8
+%endif
+.loop_write:
+ mova m0, [r4+0*16]
+ mova m1, [r4+1*16]
+ mova m2, [r4+2*16]
+ mova m3, [r4+3*16]
+ mova m4, [r4+4*16]
+ mova m5, [r4+5*16]
+ mova m6, [r4+6*16]
+ mova m7, [r4+7*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ add r4, 8*16
+%if ARCH_X86_64
+ dec r6d
+%else
+ dec dword [r2+2*gprsize]
+%endif
+ jg .loop_write
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
+ 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*4+32)*16], r0
+ mov [rsp+gprsize*2+(64*4+32)*16], r1
+ mov [rsp+gprsize*3+(64*4+32)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*4+32)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*4+32)*16], r7
+ mov [rsp+gprsize*3+(64*4+32)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m0
+ mova [rsp+ 32*16+t2*8], m0
+ mova [rsp+ 32*16+t3*8], m0
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m0
+ mova [rsp+ 96*16+t2*8], m0
+ mova [rsp+ 96*16+t3*8], m0
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m0
+ mova [rsp+160*16+t2*8], m0
+ mova [rsp+160*16+t3*8], m0
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m0
+ mova [rsp+224*16+t2*8], m0
+ mova [rsp+224*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+(64*4+32)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*4+32)*16]
+ mov [rsp+gprsize*3+(64*4+32)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
+
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+160*16+t0*8], m8
+ mova [rsp+160*16+t1*8], m9
+ mova [rsp+160*16+t2*8], m10
+ mova [rsp+160*16+t3*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+ 96*16+t0*8], m8
+ mova [rsp+ 96*16+t1*8], m9
+ mova [rsp+ 96*16+t2*8], m10
+ mova [rsp+ 96*16+t3*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+224*16+t0*8], m8
+ mova [rsp+224*16+t1*8], m9
+ mova [rsp+224*16+t2*8], m10
+ mova [rsp+224*16+t3*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m1
+ mova [rsp+ 96*16+t2*8], m2
+ mova [rsp+ 96*16+t3*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m1
+ mova [rsp+160*16+t2*8], m2
+ mova [rsp+160*16+t3*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m1
+ mova [rsp+224*16+t2*8], m2
+ mova [rsp+224*16+t3*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m1
+ mova [rsp+ 32*16+t2*8], m2
+ mova [rsp+ 32*16+t3*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*4+32)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ mov eobd, [rsp+gprsize*0+(64*4+32)*16]
+ cmp eobd, 136
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 29*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(64*4+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64*31+r5*8]
+ mova m2, [cq+64*17+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 7+r5*8]
+ mova m1, [cq+64*25+r5*8]
+ mova m2, [cq+64*23+r5*8]
+ mova m3, [cq+64* 9+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 5+r5*8]
+ mova m1, [cq+64*27+r5*8]
+ mova m2, [cq+64*21+r5*8]
+ mova m3, [cq+64*11+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64*29+r5*8]
+ mova m2, [cq+64*19+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ call .main_part1
+ call .main_part2
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64*14+r5*8]
+ mova m2, [cq+64*18+r5*8]
+ mova m3, [cq+64*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+64* 6+r5*8]
+ mova m1, [cq+64*10+r5*8]
+ mova m2, [cq+64*22+r5*8]
+ mova m3, [cq+64*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 4+r5*8]
+ mova m1, [cq+64*12+r5*8]
+ mova m2, [cq+64*20+r5*8]
+ mova m3, [cq+64*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 8+r5*8]
+ mova m2, [cq+64*16+r5*8]
+ mova m3, [cq+64*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call .main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea r4, [cq+r5*8+64*28]
+ call .shift_transpose
+ sub r5d, 2
+ jge .loop_pass1
+
+    ; pass=2: we need to call this, otherwise the stack pointer has
+    ; the wrong offset in the 8-bit code
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 8
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+%if ARCH_X86_64
+ movd m7, [r4+4*0]
+ movd m8, [r4+4*1]
+ movd m6, [r4+4*2]
+ movd m9, [r4+4*3]
+ movd m5, [r4+4*4]
+ movd m10, [r4+4*5]
+ movd m4, [r4+4*6]
+ movd m15, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
+ pmulld m7, m0 ; t63a
+ pmulld m0, m8 ; t32a
+ pmulld m6, m1 ; t62a
+ pmulld m1, m9 ; t33a
+ pmulld m5, m2 ; t61a
+ pmulld m2, m10 ; t34a
+ pmulld m4, m3 ; t60a
+ pmulld m3, m15 ; t35a
+ movd m10, [r4+4*8]
+ movd m15, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m10, m15
+ REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+ REPX {pmaxsd x, m12}, m8, m1, m6, m2
+ REPX {pminsd x, m13}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+ REPX {pmaxsd x, m12}, m0, m3, m7, m4
+ REPX {pminsd x, m13}, m0, m3, m7, m4
+ movd m10, [r4+4*10]
+ movd m15, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m10, m15
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m12}, m5, m3, m4, m6
+ REPX {pminsd x, m13}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+ REPX {pmaxsd x, m12}, m0, m7, m1, m8
+ REPX {pminsd x, m13}, m0, m7, m1, m8
+ add r4, 4*12
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m8
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+%else
+ movd m7, [r4+4*0]
+ movd m6, [r4+4*2]
+ movd m5, [r4+4*4]
+ movd m4, [r4+4*6]
+ REPX {pshufd x, x, q0000}, m7, m6, m5, m4
+ pmulld m7, m0 ; t63a
+ pmulld m6, m1 ; t62a
+ pmulld m5, m2 ; t61a
+ pmulld m4, m3 ; t60a
+ mova [r3+0*16], m6
+ mova [r3+1*16], m7
+ movd m6, [r4+4*1]
+ movd m7, [r4+4*3]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m0, m6 ; t32a
+ pmulld m1, m7 ; t33a
+ movd m6, [r4+4*5]
+ movd m7, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m2, m6 ; t34a
+ pmulld m3, m7 ; t35a
+ mova m6, [r3+0*16]
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3+1*16]
+ REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
+ mova [r3+0*16], m5
+ psubd m5, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ mova [r3+1*16], m0
+ mova m0, [r3+0*16]
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m0 ; t61
+ paddd m4, m0 ; t60
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pmaxsd m0, [r3+1*16]
+ mova [r3+0*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pminsd m0, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ movd m3, [r4+4*8]
+ movd m4, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m3, m4
+ mova [r3+4*16], m2
+ ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a
+ mova m2, [r3+4*16]
+ mova [r3+4*16], m5
+ ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
+ mova m0, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ mova [r3+0*16], m5
+ mova m5, [r3+4*16]
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m5, m2 ; t61
+ paddd m2, m5 ; t62
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pminsd m5, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m2
+ mova [r3+16*2], m4
+ mova m7, [o(pd_2048)]
+ movd m0, [r4+4*10]
+ movd m1, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m0, m1
+ ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+ mova m4, [r3+2*16]
+ ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
+ add r4, 4*12
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+%endif
+ add r3, 16*8
+ ret
+
+.main_part2: ; idct64 steps 6-9
+ lea r4, [r3+16*7]
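+    ; r3 walks upwards and r4 walks downwards from r3+16*7, so each pass of
+    ; the loop below combines one low coefficient group with its mirrored
+    ; high group; the loop stops once the two pointers cross (4 iterations)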
+%if ARCH_X86_64
+ mova m10, [o(pd_1567)]
+ mova m15, [o(pd_3784)]
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ mova m7, [r3-16* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m12}, m8, m1, m3, m4
+ REPX {pminsd x, m13}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+ REPX {pmaxsd x, m12}, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m12}, m6, m7, m5, m4
+ REPX {pminsd x, m13}, m6, m7, m5, m4
+ REPX {pmulld x, m14}, m6, m7, m5, m4
+ REPX {pmaxsd x, m12}, m2, m0, m8, m1
+ REPX {pminsd x, m13}, m2, m0, m8, m1
+ paddd m6, m11
+ paddd m5, m11
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m8
+ mova [r4-16*32], m1
+ mova [r4-16*24], m3
+ mova [r3-16*16], m6
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%else
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ psubd m7, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ mova [r3+0*16], m7
+ mova m7, [r3-16* 8] ; t48a
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pmaxsd m6, [r3+0*16]
+ mova [r3+0*16], m6
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pminsd m6, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a
+ mova m2, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m5, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ mova [r3+1*16], m5
+ mova m0, [r3+0*16]
+ mova m5, [r3+2*16]
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m6, m4 ; t55
+ paddd m6, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pmaxsd m3, [r3+1*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pminsd m3, [r3+0*16]
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m6
+ mova [r4-16*32], m1
+ mova m0, [o(pd_2896)]
+ mova m1, [o(pd_2048)]
+ REPX {pmulld x, m0}, m3, m7, m5, m4
+ REPX {paddd x, m1}, m3, m5
+ psubd m6, m3, m7 ; t47
+ paddd m3, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m6, m3, m7, m5
+ mova [r4-16*24], m6
+ mova [r3-16*16], m3
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%endif
+ add r3, 16
+ sub r4, 16
+ cmp r3, r4
+ jl .main_part2_loop
+ sub r3, 4*16
+ ret
+
+.main_end_loop:
+ mova m0, [r3+16*28] ; idct8 0 + n
+.main_end_loop_start:
+ mova m2, [r3+16*12] ; idct32 16 + n
+ mova m3, [r4+16*12] ; idct32 31 - n
+%if ARCH_X86_64
+ mova m1, [r4+16*28] ; idct16 15 - n
+ mova m4, [r4-16* 4] ; idct64 63 - n
+ mova m5, [r3-16* 4] ; idct64 48 + n
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m7, [r3-16*20] ; idct64 32 + n
+ pmaxsd m0, m12
+ pminsd m0, m13
+ paddd m8, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m12}, m8, m0
+ REPX {pminsd x, m13}, m8, m0
+ paddd m1, m8, m3 ; idct32 out0 + n
+ psubd m8, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m12}, m1, m8, m3, m0
+ REPX {pminsd x, m13}, m1, m3, m8, m0
+ REPX {paddd x, m15}, m1, m3, m0, m8
+ paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+ psubd m1, m4 ; idct64 out63 - n (unshifted)
+ paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+ psubd m3, m5 ; idct64 out48 + n (unshifted)
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+ psubd m8, m7 ; idct64 out32 + n (unshifted)
+ mova [r3-16*20], m2
+ mova [r4+16*28], m1
+ mova [r4-16*20], m4
+ mova [r3+16*28], m3
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m8
+%else
+ mova m5, [o(clip_18b_min)]
+ mova m6, [o(clip_18b_max)]
+ mova m1, [r3+16*44] ; idct16 15 - n
+ pmaxsd m0, m5
+ pminsd m0, m6
+ paddd m4, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m5}, m4, m0
+ REPX {pminsd x, m6}, m4, m0
+ paddd m1, m4, m3 ; idct32 out0 + n
+ psubd m4, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m5}, m1, m4, m3, m0
+ REPX {pminsd x, m6}, m1, m3, m4, m0
+ REPX {paddd x, m7}, m1, m3, m0, m4
+ mova m5, [r4-16* 4] ; idct64 63 - n
+ mova m6, [r3-16* 4] ; idct64 48 + n
+ paddd m2, m1, m5 ; idct64 out0 + n (unshifted)
+ psubd m1, m5 ; idct64 out63 - n (unshifted)
+ paddd m5, m3, m6 ; idct64 out15 - n (unshifted)
+ psubd m3, m6 ; idct64 out48 + n (unshifted)
+ mova [r4+16*28], m1
+ mova [r3+16*28], m3
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m1, [r3-16*20] ; idct64 32 + n
+ mova [r3-16*20], m2
+ mova [r4-16*20], m5
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m4, m1 ; idct64 out31 - n (unshifted)
+ psubd m4, m1 ; idct64 out32 + n (unshifted)
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m4
+%endif
+ sub r4, 16
+ add r3, 16
+ cmp r3, r4
+ jl .main_end_loop
+ ret
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r4+0*64], m0
+ mova [r4+1*64], m1
+ mova [r4+2*64], m2
+ mova [r4+3*64], m3
+ sub r4, 4*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
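+    ; each row is 64 pixels (128 bytes at 16bpc) but one loop iteration only
+    ; stores 64 bytes; btc toggles bit 16 of r3d as a "which half of the row"
+    ; flag (CF = old bit value), so two iterations cover one row before the
+    ; lea steps dstq back by 128 bytes and down one stride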
+.dconly_loop:
+ paddw m1, m0, [dstq+16*0]
+ paddw m2, m0, [dstq+16*1]
+ paddw m3, m0, [dstq+16*2]
+ paddw m4, m0, [dstq+16*3]
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, 64
+ btc r3d, 16
+ jnc .dconly_loop
+ lea dstq, [dstq+strideq-128]
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
+ 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 4, 1
+ mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
+ mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
+%else
+ DECLARE_REG_TMP 4, 7, 8
+%if WIN64
+ mov [rsp+(8*32+64+1)*16+1*gprsize], r7
+ mov [rsp+64*16+0*gprsize], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ lea t2, [rsp+7*32*16]
+.zero_loop_inner:
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
+ sub t2, 32*16
+ cmp t2, rsp
+ jge .zero_loop_inner
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call .rect2_mul_fast
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call .rect2_mul_fast
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 11 ; pd_1
+%else
+ mova m7, [o(pd_1)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(8*32+64+8)*16]
+%elif WIN64
+ mov r8, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(8*32+3)*16]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.rect2_mul_fast:
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3
+ REPX {paddd x, m11}, m0, m1, m2, m3
+%else
+ mova m4, [o(pd_2896)]
+ mova m5, [o(pd_2048)]
+ REPX {pmulld x, m4 }, m0, m1, m2, m3
+ REPX {paddd x, m5 }, m0, m1, m2, m3
+%endif
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t2+0*16+r5*8], m0
+ mova [t2+8*16+r5*8], m2
+ mova [t2+0*16+t0*8], m3
+ mova [t2+0*16+t1*8], m1
+ sub t2, 16*32
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (1+8*32+1*WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
+ 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+ mov [rsp+gprsize*1+(64*9+8)*16], r0
+ mov [rsp+gprsize*2+(64*9+8)*16], r1
+ mov [rsp+gprsize*3+(64*9+8)*16], r2
+ mov [rsp+gprsize*4+(64*9+8)*16], r6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+ mov [rsp+gprsize*1+(64*9+1)*16], r9
+ mov [rsp+gprsize*0+64*16], r0
+%if WIN64
+ mov [rsp+gprsize*2+(64*9+1)*16], r7
+ mov [rsp+gprsize*3+(64*9+1)*16], r8
+%endif
+%endif
+%undef cmp
+
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16]
+.zero_loop_inner:
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
+ sub t4, 64*16
+ cmp t4, rsp
+ jge .zero_loop_inner
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(9*64+8)*16]
+%else
+ mov r0, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(64*8+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+ ; copy of pass=1 tmp-regs
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+%endif
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t4+t0*8], m0
+ mova [t4+t1*8], m1
+ mova [t4+t2*8], m2
+ mova [t4+t3*8], m3
+ sub t4, 16*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
+ (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1
diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm
new file mode 100644
index 0000000000..a67f053a61
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx2.asm
@@ -0,0 +1,5542 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+; Note: The order of (at least some of) these constants matters!
+
+const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%macro COEF_PAIR 2
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%endmacro
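+; e.g. "COEF_PAIR 3784, 1567" expands to:
+;   pw_3784_1567:  dw  3784, 1567
+;   pw_m1567_3784: dw -1567, 3784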
+
+; ADST-only
+pw_3803_1321: dw 3803, 1321
+pw_m1321_2482: dw -1321, 2482
+pw_2482_3344: dw 2482, 3344
+pw_m3344_3344: dw -3344, 3344
+pw_m3803_3344: dw -3803, 3344
+pw_m3803_m6688: dw -3803, -6688
+pw_2896_m2896: dw 2896, -2896
+
+const pw_5, times 2 dw 5
+const pw_2048, times 2 dw 2048
+const pw_4096, times 2 dw 4096
+const pw_8192, times 2 dw 8192
+const pw_16384, times 2 dw 16384
+const pw_1697x16, times 2 dw 1697*16
+const pw_1697x8, times 2 dw 1697*8
+const pw_2896x8, times 2 dw 2896*8
+const pd_2048, dd 2048
+
+const pw_2896_2896, dw 2896, 2896
+const pw_m2896_2896, dw -2896, 2896
+const pw_1567_3784, dw 1567, 3784
+const pw_m3784_1567, dw -3784, 1567
+COEF_PAIR 3784, 1567
+COEF_PAIR 201, 4091
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4052, 601
+COEF_PAIR 401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 799, 4017
+COEF_PAIR 3406, 2276
+pw_m799_m4017: dw -799, -4017
+const pw_m1567_m3784, dw -1567, -3784
+pw_m3406_m2276: dw -3406, -2276
+pw_m401_m4076: dw -401, -4076
+pw_m3166_m2598: dw -3166, -2598
+pw_m1931_m3612: dw -1931, -3612
+pw_m3920_m1189: dw -3920, -1189
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017, 799
+
+%macro COEF_X8 1-*
+%rep %0
+ dw %1*8, %1*8
+ %rotate 1
+%endrep
+%endmacro
+
+pw_3703x8: COEF_X8 3703
+pw_1751x8: COEF_X8 1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8: COEF_X8 3857
+pw_3973x8: COEF_X8 3973
+pw_995x8: COEF_X8 995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8: COEF_X8 3513
+pw_3290x8: COEF_X8 3290
+pw_2440x8: COEF_X8 2440
+pw_m601x8: COEF_X8 -601
+pw_4052x8: COEF_X8 4052
+
+const idct64_mul
+COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520
+COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092
+COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842
+COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301
+
+pw_201_4091x8: dw 201*8, 4091*8
+pw_m601_4052x8: dw -601*8, 4052*8
+pw_995_3973x8: dw 995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+%define o_idct64_offset idct64_mul - (o_base) - 8
+
+SECTION .text
+
+; Code size reduction trickery: Instead of using rip-relative loads with
+; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
+; single rip-relative lea and then address things relative from that with
+; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
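+; For reference (explanatory note): each entry point below sets this up with
+; "lea r6, [o_base]", after which a load such as
+;     vpbroadcastd m0, [o(pw_2048)]
+; becomes [r6 + (pw_2048 - deint_shuf - 128)], whose displacement fits in a
+; single signed byte as long as the constant lies within +-128 bytes of o_base.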
+%define o_base deint_shuf + 128
+%define o(x) (r6 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+%if %7 & 4
+ pmaddwd m%2, m%5, m%1
+ pmaddwd m%1, m%6
+%else
+%if %7 & 1
+ vpbroadcastd m%2, [o(pw_%5_%6)]
+ vpbroadcastd m%3, [o(pw_m%6_%5)]
+%else
+ vpbroadcastd m%2, [o(pw_m%6_%5)]
+ vpbroadcastd m%3, [o(pw_%5_%6)]
+%endif
+ pmaddwd m%2, m%1
+ pmaddwd m%1, m%3
+%endif
+ paddd m%2, m%4
+ paddd m%1, m%4
+%if %7 & 2
+ pslld m%2, 4
+ psrld m%1, 12
+ pblendw m%1, m%2, 0xaa
+%else
+ psrad m%2, 12
+ psrad m%1, 12
+ packssdw m%1, m%2
+%endif
+%endmacro
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
+%if %10 & 1
+ vpbroadcastd m%3, [o(pw_%8_%9)]
+ vpbroadcastd m%4, [o(pw_m%9_%8)]
+ vpbroadcastd xm%2, [o(pw_%6_%7)]
+ vpblendd m%2, m%3, 0xf0
+ vpbroadcastd xm%3, [o(pw_m%7_%6)]
+%else
+ vpbroadcastd m%3, [o(pw_m%9_%8)]
+ vpbroadcastd m%4, [o(pw_%8_%9)]
+ vpbroadcastd xm%2, [o(pw_m%7_%6)]
+ vpblendd m%2, m%3, 0xf0
+ vpbroadcastd xm%3, [o(pw_%6_%7)]
+%endif
+ vpblendd m%3, m%4, 0xf0
+ ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10)
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+ punpckhwd m%3, m%2, m%1
+ punpcklwd m%2, m%1
+%if %7 < 32
+ pmaddwd m%1, m%7, m%2
+ pmaddwd m%4, m%7, m%3
+%else
+ vpbroadcastd m%1, [o(pw_m%7_%6)]
+ pmaddwd m%4, m%3, m%1
+ pmaddwd m%1, m%2
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 12
+ psrad m%1, 12
+ packssdw m%1, m%4
+%if %7 < 32
+ pmaddwd m%3, m%6
+ pmaddwd m%2, m%6
+%else
+ vpbroadcastd m%4, [o(pw_%6_%7)]
+ pmaddwd m%3, m%4
+ pmaddwd m%2, m%4
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 12
+ psrad m%2, 12
+%if %0 == 8
+ packssdw m%8, m%2, m%3
+%else
+ packssdw m%2, m%3
+%endif
+%endmacro
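+
+; Note (explanatory): the coefficient pairs used above are the AV1 cosine/sine
+; constants rounded to 12 bits (e.g. 2896 ~= 4096/sqrt(2), and 1567/3784 are
+; ~4096*cos/sin of the corresponding butterfly angle), so ITX_MULSUB_2W is
+; effectively a rotation followed by a rounded >>12.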
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
+ ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
+ psubsw m%3, m%1, m%2
+ paddsw m%2, m%1
+ paddsw m%1, m%4, m%5
+ psubsw m%4, m%5
+%endmacro
+
+%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
+ ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a
+ ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
+ paddsw m%9, m%2, m%6 ; t4
+ psubsw m%2, m%6 ; t5a
+ paddsw m%10, m%8, m%4 ; t7
+ psubsw m%8, m%4 ; t6a
+ ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
+ ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
+ psubsw m%6, m%1, m%3 ; dct4 out2
+ paddsw m%3, m%1 ; dct4 out1
+ paddsw m%1, m%5, m%7 ; dct4 out0
+ psubsw m%5, m%7 ; dct4 out3
+ psubsw m%7, m%3, m%2 ; out6
+ paddsw m%2, m%3 ; out1
+ paddsw m%3, m%6, m%8 ; out2
+ psubsw m%6, m%8 ; out5
+ psubsw m%8, m%1, m%10 ; out7
+ paddsw m%1, m%10 ; out0
+ paddsw m%4, m%5, m%9 ; out3
+ psubsw m%5, m%9 ; out4
+%endmacro
+
+; in1 = %1, in3 = %2, in5 = %3, in7 = %4
+; in9 = %5, in11 = %6, in13 = %7, in15 = %8
+%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
+ psubsw m%9, m%2, m%6 ; t13
+ paddsw m%6, m%2 ; t12
+ psubsw m%2, m%8, m%4 ; t14
+ paddsw m%8, m%4 ; t15
+ psubsw m%4, m%7, m%3 ; t10
+ paddsw m%3, m%7 ; t11
+ psubsw m%7, m%1, m%5 ; t9
+ paddsw m%1, m%5 ; t8
+ ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
+ ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
+ psubsw m%5, m%1, m%3 ; t11a
+ paddsw m%1, m%3 ; t8a
+ psubsw m%3, m%7, m%4 ; t13
+ paddsw m%7, m%4 ; t14
+ psubsw m%4, m%8, m%6 ; t12a
+ paddsw m%8, m%6 ; t15a
+ psubsw m%6, m%2, m%9 ; t10
+ paddsw m%2, m%9 ; t9
+ ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
+ ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12
+%endmacro
+
+%macro WRAP_XMM 1+
+ INIT_XMM cpuname
+ %1
+ INIT_YMM cpuname
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ vpbroadcastd m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+ movd m2, [%%row_adr1]
+ pinsrd m2, [%%row_adr2], 1
+ movd m3, [%%row_adr3]
+ pinsrd m3, [%%row_adr4], 1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [%%row_adr1], m0
+ pextrd [%%row_adr2], m0, 1
+ pextrd [%%row_adr3], m0, 2
+ pextrd [%%row_adr4], m0, 3
+ ret
+%endmacro
+
+%macro IWHT4_1D_PACKED 0
+ punpckhqdq m3, m0, m1 ; in1 in3
+ punpcklqdq m0, m1 ; in0 in2
+ psubw m2, m0, m3
+ paddw m0, m3
+ punpckhqdq m2, m2 ; t2 t2
+ punpcklqdq m0, m0 ; t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1
+ psubw m1, m3 ; t1 t3
+ psubw m0, m1 ; ____ out0
+ paddw m2, m1 ; out3 ____
+%endmacro
+
+INIT_XMM avx2
+cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+ psraw m0, 2
+ psraw m1, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1
+ punpcklwd m3, m1, m2
+ punpckhdq m1, m0, m3
+ punpckldq m0, m3
+ IWHT4_1D_PACKED
+ vpblendd m0, m2, 0x03
+ ITX4_END 3, 0, 2, 1, 0
+
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%3_internal_8bpc)
+ lea r6, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
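+    ; (explanatory note: the times count evaluates to 1 when %%p1 lies
+    ;  further down the file and to 0 when it directly follows %%end, so the
+    ;  jmp is only assembled when we can't simply fall through)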
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [cq], eobd ; 0
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0
+ vpbroadcastd m4, [o(pd_2048)]
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+ punpcklwd m2, m1, m0
+ punpckhwd m3, m1, m0
+ vpbroadcastd m5, [o(pw_m3344_3344)]
+ vpbroadcastd m0, [o(pw_3803_1321)]
+ vpbroadcastd m4, [o(pw_m1321_2482)]
+ pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2
+ psrld m5, 16
+ pmaddwd m0, m2
+ pmaddwd m2, m4
+ pmaddwd m5, m3 ; 3344*in0
+ paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
+ vpbroadcastd m4, [o(pw_2482_3344)]
+ vpbroadcastd m5, [o(pw_m3803_3344)]
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
+ vpbroadcastd m0, [o(pw_m3803_m6688)]
+ pmaddwd m3, m0
+ vpbroadcastd m0, [o(pd_2048)]
+ paddd m2, m0
+ paddd m1, m0
+ paddd m0, m4
+ paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
+ paddd m2, m4
+ paddd m2, m3
+ REPX {psrad x, 12}, m1, m2, m0, m5
+ packssdw m0, m5 ; out0 out1
+ packssdw m1, m2 ; out2 out3
+%endmacro
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ IDCT4_1D_PACKED
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+.end2:
+ ITX4_END 0, 1, 2, 3
+ALIGN function_align
+cglobal_label .main
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ pxor m2, m2
+ mova [cq+16*0], m2
+ mova [cq+16*1], m2
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro WRITE_4X8 2 ; coefs[1-2]
+ movd xm4, [dstq+strideq*0]
+ pinsrd xm4, [dstq+strideq*1], 1
+ movd xm5, [dstq+strideq*2]
+ pinsrd xm5, [dstq+r3 ], 1
+ pinsrd xm4, [r2 +strideq*0], 2
+ pinsrd xm4, [r2 +strideq*1], 3
+ pinsrd xm5, [r2 +strideq*2], 2
+ pinsrd xm5, [r2 +r3 ], 3
+ pmovzxbw m4, xm4
+ pmovzxbw m5, xm5
+ paddw m4, m%1
+ paddw m5, m%2
+ packuswb m4, m5
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+strideq*2], xm4, 2
+ pextrd [dstq+r3 ], xm4, 3
+ movd [r2 +strideq*0], xm5
+ pextrd [r2 +strideq*1], xm5, 1
+ pextrd [r2 +strideq*2], xm5, 2
+ pextrd [r2 +r3 ], xm5, 3
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_2048)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ jmp m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+ vpbroadcastd m6, [o(pd_2048)]
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in6 in2
+ punpcklwd m2, m0 ; in4 in0
+ ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
+ psubsw m0, m5, m4 ; t5a t6a (interleaved)
+ paddsw m4, m5 ; t4 t7 (interleaved)
+ ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
+ vpbroadcastd m1, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5
+%if mmsize > 16
+ vbroadcasti128 m1, [o(deint_shuf)]
+ pshufb m4, m1
+%else
+ pshufb m4, [o(deint_shuf)]
+%endif
+ psubsw m1, m2, m3 ; tmp3 tmp2
+ paddsw m3, m2 ; tmp0 tmp1
+ shufps m2, m4, m0, q1032 ; t7 t6
+ vpblendd m4, m0, 0xcc ; t4 t5
+ paddsw m0, m3, m2 ; out0 out1
+ psubsw m3, m2 ; out7 out6
+ psubsw m2, m1, m4 ; out4 out5
+ paddsw m1, m4 ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 1 ; pass
+ vpbroadcastd m6, [o(pd_2048)]
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti128 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ pshuflw m1, m1, q2301
+ pshufhw m1, m1, q2301
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ pmaddwd m2, m5, m3
+ pmaddwd m5, m1
+ paddd m2, m6
+ paddd m5, m6
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ vpbroadcastd m5, [o(pw_2896_2896)]
+ pmaddwd m3, m5
+ pmaddwd m1, m5
+ paddd m3, m6
+ paddd m1, m6
+ psrad m3, 12
+ psrad m1, 12
+ packssdw m1, m3 ; out2 -out3
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+%else
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ shufps m2, m5, m4, q1032
+ punpckhwd m4, m2
+ punpcklwd m5, m2
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ vpbroadcastd m5, [o(pw_2896x8)]
+ vpblendd m3, m0, m4, 0x33 ; out6 -out7
+ vpblendd m0, m4, 0xcc ; out0 -out1
+ shufps m4, m2, m1, q1032 ; t3 t7
+ vpblendd m1, m2, 0x33 ; t2 t6
+ psubsw m2, m1, m4 ; t2-t3 t6-t7
+ paddsw m1, m4 ; t2+t3 t6+t7
+ pmulhrsw m2, m5 ; out4 -out5
+ pshufd m1, m1, q1032
+ pmulhrsw m1, m5 ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx2
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ IDCT4_1D_PACKED
+ vbroadcasti128 m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ WRAP_XMM IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call .main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ pxor m5, m5
+ psubw m5, m4
+.end:
+ vpblendd m4, m5, 0xcc
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ WIN64_RESTORE_XMM
+ pxor m2, m2
+ mova [cq+32*0], m2
+ mova [cq+32*1], m2
+.end3:
+ lea r2, [dstq+strideq*4]
+ lea r3, [strideq*3]
+ WRITE_4X8 0, 1
+ RET
+ALIGN function_align
+.main_pass1:
+ WRAP_XMM IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m3
+ punpckhwd m1, m3
+ jmp tx2q
+.pass2:
+ vextracti128 xm2, m0, 1
+ vextracti128 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call m(iadst_4x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
+ pxor m4, m4
+ psubw m4, m5
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m2, [cq+32*0], q3120
+ vpermq m0, [cq+32*1], q3120
+ vpbroadcastd m3, [o(pw_2896x8)]
+ vpbroadcastd m4, [o(pw_1697x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ pmulhrsw m2, m4, m0
+ pmulhrsw m4, m1
+ paddsw m0, m2
+ paddsw m1, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ movd xm3, [o(pw_2048)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm2
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm3
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp m(iadst_4x16_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+ ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m0, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
+ vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784
+ ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
+ ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a
+ psubsw m4, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ psubsw m1, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+ paddsw m3, m2, m0 ; t9 t14
+ psubsw m2, m0 ; t10 t13
+%if mmsize > 16
+ vbroadcasti128 m0, [o(deint_shuf)]
+%else
+ mova m0, [o(deint_shuf)]
+%endif
+ pshufb m8, m0
+ pshufb m7, m0
+ pshufb m3, m0
+ ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1
+ vpbroadcastd m0, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12
+ vpbroadcastd m5, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5
+ vpbroadcastd m0, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ shufps m5, m4, m2, q1032 ; t12 t13a
+ vpblendd m4, m2, 0xcc ; t11 t10a
+ shufps m2, m7, m1, q1032 ; t7 t6
+ vpblendd m7, m1, 0xcc ; t4 t5
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(idct_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ REPX {pmulhrsw x, m5}, m0, m4, m2, m3
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vextracti128 xm4, m0, 1
+ vextracti128 xm5, m1, 1
+ vextracti128 xm6, m2, 1
+ vextracti128 xm7, m3, 1
+ call .main
+ vinserti128 m0, xm4, 1
+ vinserti128 m1, xm5, 1
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti128 m2, xm6, 1
+ vinserti128 m3, xm7, 1
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ jmp m(iadst_4x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ WRAP_XMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ REPX {pmulhrsw x, m5}, m4, m2, m3, m0
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ vpblendd m4, m1, m0, 0x33
+ vpblendd m0, m2, 0x33
+ vpblendd m2, m3, 0x33
+ vpblendd m3, m1, 0x33
+ vpermq m0, m0, q2031
+ vpermq m1, m2, q1302
+ vpermq m2, m3, q3120
+ vpermq m3, m4, q0213
+ psubw m6, m7, m5
+.end:
+ vpblendd m5, m6, 0xcc
+.end2:
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+.end3:
+ lea r2, [dstq+strideq*8]
+ lea r3, [strideq*3]
+ WRITE_4X8 0, 1
+ lea dstq, [dstq+strideq*4]
+ lea r2, [r2 +strideq*4]
+ WRITE_4X8 2, 3
+ RET
+ALIGN function_align
+.main:
+ vpblendd m4, m1, m0, 0xcc
+ vpblendd m1, m0, 0x33
+ vpblendd m5, m2, m3, 0xcc
+ vpblendd m2, m3, 0x33
+ vperm2i128 m3, m5, m2, 0x31
+ vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1
+ vperm2i128 m4, m1, m4, 0x31
+ vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5
+ pshufd m3, m3, q1032 ; in15 in12 in13 in14
+ pshufd m2, m4, q1032 ; in11 in8 in9 in10
+cglobal_label .main2
+ vpbroadcastd m8, [o(pd_2048)]
+ pxor m7, m7
+ punpckhwd m4, m3, m0 ; in12 in3 in14 in1
+ punpcklwd m0, m3 ; in0 in15 in2 in13
+ punpckhwd m3, m2, m1 ; in8 in7 in10 in5
+ punpcklwd m1, m2 ; in4 in11 in6 in9
+ ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3
+ ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
+ ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
+ ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m1, m4 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3
+ psubw m6, m7, m5
+ ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6
+ vpbroadcastd m6, [o(pw_m3784_1567)]
+ vpbroadcastd m5, [o(pw_1567_3784)]
+ psubsw m4, m0, m1 ; t5 t4 t7 t6
+ paddsw m0, m1 ; t1 t0 t3 t2
+ psubsw m1, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ psubw m3, m7, m6 ; pw_3784_m1567
+ vpblendd m6, m3, 0xf0
+ ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
+ ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
+ vbroadcasti128 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a
+ vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a
+ vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
+ vinserti128 m4, xm1, 1 ; t4a t5a t12 t13
+ pshufd m2, m2, q1032 ; t6a t7a t14 t15
+ psubsw m1, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m4, m2 ; -out3 out12 out2 -out13
+ psubsw m4, m2 ; t6 t7 t14a t15a
+ shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
+ vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m5, [o(pw_m2896_2896)]
+ vpbroadcastd m6, [o(pw_2896_2896)]
+ punpcklwd m1, m4, m2
+ punpckhwd m4, m2
+ pmaddwd m2, m5, m4
+ pmaddwd m4, m6
+ pmaddwd m5, m1
+ pmaddwd m1, m6
+ REPX {paddd x, m8}, m5, m1, m2, m4
+ REPX {psrad x, 12}, m5, m2, m1, m4
+ packssdw m2, m5 ; -out11 out8 out10 -out9
+ packssdw m1, m4 ; -out7 out4 out6 -out5
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpcklwd m4, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m3, m2
+ punpckhwd m3, m2
+ REPX {pmulhrsw x, m5}, m4, m1, m0, m3
+ punpckldq m2, m3, m1
+ punpckhdq m3, m1
+ punpckhdq m1, m0, m4
+ punpckldq m0, m4
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m5 ; out8 -out11 -out9 out10
+ vpbroadcastd m6, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ vpblendd m4, m0, m2, 0x33
+ vpblendd m0, m1, 0xcc
+ vpblendd m1, m3, 0xcc
+ vpblendd m2, m3, 0x33
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q0213
+ vpermq m2, m2, q2031
+ vpermq m3, m4, q1302
+ psubw m5, m7, m6
+ jmp m(iadst_4x16_internal_8bpc).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova m3, [cq+32*0]
+ mova m2, [cq+32*1]
+ mova m4, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m8, [o(pw_1697x8)]
+ pcmpeqw m0, m0 ; -1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m5
+ punpckhwd m4, m5
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ pmulhrsw m8, m4
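+    ; (explanatory note: pavgw of x and x*1697/4096 below yields x*5793/8192,
+    ;  i.e. the sqrt(2) identity4 scale with the inter-pass >>1 folded in)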
+ pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is
+ pxor m1, m9 ; unsigned. as long as both signs are equal
+ pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the
+ pxor m2, m9 ; pmulhrsw result will become 0 which causes
+ pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
+ pxor m3, m9 ; we explicitly deal with that case here.
+ pcmpeqw m0, m4
+ pxor m4, m0
+ pavgw m1, m5
+ pavgw m2, m6
+ pavgw m3, m7
+ pavgw m4, m8
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ vpbroadcastd m5, [o(pw_2048)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m6, m8, m1
+ pmulhrsw m7, m8, m2
+ pmulhrsw m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m6
+ paddsw m2, m7
+ paddsw m3, m8
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vextracti128 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm3, [o(pw_2896x8)]
+ pmulhrsw xm0, xm3, [cq+16*0]
+ pmulhrsw xm1, xm3, [cq+16*1]
+ pmulhrsw xm2, xm3, [cq+16*2]
+ pmulhrsw xm3, [cq+16*3]
+ call m(idct_4x8_internal_8bpc).main
+ vbroadcasti128 m4, [o(deint_shuf)]
+ vinserti128 m3, m1, xm3, 1
+ vinserti128 m1, m0, xm2, 1
+ shufps m0, m1, m3, q0220
+ shufps m1, m3, q1331
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pxor m3, m3
+ psubsw m3, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+.end2:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ WIN64_RESTORE_XMM
+.end3:
+ pxor m2, m2
+ mova [cq+32*0], m2
+ mova [cq+32*1], m2
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ RET
+ALIGN function_align
+cglobal_label .main
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
+ punpckhwd m1, m3, m2
+ punpcklwd m3, m2
+ pxor m0, m0
+ psubsw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m2, m1
+ vpermq m1, m0, q2031
+ vpermq m0, m2, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m0, [cq+16*3], 1
+ vpbroadcastd m3, [o(pw_2896x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ paddsw m0, m0
+ paddsw m1, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ or r3d, 8
+.dconly:
+ pmulhrsw xm0, xm2
+.dconly2:
+ movd xm2, [pw_2048]
+ pmulhrsw xm0, xm1
+ lea r2, [strideq*3]
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+.dconly_loop:
+ WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ sub r3d, 4
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ vbroadcasti128 m0, [o(deint_shuf)]
+ vpbroadcastd m2, [o(pw_16384)]
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+ vinserti128 m0, m4, xm1, 1
+ vperm2i128 m2, m4, m1, 0x31
+ vinserti128 m1, m5, xm3, 1
+ vperm2i128 m3, m5, m3, 0x31
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ jmp m(iadst_8x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main_pass1
+ vpbroadcastd m5, [o(pw_16384)]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ pxor m3, m3
+ psubw m3, m5 ; negate odd elements during rounding
+ pmulhrsw m4, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m5
+ pmulhrsw m2, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ vperm2i128 m2, m3, m0, 0x31
+ vinserti128 m0, m3, xm0, 1
+ vperm2i128 m3, m4, m1, 0x31
+ vinserti128 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vpbroadcastd xm4, [o(pw_4096)]
+ psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ WIN64_RESTORE_XMM
+.end4:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main_pass1:
+ IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call m(iadst_8x8_internal_8bpc).main_pass1
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pxor m0, m0
+ psubw m0, m5
+ pmulhrsw m4, m0
+ pmulhrsw m3, m5
+ pmulhrsw m2, m0
+ pmulhrsw m1, m5
+ punpckhwd m0, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ vinserti128 m1, m0, xm3, 1
+ vperm2i128 m3, m0, m3, 0x31
+ vinserti128 m0, m4, xm2, 1
+ vperm2i128 m2, m4, m2, 0x31
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vpbroadcastd xm5, [o(pw_4096)]
+ psubw m4, m5 ; lower half = -2048, upper half = 2048
+ vpermq m5, m3, q2031
+ vpermq m3, m0, q2031
+ vpermq m0, m2, q2031
+ vpermq m2, m1, q2031
+ pmulhrsw m1, m0, m4
+ pmulhrsw m0, m5, m4
+ jmp m(iadst_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti128 m3, [cq+16*4], 1
+ vinserti128 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti128 m4, [cq+16*6], 1
+ vinserti128 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_8X16_LOAD_COEFS 0
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m0, m4, [cq+32*0]
+ add cq, 32*4
+ pmulhrsw m7, m4, [cq+32*3]
+ pmulhrsw m1, m4, [cq-32*3]
+ pmulhrsw m6, m4, [cq+32*2]
+ pmulhrsw m2, m4, [cq-32*2]
+ pmulhrsw m5, m4, [cq+32*1]
+ pmulhrsw m3, m4, [cq-32*1]
+ pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m10, [o(pw_16384)]
+.pass1_end:
+ vperm2i128 m9, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ vperm2i128 m8, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+.pass1_end2:
+ punpckhwd m7, m5, m6
+ punpcklwd m5, m6
+ punpcklwd m6, m8, m9
+ punpckhwd m8, m9
+ REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpckldq m6, m7, m8
+ punpckhdq m7, m8
+ jmp tx2q
+.pass2:
+ call .main
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+ vpbroadcastd m8, [o(pw_2048)]
+.end2:
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3:
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 8, 9
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 0, 1
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 4, 5, 0, 1
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 6, 7, 0, 1
+ RET
+ALIGN function_align
+cglobal_label .main
+ IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+ vpbroadcastd m10, [o(pw_16384)]
+ pslld m9, m10, 17
+ psubw m10, m9 ; 16384, -16384
+ jmp m(idct_8x16_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ vpbroadcastd m9, [o(pw_2048)]
+ vpbroadcastd xm8, [o(pw_4096)]
+ psubw m8, m9
+ REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+ REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+ jmp m(idct_8x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+ vpbroadcastd m10, [o(pd_2048)]
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3
+ ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5
+ ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7
+ ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9
+ ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
+ ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
+ ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
+ vpbroadcastd m11, [o(pw_m4017_799)]
+ vpbroadcastd m12, [o(pw_799_4017)]
+ pxor m9, m9
+ ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
+ psubw m8, m9, m11 ; pw_4017_m799
+ ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
+ vpbroadcastd m11, [o(pw_m2276_3406)]
+ vpbroadcastd m12, [o(pw_3406_2276)]
+ ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
+ psubw m8, m9, m11 ; pw_2276_m3406
+ ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
+ vpbroadcastd m11, [o(pw_m3784_1567)]
+ vpbroadcastd m12, [o(pw_1567_3784)]
+ ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
+ psubw m6, m9, m11 ; pw_3784_m1567
+ ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
+ vpbroadcastd m11, [o(pw_m1567_3784)]
+ vpbroadcastd m12, [o(pw_3784_1567)]
+ ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
+ psubw m6, m9, m11 ; pw_1567_m3784
+ ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
+ vbroadcasti128 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ vpblendd m0, m6, 0x33 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ pmaddwd m9, m8, m11 ; -out11
+ pmaddwd m2, m12, m5 ; -out5
+ pmaddwd m5, m8 ; out10
+ pmaddwd m11, m12 ; out4
+ REPX {paddd x, m10}, m9, m5, m2, m11
+ REPX {psrad x, 12 }, m9, m5, m2, m11
+ packssdw m5, m9 ; out10 -out11
+ packssdw m2, m11 ; -out5 out4
+ pmaddwd m11, m8, m3 ; out8
+ vpbroadcastd m8, [o(pw_2896_m2896)]
+ pmaddwd m3, m12 ; -out7
+ pmaddwd m8, m4 ; -out9
+ pmaddwd m4, m12 ; out6
+ REPX {paddd x, m10}, m11, m3, m8, m4
+ REPX {psrad x, 12 }, m11, m3, m8, m4
+ packssdw m3, m4 ; -out7 out6
+ packssdw m4, m11, m8 ; out8 -out9
+ vpbroadcastd m10, [o(pw_16384)]
+ pxor m9, m9
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m11, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ vpblendd m3, m4, 0xcc ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m11 ; -out5 out4
+ psubsw m5, m11 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_8X16_LOAD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+ vpbroadcastd m9, [o(pw_16384)]
+ pslld m10, m9, 17
+ psubw m10, m9 ; -16384, 16384
+ vperm2i128 m9, m4, m0, 0x31
+ vinserti128 m0, m4, xm0, 1
+ vperm2i128 m8, m5, m1, 0x31
+ vinserti128 m4, m5, xm1, 1
+ vperm2i128 m5, m7, m3, 0x31
+ vinserti128 m3, m7, xm3, 1
+ vinserti128 m1, m6, xm2, 1
+ vperm2i128 m6, m6, m2, 0x31
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m3, m1
+ punpckhwd m3, m1
+ jmp m(idct_8x16_internal_8bpc).pass1_end2
+.pass2:
+ call m(iadst_8x16_internal_8bpc).main
+ call m(iadst_8x16_internal_8bpc).main_pass2_end
+ vpbroadcastd m8, [o(pw_2048)]
+ vpbroadcastd xm9, [o(pw_4096)]
+ psubw m8, m9
+ vpermq m9, m0, q3120
+ vpermq m0, m7, q2031
+ vpermq m7, m1, q3120
+ vpermq m1, m6, q2031
+ vpermq m6, m2, q3120
+ vpermq m2, m5, q2031
+ vpermq m5, m3, q3120
+ vpermq m3, m4, q2031
+ pmulhrsw m0, m8
+ pmulhrsw m1, m8
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ pmulhrsw m4, m5, m8
+ pmulhrsw m5, m6, m8
+ pmulhrsw m6, m7, m8
+ pmulhrsw m7, m9, m8
+ jmp m(idct_8x16_internal_8bpc).end3
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
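+
+; Note (explanatory): with 3 args IDTX16 computes x*2 + x*1697/2048, roughly
+; 2*sqrt(2)*x (the identity16 scale); with the optional 4th arg (pw_16384)
+; the doubling is skipped and the correction term halved, giving ~sqrt(2)*x,
+; i.e. the same scale with a >>1 downshift folded in.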
+
+cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*2]
+ add cq, 16*8
+ vinserti128 m3, [cq+16*0], 1
+ vinserti128 m2, [cq+16*2], 1
+ vpbroadcastd m9, [o(pw_2896x8)]
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*2]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*6], 1
+ mova xm7, [cq-16*7]
+ mova xm6, [cq-16*5]
+ vinserti128 m7, [cq+16*1], 1
+ vinserti128 m6, [cq+16*3], 1
+ mova xm8, [cq-16*3]
+ mova xm0, [cq-16*1]
+ vinserti128 m8, [cq+16*5], 1
+ vinserti128 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m5
+ punpckhwd m4, m5
+ punpcklwd m5, m7, m6
+ punpckhwd m7, m6
+ punpcklwd m6, m8, m0
+ punpckhwd m8, m0
+ REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpckldq m6, m7, m8
+ punpckhdq m7, m8
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp m(idct_8x16_internal_8bpc).end
+
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti128 [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ or r3d, 4
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [pw_2048] ; intentionally rip-relative
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m3, m3
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
+ punpckhbw m2, m1, m3
+ punpcklbw m1, m3
+ paddw m2, m0
+ paddw m1, m0
+ packuswb m1, m2
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+ mova xm4, [cq+16*4]
+ mova xm5, [cq+16*5]
+ mova xm6, [cq+16*6]
+ mova xm7, [cq+16*7]
+ call m(idct_4x16_internal_8bpc).main
+ vinserti128 m6, m2, xm6, 1
+ vinserti128 m2, m0, xm4, 1
+ vinserti128 m0, m1, xm5, 1
+ vinserti128 m1, m3, xm7, 1
+ punpcklwd m3, m2, m6
+ punpckhwd m2, m6
+ vpbroadcastd m6, [o(pw_16384)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ mova m1, m6
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+ call .main
+ jmp m(iadst_16x4_internal_8bpc).end
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m6, [o(pd_2048)]
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q1230
+ vpermq m3, [cq+32*3], q2103
+ vpermq m1, [cq+32*1], q1230
+ vpermq m2, [cq+32*2], q2103
+ call m(iadst_4x16_internal_8bpc).main2
+ call m(iadst_4x16_internal_8bpc).main_pass1_end
+ punpcklwd m4, m3, m1
+ punpcklwd m5, m2, m0
+ punpckhwd m0, m1
+ punpckhwd m2, m3
+ vpbroadcastd m1, [o(pw_16384)]
+ vinserti128 m3, m0, xm2, 1
+ vperm2i128 m2, m0, m2, 0x31
+ vinserti128 m0, m4, xm5, 1
+ vperm2i128 m4, m4, m5, 0x31
+ psubw m6, m7, m1
+.pass1_end:
+ pmulhrsw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m4, m1
+ pmulhrsw m0, m6
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpbroadcastd m4, [o(pw_2048)]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
+.end2:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+.end3:
+ WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*2]
+ WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m6, [o(pw_m3344_3344)]
+ vpbroadcastd m7, [o(pw_3803_1321)]
+ vpbroadcastd m8, [o(pw_m1321_2482)]
+ vpbroadcastd m9, [o(pw_2482_3344)]
+ punpcklwd m4, m2, m0 ; in2 in0 l
+ punpckhwd m2, m0 ; in2 in0 h
+ psrld m5, m6, 16
+ pmaddwd m10, m6, m4 ; t2:02 l
+ pmaddwd m6, m2 ; t2:02 h
+ pmaddwd m0, m7, m4 ; t0:02 l
+ pmaddwd m7, m2 ; t0:02 h
+ pmaddwd m4, m8 ; t1:02 l
+ pmaddwd m8, m2 ; t1:02 h
+ punpckhwd m2, m3, m1 ; in3 in1 h
+ punpcklwd m3, m1 ; in3 in1 l
+ pmaddwd m1, m5, m2 ; t2:3 h
+ pmaddwd m5, m3 ; t2:3 l
+ paddd m6, m1
+ vpbroadcastd m1, [o(pd_2048)]
+ paddd m10, m5
+ pmaddwd m5, m9, m3
+ pmaddwd m9, m2
+ paddd m0, m1
+ paddd m7, m1
+ paddd m0, m5 ; t0 + t3 + 2048 l
+ paddd m7, m9 ; t0 + t3 + 2048 h
+ vpbroadcastd m9, [o(pw_m3803_3344)]
+ pmaddwd m5, m9, m2
+ pmaddwd m9, m3
+ paddd m10, m1 ; t2 + 2048 l
+ paddd m6, m1 ; t2 + 2048 h
+ paddd m5, m1 ; t1:13 + 2048 h
+ paddd m1, m9 ; t1:13 + 2048 l
+ vpbroadcastd m9, [o(pw_m3803_m6688)]
+ pmaddwd m2, m9
+ pmaddwd m3, m9
+ paddd m5, m8 ; t1 + t3 + 2048 h
+ paddd m1, m4 ; t1 + t3 + 2048 l
+ paddd m8, m7
+ paddd m4, m0
+ paddd m2, m8 ; t0 + t1 - t3 + 2048 h
+ paddd m3, m4 ; t0 + t1 - t3 + 2048 l
+ REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
+ packssdw m0, m7
+ packssdw m1, m5
+ packssdw m3, m2
+ packssdw m2, m10, m6
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q1230
+ vpermq m3, [cq+32*3], q2103
+ vpermq m1, [cq+32*1], q1230
+ vpermq m2, [cq+32*2], q2103
+ call m(iadst_4x16_internal_8bpc).main2
+ call m(iadst_4x16_internal_8bpc).main_pass1_end
+ punpckhwd m4, m3, m2
+ punpckhwd m5, m1, m0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ vpbroadcastd m6, [o(pw_16384)]
+ vinserti128 m3, m0, xm1, 1
+ vperm2i128 m2, m0, m1, 0x31
+ vinserti128 m0, m4, xm5, 1
+ vperm2i128 m4, m4, m5, 0x31
+ psubw m1, m7, m6
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m4, [o(pw_2048)]
+ REPX {pmulhrsw x, m4}, m3, m2, m1, m0
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*2]
+ WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1
+ RET
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm4, [cq+16*1]
+ vinserti128 m2, [cq+16*4], 1
+ vinserti128 m4, [cq+16*5], 1
+ mova xm0, [cq+16*2]
+ mova xm1, [cq+16*3]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
+ vpbroadcastd m7, [o(pw_1697x16)]
+ vpbroadcastd m8, [o(pw_16384)]
+ punpcklwd m3, m2, m4
+ punpckhwd m2, m4
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ pmulhrsw m0, m7, m1
+ pmulhrsw m5, m7, m2
+ pmulhrsw m6, m7, m3
+ pmulhrsw m7, m4
+ REPX {pmulhrsw x, m8}, m0, m5, m6, m7
+ paddsw m1, m0
+ paddsw m2, m5
+ paddsw m3, m6
+ paddsw m4, m7
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_16x4_internal_8bpc).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m0, [cq+32*0], q3120
+ add cq, 32*4
+ vpermq m7, [cq+32*3], q%1
+ vpermq m1, [cq-32*3], q%1
+ vpermq m6, [cq+32*2], q3120
+ vpermq m2, [cq-32*2], q3120
+ vpermq m5, [cq+32*1], q%1
+ vpermq m3, [cq-32*1], q%1
+ vpermq m4, [cq+32*0], q3120
+ REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 3120
+ call m(idct_8x16_internal_8bpc).main
+ vpbroadcastd m10, [o(pw_16384)]
+ punpckhwd m8, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m9, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ REPX {pmulhrsw x, m10}, m8, m1, m4, m6
+.pass1_end:
+ REPX {pmulhrsw x, m10}, m0, m2, m9, m5
+ punpckhwd m3, m0, m8
+ punpcklwd m0, m8
+ punpckhwd m8, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m9, m4
+ punpckhwd m9, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m8
+ punpckhdq m3, m8
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m8, m9, m5
+ punpckhdq m9, m5
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m8, 0x31
+ vinserti128 m2, xm8, 1
+ vperm2i128 m7, m3, m9, 0x31
+ vinserti128 m3, xm9, 1
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m8, [o(pw_2048)]
+.end:
+ REPX {pmulhrsw x, m8}, m0, m2, m4, m6
+.end2:
+ REPX {pmulhrsw x, m8}, m1, m3, m5, m7
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+.end3:
+ pxor m0, m0
+ REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+.end4:
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 1302
+ call m(iadst_8x16_internal_8bpc).main2
+ call m(iadst_8x16_internal_8bpc).main_pass1_end
+ psubw m11, m9, m10
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m9, m4, m6
+ punpckhwd m4, m6
+ punpckhwd m6, m5, m7
+ punpcklwd m5, m7
+ REPX {pmulhrsw x, m11}, m8, m1, m4, m6
+ jmp m(idct_16x8_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ pxor m8, m8
+ psubw m8, m9
+ REPX {pmulhrsw x, m9}, m0, m2, m4, m6
+ jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+ ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
+ ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
+ psubsw m8, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m0, m4 ; t4
+ paddsw m0, m4 ; t0
+ psubsw m4, m5, m1 ; t7
+ paddsw m5, m1 ; t3
+ psubsw m1, m7, m3 ; t5
+ paddsw m7, m3 ; t1
+ ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
+ psubsw m9, m6, m8 ; t7
+ paddsw m6, m8 ; out6
+ psubsw m3, m7, m5 ; t3
+ paddsw m7, m5 ; -out7
+ psubsw m5, m0, m2 ; t2
+ paddsw m0, m2 ; out0
+ psubsw m2, m1, m4 ; t6
+ paddsw m1, m4 ; -out1
+ ret
+ALIGN function_align
+.main_pass1_end:
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ punpckhwd m4, m3, m5
+ punpcklwd m3, m5
+ pmaddwd m5, m11, m4
+ pmaddwd m4, m12
+ pmaddwd m8, m11, m3
+ pmaddwd m3, m12
+ REPX {paddd x, m10}, m5, m4, m8, m3
+ REPX {psrad x, 12 }, m5, m8, m4, m3
+ packssdw m3, m4 ; -out3
+ packssdw m4, m8, m5 ; out4
+ punpcklwd m5, m9, m2
+ punpckhwd m9, m2
+ pmaddwd m2, m12, m5
+ pmaddwd m5, m11
+ pmaddwd m12, m9
+ pmaddwd m11, m9
+ REPX {paddd x, m10}, m2, m5, m12, m11
+ REPX {psrad x, 12 }, m2, m12, m5, m11
+ packssdw m2, m12 ; out2
+ packssdw m5, m11 ; -out5
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
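+ ; the result is clipped to pixel range right after pass 2, so (as in the
+ ; 16x16 variant below) 16-bit saturating math is sufficient here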
+ vpbroadcastd m8, [o(pw_2896x8)]
+ psubsw m4, m5, m3
+ paddsw m3, m5
+ psubsw m5, m2, m9
+ paddsw m2, m9
+ pmulhrsw m2, m8 ; out2
+ pmulhrsw m3, m8 ; -out3
+ pmulhrsw m4, m8 ; out4
+ pmulhrsw m5, m8 ; -out5
+ vpbroadcastd m9, [o(pw_2048)]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ ITX_16X8_LOAD_COEFS 1302
+ call m(iadst_8x16_internal_8bpc).main2
+ call m(iadst_8x16_internal_8bpc).main_pass1_end
+ psubw m9, m10
+ punpcklwd m8, m6, m4
+ punpckhwd m6, m4
+ punpcklwd m4, m7, m5
+ punpckhwd m7, m5
+ punpckhwd m5, m3, m1
+ punpcklwd m3, m1
+ punpckhwd m1, m2, m0
+ punpcklwd m2, m0
+ REPX {pmulhrsw x, m10}, m8, m4, m5, m1
+ REPX {pmulhrsw x, m9 }, m6, m7, m3, m2
+ punpcklwd m0, m7, m4
+ punpckhwd m7, m4
+ punpckhwd m4, m6, m8
+ punpcklwd m6, m8
+ punpckhwd m8, m3, m5
+ punpcklwd m3, m5
+ punpcklwd m5, m2, m1
+ punpckhwd m2, m1
+ punpckhdq m1, m0, m6
+ punpckldq m0, m6
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckhdq m4, m3, m5
+ punpckldq m3, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ vinserti128 m2, m6, xm5, 1
+ vperm2i128 m6, m5, 0x31
+ vperm2i128 m5, m1, m4, 0x31
+ vinserti128 m1, xm4, 1
+ vperm2i128 m4, m0, m3, 0x31
+ vinserti128 m0, xm3, 1
+ vinserti128 m3, m7, xm8, 1
+ vperm2i128 m7, m8, 0x31
+ jmp tx2q
+.pass2:
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ pxor m8, m8
+ psubw m8, m9
+ pmulhrsw m10, m7, m8
+ pmulhrsw m7, m0, m9
+ pmulhrsw m0, m6, m9
+ pmulhrsw m6, m1, m8
+ pmulhrsw m1, m5, m8
+ pmulhrsw m5, m2, m9
+ pmulhrsw m2, m4, m9
+ pmulhrsw m4, m3, m8
+ lea r3, [strideq*3]
+ WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1
+ WRITE_16X2 1, 2, 0, 1, strideq*2, r3
+ jmp m(idct_16x8_internal_8bpc).end3
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+ mova xm7, [cq+16*0]
+ mova xm2, [cq+16*1]
+ add cq, 16*8
+ vpbroadcastd m3, [o(pw_2896x8)]
+ vinserti128 m7, [cq+16*0], 1
+ vinserti128 m2, [cq+16*1], 1
+ mova xm6, [cq-16*6]
+ mova xm4, [cq-16*5]
+ vinserti128 m6, [cq+16*2], 1
+ vinserti128 m4, [cq+16*3], 1
+ mova xm8, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m8, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm0, [cq-16*2]
+ mova xm1, [cq-16*1]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m11, [o(pw_16384)]
+ REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
+ punpcklwd m3, m7, m2
+ punpckhwd m7, m2
+ punpcklwd m2, m6, m4
+ punpckhwd m6, m4
+ punpcklwd m4, m8, m5
+ punpckhwd m8, m5
+ punpcklwd m5, m0, m1
+ punpckhwd m0, m1
+ punpckldq m1, m3, m2
+ punpckhdq m3, m2
+ punpckldq m2, m4, m5
+ punpckhdq m4, m5
+ punpckldq m5, m7, m6
+ punpckhdq m7, m6
+ punpckldq m6, m8, m0
+ punpckhdq m8, m0
+ REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m6
+ punpckhqdq m5, m6
+ punpcklqdq m6, m7, m8
+ punpckhqdq m7, m8
+ jmp tx2q
+.pass2:
+ vpbroadcastd m8, [o(pw_4096)]
+ jmp m(idct_16x8_internal_8bpc).end
+
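+; Constants are addressed relative to r6; the base is moved here so that the
+; constants used by the following functions stay within short displacement range.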
+%define o_base pw_5 + 128
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
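+; Load all 16 rows of a 16x16 coefficient block; row 15 is spilled to the stack
+; so that m15 can be reused as a scratch/rounding register in .main.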
+%macro ITX_16X16_LOAD_COEFS 0
+ mova m0, [cq+32*0]
+ mova m1, [cq+32*1]
+ mova m2, [cq+32*2]
+ mova m3, [cq+32*3]
+ add cq, 32*8
+ mova m4, [cq-32*4]
+ mova m5, [cq-32*3]
+ mova m6, [cq-32*2]
+ mova m7, [cq-32*1]
+ mova m8, [cq+32*0]
+ mova m9, [cq+32*1]
+ mova m10, [cq+32*2]
+ mova m11, [cq+32*3]
+ mova m12, [cq+32*4]
+ mova m13, [cq+32*5]
+ mova m14, [cq+32*6]
+ mova m15, [cq+32*7]
+ mova [rsp], m15
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call .main
+.pass1_end:
+ vpbroadcastd m1, [o(pw_8192)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ vextracti128 [rsp+16*5], m8, 1
+ mova [rsp+16*1], xm8
+.pass1_end2:
+ vextracti128 [rsp+16*4], m0, 1
+ mova [rsp+16*0], xm0
+ REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+ pmulhrsw m1, [rsp+32*1]
+ vperm2i128 m8, m1, m9, 0x31
+ vinserti128 m1, xm9, 1
+ vperm2i128 m9, m2, m10, 0x31
+ vinserti128 m2, xm10, 1
+ vperm2i128 m10, m3, m11, 0x31
+ vinserti128 m3, xm11, 1
+ vperm2i128 m11, m4, m12, 0x31
+ vinserti128 m4, xm12, 1
+ vperm2i128 m12, m5, m13, 0x31
+ vinserti128 m5, xm13, 1
+ vperm2i128 m13, m6, m14, 0x31
+ vinserti128 m6, xm14, 1
+ vperm2i128 m14, m7, m15, 0x31
+ vinserti128 m7, xm15, 1
+ mova m15, [rsp+32*2]
+.pass1_end3:
+ punpcklwd m0, m9, m10
+ punpckhwd m9, m10
+ punpcklwd m10, m15, m8
+ punpckhwd m15, m8
+ punpckhwd m8, m11, m12
+ punpcklwd m11, m12
+ punpckhwd m12, m13, m14
+ punpcklwd m13, m14
+ punpckhdq m14, m11, m13
+ punpckldq m11, m13
+ punpckldq m13, m15, m9
+ punpckhdq m15, m9
+ punpckldq m9, m10, m0
+ punpckhdq m10, m0
+ punpckhdq m0, m8, m12
+ punpckldq m8, m12
+ punpcklqdq m12, m13, m8
+ punpckhqdq m13, m8
+ punpcklqdq m8, m9, m11
+ punpckhqdq m9, m11
+ punpckhqdq m11, m10, m14
+ punpcklqdq m10, m14
+ punpcklqdq m14, m15, m0
+ punpckhqdq m15, m0
+ mova m0, [rsp]
+ mova [rsp], m15
+ punpckhwd m15, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m15, m1
+ punpckhdq m15, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m15
+ punpcklqdq m6, m15
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpbroadcastd m1, [o(pw_2048)]
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ mova [rsp], m6
+.end2:
+ REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+ pmulhrsw m1, [rsp+32*1]
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3
+.end3:
+ pxor m2, m2
+ REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 0, 1, strideq*2, r3
+ REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 0, 1, strideq*2, r3
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m15, [o(pd_2048)]
+ mova [rsp+gprsize+32*1], m1
+ mova [rsp+gprsize+32*2], m9
+ IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15
+ mova m1, [rsp+gprsize+32*2] ; in9
+ mova [rsp+gprsize+32*2], m14 ; tmp7
+ mova m9, [rsp+gprsize+32*1] ; in1
+ mova [rsp+gprsize+32*1], m10 ; tmp5
+ mova m14, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m6 ; tmp3
+ IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
+ mova m6, [rsp+gprsize+32*1] ; tmp5
+ psubsw m15, m0, m14 ; out15
+ paddsw m0, m14 ; out0
+ psubsw m14, m2, m13 ; out14
+ paddsw m2, m13 ; out1
+ mova [rsp+gprsize+32*1], m2
+ psubsw m13, m4, m11 ; out13
+ paddsw m2, m4, m11 ; out2
+ psubsw m11, m8, m7 ; out11
+ paddsw m4, m8, m7 ; out4
+ mova m7, [rsp+gprsize+32*2] ; tmp7
+ psubsw m10, m6, m5 ; out10
+ paddsw m5, m6 ; out5
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; out7
+ psubsw m9, m12, m3 ; out9
+ paddsw m6, m12, m3 ; out6
+ mova m3, [rsp+gprsize+32*0] ; tmp3
+ psubsw m12, m3, m1 ; out12
+ paddsw m3, m1 ; out3
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call .main
+ call .main_pass1_end
+ pmulhrsw m0, m1, [cq+32*0]
+ pmulhrsw m2, m1, [cq+32*1]
+ REPX {pmulhrsw x, m1}, m4, m6, m8, m10
+ pmulhrsw m12, m1, [cq+32*2]
+ pmulhrsw m14, m1, [cq+32*3]
+ vextracti128 [rsp+16*5], m8, 1
+ mova [rsp+16*1], xm8
+ pxor m8, m8
+ psubw m1, m8, m1
+ jmp m(idct_16x16_internal_8bpc).pass1_end2
+ALIGN function_align
+.pass2:
+ call .main
+ call .main_pass2_end
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+ mova [rsp+32*0], m6
+ pxor m6, m6
+ psubw m1, m6, m1
+ jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m15, [o(pd_2048)]
+ mova [rsp+gprsize+32*1], m0
+ mova [rsp+gprsize+32*2], m4
+ ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2
+ ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6
+ ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
+ ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14
+ psubsw m0, m2, m10 ; t10a
+ paddsw m2, m10 ; t2a
+ psubsw m10, m13, m5 ; t11a
+ paddsw m13, m5 ; t3a
+ psubsw m5, m6, m14 ; t14a
+ paddsw m6, m14 ; t6a
+ psubsw m14, m9, m1 ; t15a
+ paddsw m9, m1 ; t7a
+ ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
+ ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
+ psubsw m1, m10, m14 ; t14a
+ paddsw m10, m14 ; t10a
+ psubsw m14, m0, m5 ; t15a
+ paddsw m0, m5 ; t11a
+ psubsw m5, m2, m6 ; t6
+ paddsw m2, m6 ; t2
+ psubsw m6, m13, m9 ; t7
+ paddsw m13, m9 ; t3
+ ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
+ ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
+ mova m9, [rsp+gprsize+32*0] ; in15
+ mova [rsp+gprsize+32*0], m10 ; t10a
+ mova m4, [rsp+gprsize+32*1] ; in0
+ mova [rsp+gprsize+32*1], m6 ; t6a
+ mova m6, [rsp+gprsize+32*2] ; in4
+ mova [rsp+gprsize+32*2], m2 ; t2
+ ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0
+ ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4
+ ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8
+ ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
+ psubsw m10, m4, m8 ; t8a
+ paddsw m8, m4 ; t0a
+ psubsw m4, m9, m7 ; t9a
+ paddsw m9, m7 ; t1a
+ psubsw m7, m6, m12 ; t12a
+ paddsw m6, m12 ; t4a
+ psubsw m12, m11, m3 ; t13a
+ paddsw m11, m3 ; t5a
+ ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8
+ ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13
+ psubsw m3, m9, m11 ; t5
+ paddsw m9, m11 ; t1
+ psubsw m11, m4, m12 ; t12a
+ paddsw m4, m12 ; t8a
+ paddsw m12, m8, m6 ; t0
+ psubsw m8, m6 ; t4
+ paddsw m6, m10, m7 ; t9a
+ psubsw m10, m7 ; t13a
+ ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
+ mova m7, [rsp+gprsize+32*0] ; t10a
+ mova m2, [rsp+gprsize+32*1] ; t6a
+ paddsw m15, m9, m13 ; -out15
+ psubsw m9, m13 ; t3a
+ paddsw m13, m11, m1 ; -out13
+ psubsw m11, m1 ; t15a
+ psubsw m1, m4, m7 ; t10
+ paddsw m7, m4 ; -out1
+ psubsw m4, m3, m2 ; t6
+ paddsw m3, m2 ; -out3
+ paddsw m2, m10, m14 ; out2
+ psubsw m10, m14 ; t14a
+ paddsw m14, m6, m0 ; out14
+ psubsw m6, m0 ; t11
+ mova m0, [rsp+gprsize+32*2] ; t2
+ mova [rsp+gprsize+32*1], m7
+ psubsw m7, m12, m0 ; t2a
+ paddsw m0, m12 ; out0
+ paddsw m12, m8, m5 ; out12
+ psubsw m8, m5 ; t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [cq+32*0], m0
+ mova [cq+32*1], m2
+ mova [cq+32*2], m12
+ mova [cq+32*3], m14
+ vpbroadcastd m14, [pw_m2896_2896]
+ vpbroadcastd m12, [pw_2896_2896]
+ vpbroadcastd m2, [pd_2048]
+ punpcklwd m5, m11, m10
+ punpckhwd m11, m10
+ pmaddwd m10, m14, m5
+ pmaddwd m0, m14, m11
+ pmaddwd m5, m12
+ pmaddwd m11, m12
+ REPX {paddd x, m2}, m10, m0, m5, m11
+ REPX {psrad x, 12}, m10, m0, m5, m11
+ packssdw m10, m0 ; out10
+ packssdw m5, m11 ; -out5
+ punpcklwd m11, m8, m4
+ punpckhwd m8, m4
+ pmaddwd m4, m12, m11
+ pmaddwd m0, m12, m8
+ pmaddwd m11, m14
+ pmaddwd m8, m14
+ REPX {paddd x, m2}, m4, m0, m11, m8
+ REPX {psrad x, 12}, m4, m0, m11, m8
+ packssdw m4, m0 ; out4
+ packssdw m11, m8 ; -out11
+ punpcklwd m8, m9, m7
+ punpckhwd m9, m7
+ pmaddwd m7, m12, m8
+ pmaddwd m0, m12, m9
+ pmaddwd m8, m14
+ pmaddwd m9, m14
+ REPX {paddd x, m2}, m7, m0, m8, m9
+ REPX {psrad x, 12}, m7, m0, m8, m9
+ packssdw m7, m0 ; -out7
+ packssdw m8, m9 ; out8
+ punpckhwd m0, m6, m1
+ punpcklwd m6, m1
+ pmaddwd m1, m14, m0
+ pmaddwd m9, m14, m6
+ pmaddwd m0, m12
+ pmaddwd m6, m12
+ REPX {paddd x, m2}, m1, m9, m0, m6
+ REPX {psrad x, 12}, m1, m9, m0, m6
+ packssdw m9, m1 ; -out9
+ packssdw m6, m0 ; out6
+ vpbroadcastd m1, [o(pw_8192)]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+ ; 16-bit here will produce the same result as using 32-bit intermediates.
+ paddsw m5, m10, m11 ; -out5
+ psubsw m10, m11 ; out10
+ psubsw m11, m4, m8 ; -out11
+ paddsw m4, m8 ; out4
+ psubsw m8, m7, m9 ; out8
+ paddsw m7, m9 ; -out7
+ psubsw m9, m1, m6 ; -out9
+ paddsw m6, m1 ; out6
+ vpbroadcastd m1, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+ vpbroadcastd m1, [o(pw_2048)]
+ ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ ITX_16X16_LOAD_COEFS
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass1_end
+ pmulhrsw m6, m1
+ pmulhrsw m2, m1, m8
+ mova [rsp+32*2], m6
+ pmulhrsw m6, m1, m4
+ pmulhrsw m4, m1, m10
+ pmulhrsw m8, m1, [cq+32*3]
+ pmulhrsw m10, m1, [cq+32*2]
+ pmulhrsw m12, m1, [cq+32*1]
+ pmulhrsw m14, m1, [cq+32*0]
+ pxor m0, m0
+ psubw m0, m1
+ REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
+ pmulhrsw m1, m0, m9
+ pmulhrsw m9, m0, m13
+ pmulhrsw m0, [rsp+32*1]
+ mova [rsp+16*0], xm15
+ mova [rsp+16*1], xm7
+ vperm2i128 m15, m15, m7, 0x31
+ vinserti128 m7, m2, xm14, 1
+ vperm2i128 m14, m2, m14, 0x31
+ vinserti128 m2, m9, xm5, 1
+ vperm2i128 m9, m9, m5, 0x31
+ vinserti128 m5, m4, xm12, 1
+ vperm2i128 m12, m4, m12, 0x31
+ vinserti128 m4, m11, xm3, 1
+ vperm2i128 m11, m11, m3, 0x31
+ vinserti128 m3, m10, xm6, 1
+ vperm2i128 m10, m10, m6, 0x31
+ vinserti128 m6, m1, xm0, 1
+ vperm2i128 m13, m1, m0, 0x31
+ vinserti128 m1, m8, [rsp+32*2], 1
+ vperm2i128 m8, m8, [rsp+32*2], 0x31
+ jmp m(idct_16x16_internal_8bpc).pass1_end3
+.pass2:
+ call m(iadst_16x16_internal_8bpc).main
+ call m(iadst_16x16_internal_8bpc).main_pass2_end
+ pmulhrsw m0, m1
+ pmulhrsw m8, m1
+ mova [rsp+32*0], m0
+ mova [rsp+32*2], m8
+ pxor m0, m0
+ psubw m0, m1
+ pmulhrsw m8, m0, m7
+ pmulhrsw m7, m0, m9
+ pmulhrsw m9, m1, m6
+ pmulhrsw m6, m1, m10
+ pmulhrsw m10, m0, m5
+ pmulhrsw m5, m0, m11
+ pmulhrsw m11, m1, m4
+ pmulhrsw m4, m1, m12
+ pmulhrsw m12, m0, m3
+ pmulhrsw m3, m0, m13
+ pmulhrsw m13, m1, m2
+ pmulhrsw m1, m14
+ pmulhrsw m14, m0, [rsp+32*1]
+ pmulhrsw m0, m15
+ lea r3, [strideq*3]
+ WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1
+ mova m15, [rsp+32*0]
+ WRITE_16X2 3, 4, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
+ jmp m(idct_16x16_internal_8bpc).end3
+
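+; 16-point identity step (scale 2*sqrt(2)) with the first-pass >>2 rounding
+; folded in: out = (x + ((x*1697*16 >> 15) >> 1) + 1) >> 1, roughly x*0.7071.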
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+ pmulhrsw m%2, m%3, m%1
+ psraw m%2, 1
+ pavgw m%1, m%2 ; signs are guaranteed to be equal
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+ vpbroadcastd m7, [o(pw_1697x16)]
+ mova xm0, [cq+16* 0]
+ vinserti128 m0, [cq+16*16], 1
+ mova xm15, [cq+16* 1]
+ vinserti128 m15, [cq+16*17], 1
+ mova xm1, [cq+16* 2]
+ vinserti128 m1, [cq+16*18], 1
+ mova xm8, [cq+16* 3]
+ vinserti128 m8, [cq+16*19], 1
+ mova xm2, [cq+16* 4]
+ vinserti128 m2, [cq+16*20], 1
+ mova xm9, [cq+16* 5]
+ vinserti128 m9, [cq+16*21], 1
+ mova xm3, [cq+16* 6]
+ vinserti128 m3, [cq+16*22], 1
+ mova xm10, [cq+16* 7]
+ add cq, 16*16
+ vinserti128 m10, [cq+16* 7], 1
+ mova xm4, [cq-16* 8]
+ vinserti128 m4, [cq+16* 8], 1
+ mova xm11, [cq-16* 7]
+ vinserti128 m11, [cq+16* 9], 1
+ mova xm5, [cq-16* 6]
+ vinserti128 m5, [cq+16*10], 1
+ mova xm12, [cq-16* 5]
+ vinserti128 m12, [cq+16*11], 1
+ mova xm13, [cq-16* 3]
+ vinserti128 m13, [cq+16*13], 1
+ mova xm14, [cq-16* 1]
+ vinserti128 m14, [cq+16*15], 1
+ REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
+ 10, 4, 11, 5, 12, 13, 14
+ mova xm6, [cq-16* 4]
+ vinserti128 m6, [cq+16*12], 1
+ mova [rsp], m0
+ IDTX16B 6, 0, 7
+ mova xm0, [cq-16* 2]
+ vinserti128 m0, [cq+16*14], 1
+ pmulhrsw m7, m0
+ psraw m7, 1
+ pavgw m7, m0
+ jmp m(idct_16x16_internal_8bpc).pass1_end3
+ALIGN function_align
+.pass2:
+ vpbroadcastd m15, [o(pw_1697x16)]
+ mova [rsp+32*1], m0
+ REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14
+ mova m0, [rsp+32*1]
+ mova [rsp+32*1], m1
+ IDTX16 0, 1, 15
+ mova m1, [rsp+32*0]
+ pmulhrsw m15, m1
+ paddsw m1, m1
+ paddsw m15, m1
+ jmp m(idct_16x16_internal_8bpc).end
+
+%define o_base deint_shuf + 128
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [%1+%2*0]
+ pmulhrsw m1, m15, [%1+%2*1]
+ pmulhrsw m2, m15, [%1+%2*2]
+ pmulhrsw m3, m15, [%1+%2*3]
+ pmulhrsw m4, m15, [%1+%2*4]
+ pmulhrsw m5, m15, [%1+%2*5]
+ pmulhrsw m6, m15, [%1+%2*6]
+ pmulhrsw m7, m15, [%1+%2*7]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
+%if %3
+%if %3 == 1
+ vpbroadcastd m15, [o(pw_2896x8)]
+%endif
+ pmulhrsw m8, m15, [%1+%2*0]
+ pmulhrsw m9, m15, [%1+%2*1]
+ pmulhrsw m10, m15, [%1+%2*2]
+ pmulhrsw m11, m15, [%1+%2*3]
+ pmulhrsw m12, m15, [%1+%2*4]
+ pmulhrsw m13, m15, [%1+%2*5]
+ pmulhrsw m14, m15, [%1+%2*6]
+ pmulhrsw m15, [%1+%2*7]
+%else
+ mova m8, [%1+%2*0]
+ mova m9, [%1+%2*1]
+ mova m10, [%1+%2*2]
+ mova m11, [%1+%2*3]
+ mova m12, [%1+%2*4]
+ mova m13, [%1+%2*5]
+ mova m14, [%1+%2*6]
+ mova m15, [%1+%2*7]
+%endif
+%endmacro
+
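+; Splat each input word into a pair and scale it by two packed coefficients at
+; once; used by the *_fast paths, where the other input of each butterfly is zero.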
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
+ vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
+ punpcklwd m%1, m%2, m%2
+ pmulhrsw m%1, m%3
+ vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
+ punpckhwd m%2, m%2
+ pmulhrsw m%2, m%3
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+ %undef cmp
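+ ; eob <= 106 guarantees the coefficients loaded below are all zero, so skip
+ ; them and use the reduced .main_fast path in pass 2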
+ cmp eobd, 106
+ jle .fast
+ LOAD_8ROWS cq+32*1, 32*2
+ call m(idct_16x8_internal_8bpc).main
+ vperm2i128 m11, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ vperm2i128 m4, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ pxor m7, m7
+ REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpcklwd m3, m11, m4
+ punpckhwd m11, m4
+ punpckhwd m4, m5, m6
+ punpcklwd m5, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m5
+ punpckhdq m3, m5
+ punpckhdq m5, m11, m4
+ punpckldq m11, m4
+ punpckldq m4, m7, m1
+ punpckhdq m7, m1
+ punpckhqdq m12, m6, m0
+ punpcklqdq m0, m6 ; out4
+ punpckhqdq m13, m7, m4
+ punpcklqdq m4, m7 ; out5
+ punpckhqdq m14, m3, m2
+ punpcklqdq m2, m3 ; out6
+ punpckhqdq m15, m5, m11
+ punpcklqdq m11, m5 ; out7
+ mova [rsp+32*0], m0
+ mova [rsp+32*1], m4
+ mova [rsp+32*2], m2
+.fast:
+ LOAD_8ROWS cq+32*0, 32*2
+ call m(idct_16x8_internal_8bpc).main
+ vperm2i128 m8, m0, m4, 0x31
+ vinserti128 m0, xm4, 1
+ vperm2i128 m4, m1, m5, 0x31
+ vinserti128 m1, xm5, 1
+ vperm2i128 m5, m2, m6, 0x31
+ vinserti128 m2, xm6, 1
+ vperm2i128 m6, m3, m7, 0x31
+ vinserti128 m3, xm7, 1
+ vpbroadcastd m9, [o(pw_8192)]
+ pxor m7, m7
+ REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
+ punpckhwd m7, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m8, m4
+ punpcklwd m8, m4
+ punpckhwd m4, m5, m6
+ punpcklwd m5, m6
+ punpckhdq m6, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m8, m5
+ punpckhdq m8, m5
+ punpckhdq m5, m3, m4
+ punpckldq m3, m4
+ punpckhdq m4, m7, m1
+ punpckldq m7, m1
+ punpcklqdq m1, m7, m4
+ punpckhqdq m7, m4 ; out9
+ punpckhqdq m4, m2, m8 ; out10
+ punpcklqdq m2, m8
+ punpckhqdq m8, m3, m5
+ punpcklqdq m3, m5
+ punpckhqdq m5, m0, m6 ; out8
+ punpcklqdq m0, m6
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
+ cmp eobd, 106
+ jg .full
+ mova [rsp+32*0], m5
+ mova [rsp+32*1], m7
+ mova [rsp+32*2], m4
+ pmulhrsw m11, m9, m8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call .main_fast
+ jmp .pass2
+.dconly:
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+.full:
+ REPX {pmulhrsw x, m9}, m12, m13, m14, m15
+ pmulhrsw m6, m9, [rsp+32*2]
+ mova [rsp+32*2], m4
+ pmulhrsw m4, m9, [rsp+32*0]
+ mova [rsp+32*0], m5
+ pmulhrsw m5, m9, [rsp+32*1]
+ mova [rsp+32*1], m7
+ pmulhrsw m7, m9, m11
+ pmulhrsw m11, m9, m8
+ call .main
+.pass2:
+ vpbroadcastd m12, [o(pw_2048)]
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m13, m14, m15
+ pmulhrsw m12, [rsp]
+ REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
+ REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
+ mova [rsp+32*0], m4
+ mova [rsp+32*1], m6
+ lea r3, [strideq*3]
+ WRITE_8X4 0, 1, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 [rsp+32*0], 5, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 [rsp+32*1], 7, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 8, 9, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 10, 11, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 12, 13, 4, 6
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 14, 15, 4, 6
+ RET
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ call m(idct_8x16_internal_8bpc).main
+ mova m8, [rsp+gprsize+0*32]
+ mova [rsp+gprsize+0*32], m0
+ mova m9, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+1*32], m1
+ mova m0, [rsp+gprsize+2*32]
+ mova [rsp+gprsize+2*32], m6
+ lea r5, [r6-(o_base)+pw_201_4091x8]
+ ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main
+ call m(idct_8x16_internal_8bpc).main
+ mova m8, [rsp+gprsize+0*32]
+ mova [rsp+gprsize+0*32], m0
+ mova m9, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+1*32], m1
+ mova m0, [rsp+gprsize+2*32]
+ mova [rsp+gprsize+2*32], m6
+ punpcklwd m1, m15, m8 ; in31 in1
+ punpckhwd m8, m15 ; in3 in29
+ punpcklwd m15, m14, m9 ; in27 in5
+ punpckhwd m9, m14 ; in7 in25
+ punpcklwd m14, m13, m0 ; in23 in9
+ punpckhwd m0, m13 ; in11 in21
+ punpcklwd m13, m12, m11 ; in19 in13
+ punpckhwd m11, m12 ; in15 in17
+ ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a
+ ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a
+ ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a
+ ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a
+ ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a
+ ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a
+ ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
+ ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
+.main2:
+ psubsw m6, m1, m11 ; t17 t30
+ paddsw m1, m11 ; t16 t31
+ psubsw m11, m9, m14 ; t18 t29
+ paddsw m9, m14 ; t19 t28
+ psubsw m14, m15, m0 ; t21 t26
+ paddsw m15, m0 ; t20 t27
+ psubsw m0, m8, m13 ; t22 t25
+ paddsw m8, m13 ; t23 t24
+ ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a
+ ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a
+ ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a
+ ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
+ psubsw m13, m1, m9 ; t19a t28a
+ paddsw m1, m9 ; t16a t31a
+ psubsw m9, m8, m15 ; t20a t27a
+ paddsw m8, m15 ; t23a t24a
+ psubsw m15, m6, m11 ; t18 t29
+ paddsw m6, m11 ; t17 t30
+ psubsw m11, m0, m14 ; t21 t26
+ paddsw m0, m14 ; t22 t25
+ ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a
+ ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28
+ ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27
+ ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
+ vbroadcasti128 m12, [o(deint_shuf)]
+ psubsw m14, m1, m8 ; t23 t24
+ paddsw m1, m8 ; t16 t31
+ psubsw m8, m6, m0 ; t22a t25a
+ paddsw m6, m0 ; t17a t30a
+ psubsw m0, m15, m11 ; t21 t26
+ paddsw m15, m11 ; t18 t29
+ psubsw m11, m13, m9 ; t20a t27a
+ paddsw m13, m9 ; t19a t28a
+ REPX {pshufb x, m12}, m1, m6, m15, m13
+ ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a
+ vpbroadcastd m9, [o(pw_m2896_2896)]
+ ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20
+ shufps m9, m14, m8, q1032 ; t23a t22
+ vpblendd m14, m8, 0xcc ; t24a t25
+ shufps m8, m11, m0, q1032 ; t20 t21a
+ vpblendd m11, m0, 0xcc ; t27 t26a
+ punpcklqdq m0, m1, m6 ; t16 t17a
+ punpckhqdq m1, m6 ; t31 t30a
+ psubsw m10, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m6, m3, m14 ; out24 out25
+ paddsw m3, m14 ; out7 out6
+ psubsw m8, m7, m0 ; out16 out17
+ paddsw m7, m0 ; out15 out14
+ mova m0, [rsp+gprsize+0*32]
+ punpcklqdq m12, m13, m15 ; t19a t18
+ punpckhqdq m13, m15 ; t28a t29
+ psubsw m15, m0, m1 ; out31 out30
+ paddsw m0, m1 ; out0 out1
+ mova m1, [rsp+gprsize+1*32]
+ mova [rsp+gprsize+0*32], m6
+ mova m6, [rsp+gprsize+2*32]
+ psubsw m14, m1, m13 ; out28 out29
+ paddsw m1, m13 ; out3 out2
+ psubsw m13, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ psubsw m11, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m9, m6, m12 ; out19 out18
+ paddsw m6, m12 ; out12 out13
+ ret
+
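+; Load two 16-byte (8-coefficient) rows and pack them into a single YMM register.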
+%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
+ vbroadcasti128 m%1, [cq+16*%3]
+ vbroadcasti128 m%2, [cq+16*%4]
+ shufpd m%1, m%2, 0x0c
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 8
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [pw_2048] ; intentionally rip-relative
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m3, m3
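+ ; add the rounded DC value to every pixel, one 32-pixel row per iteration
+ ; (r3d holds the row count)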
+.dconly_loop:
+ mova m1, [dstq]
+ punpckhbw m2, m1, m3
+ punpcklbw m1, m3
+ paddw m2, m0
+ paddw m1, m0
+ packuswb m1, m2
+ mova [dstq], m1
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+ %undef cmp
+ LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2
+ LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3
+ LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6
+ LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ add cq, 16*16
+ LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10
+ LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11
+ LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14
+ LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
+ mova [rsp+32*0], m4
+ mova [rsp+32*1], m5
+ mova [rsp+32*2], m6
+ cmp eobd, 106
+ jg .full
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .pass2
+.full:
+ LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18
+ LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17
+ LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22
+ LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ add cq, 16*8
+ LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26
+ LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25
+ LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30
+ LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+.pass2:
+ vpbroadcastd m12, [o(pw_8192)]
+ REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
+ mova [rsp+32*1], m9
+ mova [rsp+32*2], m10
+ punpckhwd m9, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m3
+ punpcklwd m1, m3
+ punpcklwd m10, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpckhwd m3, m0, m9
+ punpcklwd m0, m9
+ punpckhwd m9, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m10, m4
+ punpckhwd m10, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m9
+ punpckhdq m3, m9
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m10, m5
+ punpckhdq m10, m5
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
+ pmulhrsw m12, [rsp+32*0]
+ mova [rsp+32*0], m8
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m9, 0x31
+ vinserti128 m2, xm9, 1
+ vperm2i128 m7, m3, m10, 0x31
+ vinserti128 m3, xm10, 1
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ lea r2, [strideq*3]
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+ lea r3, [dstq+strideq*4]
+ %define dstq r3
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+ mova m0, [rsp+32*0]
+ mova m1, [rsp+32*1]
+ mova m2, [rsp+32*2]
+ punpckhwd m7, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m2, m1, m11
+ punpcklwd m1, m11
+ punpckhwd m4, m12, m14
+ punpcklwd m12, m14
+ punpckhwd m5, m13, m15
+ punpcklwd m13, m15
+ punpckhwd m3, m0, m7
+ punpcklwd m0, m7
+ punpckhwd m9, m2, m1
+ punpcklwd m2, m1
+ punpcklwd m7, m12, m4
+ punpckhwd m12, m4
+ punpcklwd m4, m5, m13
+ punpckhwd m5, m13
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m3, m9
+ punpckhdq m3, m9
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m12, m5
+ punpckhdq m12, m5
+ vperm2i128 m4, m0, m6, 0x31
+ vinserti128 m0, xm6, 1
+ vperm2i128 m5, m1, m7, 0x31
+ vinserti128 m1, xm7, 1
+ vperm2i128 m6, m2, m9, 0x31
+ vinserti128 m2, xm9, 1
+ vperm2i128 m7, m3, m12, 0x31
+ vinserti128 m3, xm12, 1
+ call m(idct_16x8_internal_8bpc).main2
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ add r0, 16
+ add r3, 16
+ %define dstq r0
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+ %define dstq r3
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+ RET
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
+.loop:
+ mova xm0, [cq+16* 0]
+ mova xm1, [cq+16* 4]
+ vinserti128 m0, [cq+16* 1], 1
+ vinserti128 m1, [cq+16* 5], 1
+ pxor m8, m8
+ mova [cq+32*0], m8
+ mova [cq+32*2], m8
+ add cq, 16*16
+ mova xm2, [cq-16* 8]
+ mova xm3, [cq-16* 4]
+ vinserti128 m2, [cq-16* 7], 1
+ vinserti128 m3, [cq-16* 3], 1
+ mova xm4, [cq+16* 0]
+ mova xm5, [cq+16* 4]
+ vinserti128 m4, [cq+16* 1], 1
+ vinserti128 m5, [cq+16* 5], 1
+ mova xm6, [cq+16* 8]
+ mova xm7, [cq+16*12]
+ vinserti128 m6, [cq+16* 9], 1
+ vinserti128 m7, [cq+16*13], 1
+ REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
+ REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ call .transpose8x8
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4
+ add dstq, strideq
+ sub cq, 16*16-32
+ lea dstq, [dstq+r4*4]
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+ALIGN function_align
+.transpose8x8:
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
+ add cq, 16*8
+ vpbroadcastd m9, [pw_4096]
+ lea r4, [strideq*3]
+ lea r5, [dstq+strideq*4]
+ sub eobd, 107
+.loop:
+ mova xm0, [cq-16*8]
+ mova xm1, [cq-16*7]
+ vinserti128 m0, [cq+16*0], 1
+ vinserti128 m1, [cq+16*1], 1
+ mova xm2, [cq-16*6]
+ mova xm3, [cq-16*5]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m3, [cq+16*3], 1
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm6, [cq-16*2]
+ mova xm7, [cq-16*1]
+ vinserti128 m6, [cq+16*6], 1
+ vinserti128 m7, [cq+16*7], 1
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r4
+ %define dstq r5
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r4
+ add cq, 16*16
+ add r0, 16
+ add r5, 16
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+%define o_base pw_5 + 128
+
+%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
+%if %3
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [%1+%2* 0]
+ pmulhrsw m1, m15, [%1+%2* 1]
+ pmulhrsw m2, m15, [%1+%2* 2]
+ pmulhrsw m3, m15, [%1+%2* 3]
+ pmulhrsw m4, m15, [%1+%2* 4]
+ pmulhrsw m5, m15, [%1+%2* 5]
+ pmulhrsw m6, m15, [%1+%2* 6]
+ pmulhrsw m7, m15, [%1+%2* 7]
+ pmulhrsw m8, m15, [%1+%2* 8]
+ pmulhrsw m9, m15, [%1+%2* 9]
+ pmulhrsw m10, m15, [%1+%2*10]
+ pmulhrsw m11, m15, [%1+%2*11]
+ pmulhrsw m12, m15, [%1+%2*12]
+ pmulhrsw m13, m15, [%1+%2*13]
+ pmulhrsw m14, m15, [%1+%2*14]
+ pmulhrsw m15, [%1+%2*15]
+%else
+ mova m0, [%1+%2* 0]
+ mova m1, [%1+%2* 1]
+ mova m2, [%1+%2* 2]
+ mova m3, [%1+%2* 3]
+ mova m4, [%1+%2* 4]
+ mova m5, [%1+%2* 5]
+ mova m6, [%1+%2* 6]
+ mova m7, [%1+%2* 7]
+ mova m8, [%1+%2* 8]
+ mova m9, [%1+%2* 9]
+ mova m10, [%1+%2*10]
+ mova m11, [%1+%2*11]
+ mova m12, [%1+%2*12]
+ mova m13, [%1+%2*13]
+ mova m14, [%1+%2*14]
+ mova m15, [%1+%2*15]
+%endif
+ mova [rsp], m15
+%if %4
+ pxor m15, m15
+ REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15
+%endif
+%endmacro
+
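+; Combine one buffered odd-half value with an even-half register, round both
+; mirrored outputs and add them to the destination rows at dstq+%6 and r2+%7.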
+%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
+ mova m%4, [%2]
+ paddsw m%3, m%1, m%4
+ psubsw m%1, m%4
+ pmovzxbw m%4, [dstq+%6]
+ pmulhrsw m%3, m%5
+ pmulhrsw m%1, m%5
+ paddw m%3, m%4
+ pmovzxbw m%4, [r2+%7]
+ paddw m%1, m%4
+ packuswb m%3, m%1
+ vpermq m%3, m%3, q3120
+ mova [dstq+%6], xm%3
+ vextracti128 [r2+%7], m%3, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3
+ %undef cmp
+ LOAD_16ROWS cq, 64, 1
+ call m(idct_16x16_internal_8bpc).main
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ lea tmp3q, [tmp1q+32*16]
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_16384)]
+ call .transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [tmp3q-32*4+ 0], xm0
+ vextracti128 [tmp3q+32*0+ 0], m0, 1
+ mova [tmp3q-32*3+ 0], xm2
+ vextracti128 [tmp3q+32*1+ 0], m2, 1
+ mova [tmp3q-32*2+ 0], xm4
+ vextracti128 [tmp3q+32*2+ 0], m4, 1
+ mova [tmp3q-32*1+ 0], xm6
+ vextracti128 [tmp3q+32*3+ 0], m6, 1
+ mova [tmp3q-32*4+16], xm8
+ vextracti128 [tmp3q+32*0+16], m8, 1
+ mova [tmp3q-32*3+16], xm10
+ vextracti128 [tmp3q+32*1+16], m10, 1
+ mova [tmp3q-32*2+16], xm12
+ vextracti128 [tmp3q+32*2+16], m12, 1
+ mova [tmp3q-32*1+16], xm14
+ vextracti128 [tmp3q+32*3+16], m14, 1
+ cmp eobd, 150
+ jg .full
+ vinserti128 m0, m1, xm9, 1
+ vperm2i128 m4, m1, m9, 0x31
+ vinserti128 m2, m5, xm13, 1
+ vperm2i128 m6, m5, m13, 0x31
+ vinserti128 m1, m3, xm11, 1
+ vperm2i128 m5, m3, m11, 0x31
+ vinserti128 m3, m7, xm15, 1
+ vperm2i128 m7, m7, m15, 0x31
+ call .main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+ jmp .idct16
+.dconly:
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.full:
+ mova [tmp1q-32*4], m1
+ mova [tmp1q-32*3], m3
+ mova [tmp1q-32*2], m5
+ mova [tmp1q-32*1], m7
+ mova [tmp1q+32*0], m9
+ mova [tmp1q+32*1], m11
+ mova [tmp1q+32*2], m13
+ mova [tmp1q+32*3], m15
+ LOAD_16ROWS cq+32, 64, 1
+ call m(idct_16x16_internal_8bpc).main
+ lea r2, [tmp3q+32*8]
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_16384)]
+ call .transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [r2-32*4+ 0], xm0
+ vextracti128 [r2+32*0+ 0], m0, 1
+ mova [r2-32*3+ 0], xm2
+ vextracti128 [r2+32*1+ 0], m2, 1
+ mova [r2-32*2+ 0], xm4
+ vextracti128 [r2+32*2+ 0], m4, 1
+ mova [r2-32*1+ 0], xm6
+ vextracti128 [r2+32*3+ 0], m6, 1
+ mova [r2-32*4+16], xm8
+ vextracti128 [r2+32*0+16], m8, 1
+ mova [r2-32*3+16], xm10
+ vextracti128 [r2+32*1+16], m10, 1
+ mova [r2-32*2+16], xm12
+ vextracti128 [r2+32*2+16], m12, 1
+ mova [r2-32*1+16], xm14
+ vextracti128 [r2+32*3+16], m14, 1
+ vinserti128 m8, m1, xm9, 1
+ vperm2i128 m12, m1, m9, 0x31
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp1q+32*0], 1
+ vinserti128 m1, [tmp1q+32*1], 1
+ vinserti128 m10, m5, xm13, 1
+ vperm2i128 m14, m5, m13, 0x31
+ mova xm4, [tmp1q-32*4+16]
+ mova xm5, [tmp1q-32*3+16]
+ vinserti128 m4, [tmp1q+32*0+16], 1
+ vinserti128 m5, [tmp1q+32*1+16], 1
+ vinserti128 m9, m3, xm11, 1
+ vperm2i128 m13, m3, m11, 0x31
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp1q+32*2], 1
+ vinserti128 m3, [tmp1q+32*3], 1
+ vinserti128 m11, m7, xm15, 1
+ vperm2i128 m15, m7, m15, 0x31
+ mova xm6, [tmp1q-32*2+16]
+ mova xm7, [tmp1q-32*1+16]
+ vinserti128 m6, [tmp1q+32*2+16], 1
+ vinserti128 m7, [tmp1q+32*3+16], 1
+ call .main_oddhalf
+ LOAD_8ROWS_H r2-32*4, 32
+.idct16:
+ LOAD_8ROWS tmp3q-32*4, 32
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ call .pass2_end
+ RET
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; lower half is zero
+ mova [rsp+gprsize+32*1], m7
+ pxor m7, m7
+ mova [rsp+gprsize+32*0], m7
+ mova [rsp+gprsize+32*2], m7
+ vpbroadcastd m11, [o(pw_3703x8)]
+ vpbroadcastd m7, [o(pw_1751x8)]
+ vpbroadcastd m12, [o(pw_m1380x8)]
+ vpbroadcastd m8, [o(pw_3857x8)]
+ vpbroadcastd m13, [o(pw_3973x8)]
+ vpbroadcastd m15, [o(pw_995x8)]
+ pmulhrsw m11, m4 ; t29a
+ pmulhrsw m4, m7 ; t18a
+ pmulhrsw m12, m3 ; t19a
+ pmulhrsw m3, m8 ; t28a
+ pmulhrsw m13, m2 ; t27a
+ pmulhrsw m2, m15 ; t20a
+ vpbroadcastd m10, [o(pw_m2106x8)]
+ vpbroadcastd m7, [o(pw_3513x8)]
+ vpbroadcastd m9, [o(pw_3290x8)]
+ vpbroadcastd m8, [o(pw_2440x8)]
+ vpbroadcastd m14, [o(pw_m601x8)]
+ vpbroadcastd m15, [o(pw_4052x8)]
+ pmulhrsw m10, m5 ; t21a
+ pmulhrsw m5, m7 ; t26a
+ pmulhrsw m9, m6 ; t25a
+ pmulhrsw m6, m8 ; t22a
+ pmulhrsw m14, m1 ; t23a
+ pmulhrsw m1, m15 ; t24a
+ vpbroadcastd m15, [o(pd_2048)]
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ mova [rsp+gprsize+32*0], m15
+ mova [rsp+gprsize+32*1], m7
+ mova [rsp+gprsize+32*2], m8
+ vpbroadcastd m15, [o(pd_2048)]
+ ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a
+.main2:
+ psubsw m7, m12, m4 ; t18
+ paddsw m12, m4 ; t19
+ psubsw m4, m2, m10 ; t21
+ paddsw m2, m10 ; t20
+ psubsw m10, m14, m6 ; t22
+ paddsw m14, m6 ; t23
+ psubsw m6, m1, m9 ; t25
+ paddsw m1, m9 ; t24
+ psubsw m9, m13, m5 ; t26
+ paddsw m13, m5 ; t27
+ psubsw m5, m3, m11 ; t29
+ paddsw m3, m11 ; t28
+ ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a
+ ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a
+ psubsw m8, m14, m2 ; t20a
+ paddsw m14, m2 ; t23a
+ psubsw m2, m1, m13 ; t27a
+ paddsw m1, m13 ; t24a
+ psubsw m13, m6, m9 ; t21
+ paddsw m6, m9 ; t22
+ psubsw m9, m10, m4 ; t26
+ paddsw m10, m4 ; t25
+ ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27
+ ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a
+ mova m4, [rsp+gprsize+32*0] ; in31
+ mova [rsp+gprsize+32*0], m6 ; t22
+ mova m6, [rsp+gprsize+32*1] ; in15
+ mova [rsp+gprsize+32*1], m14 ; t23a
+ mova m14, [rsp+gprsize+32*2] ; in17
+ mova [rsp+gprsize+32*2], m1 ; t24a
+ ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a
+ psubsw m1, m0, m14 ; t17
+ paddsw m0, m14 ; t16
+ psubsw m14, m4, m6 ; t30
+ paddsw m4, m6 ; t31
+ ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a
+ psubsw m6, m0, m12 ; t19a
+ paddsw m0, m12 ; t16a
+ psubsw m12, m4, m3 ; t28a
+ paddsw m4, m3 ; t31a
+ psubsw m3, m14, m5 ; t18
+ paddsw m14, m5 ; t17
+ psubsw m5, m1, m7 ; t29
+ paddsw m1, m7 ; t30
+ ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a
+ ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28
+ psubsw m7, m1, m10 ; t25a
+ paddsw m1, m10 ; t30a
+ psubsw m10, m5, m9 ; t21
+ paddsw m5, m9 ; t18
+ psubsw m9, m12, m2 ; t20a
+ paddsw m12, m2 ; t19a
+ psubsw m2, m3, m13 ; t26
+ paddsw m3, m13 ; t29
+ psubsw m13, m6, m8 ; t27a
+ paddsw m6, m8 ; t28a
+ mova [tmp1q-32*2], m5
+ mova [tmp1q-32*1], m12
+ mova [tmp2q+32*0], m6
+ mova [tmp2q+32*1], m3
+ mova [tmp2q+32*2], m1
+ mova m5, [rsp+gprsize+32*0] ; t22
+ mova m6, [rsp+gprsize+32*1] ; t23
+ mova m3, [rsp+gprsize+32*2] ; t24a
+ psubsw m1, m14, m5 ; t22a
+ paddsw m14, m5 ; t17a
+ psubsw m5, m0, m6 ; t23
+ paddsw m0, m6 ; t16
+ psubsw m6, m4, m3 ; t24
+ paddsw m4, m3 ; t31
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m3, [o(pw_2896_2896)]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m14
+ mova [tmp2q+32*3], m4
+ ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27
+ ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a
+ ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25
+ ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a
+ mova [tmp1q+32*0], m13
+ mova [tmp1q+32*1], m2
+ mova [tmp1q+32*2], m7
+ mova [tmp1q+32*3], m6
+ mova [tmp2q-32*4], m5
+ mova [tmp2q-32*3], m1
+ mova [tmp2q-32*2], m10
+ mova [tmp2q-32*1], m9
+ ret
+ALIGN function_align
+.transpose_2x8x8_round:
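+ ; transpose two 8x8 word blocks (m0-m7, with rows 6/7 spilled to the stack,
+ ; and m8-m15) and apply the rounding factor passed in m7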
+ punpckhwd m6, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m6, m9
+ punpckhdq m6, m9
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m6
+ punpcklqdq m14, m6
+ pmulhrsw m6, m7, [rsp+gprsize+32*0]
+ REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
+ pmulhrsw m7, [rsp+gprsize+32*1]
+ mova [rsp+gprsize+32*0], m15
+ punpckhwd m15, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m15, m1
+ punpckhdq m15, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m15
+ punpcklqdq m6, m15
+ ret
+ALIGN function_align
+.pass2_end:
+ mova [rsp+gprsize+32*0], m7
+ mova [rsp+gprsize+32*2], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m1, [rsp+gprsize+32*1]
+ IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0
+ add dstq, strideq
+ sub r2, strideq
+ mova m7, [rsp+gprsize+32*0]
+ mova m1, [rsp+gprsize+32*2]
+ IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
+ IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
+ IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
+ IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0
+ ret
+
+; Perform the final sumsub step and YMM lane shuffling
+%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
+ mova m%3, [tmp2q+32*( 3-%1)]
+ psubsw m%4, m%1, m%3
+ paddsw m%1, m%3
+ mova m%3, [tmp1q+32*(11-%2)]
+ mova [tmp1q+32*(11-%2)+16], xm%4
+ vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
+ paddsw m%4, m%2, m%3
+ psubsw m%2, m%3
+ mova [tmp1q+32*(11-%2)], xm%2
+ vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
+ vperm2i128 m%2, m%1, m%4, 0x31
+ vinserti128 m%1, xm%4, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+ vpbroadcastd m15, [o(pw_2896x8)]
+ pmulhrsw m0, m15, [cq+32* 1]
+ pmulhrsw m1, m15, [cq+32* 3]
+ pmulhrsw m2, m15, [cq+32* 5]
+ pmulhrsw m3, m15, [cq+32* 7]
+ pmulhrsw m4, m15, [cq+32* 9]
+ pmulhrsw m5, m15, [cq+32*11]
+ pmulhrsw m6, m15, [cq+32*13]
+ pmulhrsw m7, m15, [cq+32*15]
+ pmulhrsw m8, m15, [cq+32*17]
+ pmulhrsw m9, m15, [cq+32*19]
+ pmulhrsw m10, m15, [cq+32*21]
+ pmulhrsw m11, m15, [cq+32*23]
+ pmulhrsw m12, m15, [cq+32*25]
+ pmulhrsw m13, m15, [cq+32*27]
+ pmulhrsw m14, m15, [cq+32*29]
+ pmulhrsw m15, [cq+32*31]
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_16ROWS cq+32*0, 32*2, 1, 0
+ pxor m15, m15
+ mov r3d, 8
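+ ; clear the entire 32x16 coefficient buffer (8 x 128 = 1024 bytes)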
+.zero_loop:
+ mova [cq+32*0], m15
+ mova [cq+32*1], m15
+ mova [cq+32*2], m15
+ mova [cq+32*3], m15
+ add cq, 32*4
+ dec r3d
+ jg .zero_loop
+ call m(idct_16x16_internal_8bpc).main
+ call .pass1_end
+ lea r2, [strideq*3]
+ mov r3, dstq
+.pass2:
+ vpbroadcastd m7, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ call m(idct_16x16_internal_8bpc).main
+ mova [rsp+32*2], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ REPX {pmulhrsw x, m15}, m2, m3, m0
+ WRITE_16X2 2, 3, 1, 2, strideq*2, r2
+ pmulhrsw m1, m15, [rsp+32*1]
+ WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m4, m5, m6, m7
+ WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 2, 3, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m8, m9, m10, m11
+ WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 2, 3, strideq*2, r2
+ lea dstq, [dstq+strideq*4]
+ REPX {pmulhrsw x, m15}, m11, m12, m13, m14
+ pmulhrsw m15, [rsp+32*2]
+ WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 2, 3, strideq*2, r2
+ test r3, r3
+ jnz .right_half
+ RET
+.right_half:
+ LOAD_8ROWS tmp1q-32*4, 32
+ LOAD_8ROWS_H tmp2q-32*4, 32
+ lea dstq, [r3+16]
+ xor r3d, r3d
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ jmp .pass2
+ALIGN function_align
+.pass1_end:
+ mova [rsp+gprsize+32*0], m9
+ IDCT32_PASS1_END 0, 8, 1, 9
+ IDCT32_PASS1_END 2, 10, 1, 9
+ IDCT32_PASS1_END 3, 11, 1, 9
+ IDCT32_PASS1_END 4, 12, 1, 9
+ IDCT32_PASS1_END 5, 13, 1, 9
+ IDCT32_PASS1_END 6, 14, 1, 9
+ IDCT32_PASS1_END 7, 15, 1, 9
+ mova m1, [rsp+gprsize+32*1]
+ mova m9, [rsp+gprsize+32*0]
+ mova [rsp+gprsize+32*0], m6
+ mova [rsp+gprsize+32*1], m7
+ IDCT32_PASS1_END 1, 9, 6, 7
+ ret
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob
+%undef cmp
+ lea r6, [o_base]
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m12, [o(pw_8192)]
+ cmp eobd, 43 ; if (eob > 43)
+ setg r4b ; iteration_count++
+ cmp eobd, 150 ; if (eob > 150)
+ setg al ; iteration_count++
+ add eobd, -279 ; if (eob > 278)
+ adc r4b, al ; iteration_count++
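+ ; r4b = number of eob thresholds exceeded (0-3); the loop below runs r4b+1 times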
+ lea r3, [strideq*3]
+ mov r6, cq
+ paddw m11, m12, m12 ; pw_16384
+.loop:
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
+ REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ dec r4b
+ jge .loop
+ sub cq, 32
+ pxor m0, m0
+ mov r0d, 8
+ cmp cq, r6
+ ja .zero_loop
+.zero_loop_half:
+ mova [r6+64*0], m0
+ mova [r6+64*1], m0
+ add r6, 64*4
+ mova [r6-64*2], m0
+ mova [r6-64*1], m0
+ sub r0d, 2
+ jg .zero_loop_half
+ RET
+.zero_loop:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ mova [r6+32*3], m0
+ add r6, 32*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob
+%undef cmp
+ lea r6, [o_base]
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m10, [o(pw_1697x16)]
+ vpbroadcastd m11, [o(pw_2048)]
+ cmp eobd, 35 ; if (eob > 35)
+ setg r4b ; iteration_count++
+ cmp eobd, 150 ; if (eob > 150)
+ setg r3b ; iteration_count += 2
+ lea r4d, [r4+r3*2]
+ lea r3, [strideq*3]
+ mov r5, dstq
+ mov r6, cq
+.loop:
+ mova xm0, [cq+32* 0]
+ mova xm1, [cq+32* 1]
+ vinserti128 m0, [cq+32* 8], 1
+ vinserti128 m1, [cq+32* 9], 1
+ mova xm2, [cq+32* 2]
+ mova xm3, [cq+32* 3]
+ vinserti128 m2, [cq+32*10], 1
+ vinserti128 m3, [cq+32*11], 1
+ mova xm4, [cq+32* 4]
+ mova xm5, [cq+32* 5]
+ vinserti128 m4, [cq+32*12], 1
+ vinserti128 m5, [cq+32*13], 1
+ mova xm6, [cq+32* 6]
+ mova xm7, [cq+32* 7]
+ vinserti128 m6, [cq+32*14], 1
+ vinserti128 m7, [cq+32*15], 1
+ REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ dec r4b
+ jl .ret
+ test r4b, 1
+ jz .loop
+ add cq, 32*15
+ lea dstq, [r5+16]
+ jmp .loop
+.ret:
+ sub cd, eax
+ pxor m0, m0
+ add cd, 384
+.zero_loop:
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ mova [r6+32*3], m0
+ add r6, 32*4
+ sub cd, 128
+ jge .zero_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3, tmp4
+ %undef cmp
+ lea tmp1q, [rsp+32*7]
+ lea tmp2q, [tmp1q+32*8]
+ sub eobd, 136
+ mov tmp4d, eobd
+.pass1_loop:
+ LOAD_8ROWS cq+64*1, 64*2
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+ test tmp4d, tmp4d
+ jl .fast
+ LOAD_8ROWS_H cq+64*17, 64*2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_8ROWS_H cq+64*16, 64*2
+ pxor m0, m0
+ REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ mova [rsp], m15
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ LOAD_8ROWS cq+64*0, 64*2
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ lea tmp3q, [tmp1q+32*32]
+ mova m15, [rsp]
+ mova [tmp3q-32*4], m0
+ mova [tmp3q-32*3], m2
+ mova [tmp3q-32*2], m4
+ mova [tmp3q-32*1], m6
+ mova [tmp3q+32*0], m8
+ mova [tmp3q+32*1], m10
+ mova [tmp3q+32*2], m12
+ mova [tmp3q+32*3], m14
+ add tmp3q, 32*8
+ mova [tmp3q-32*4], m1
+ mova [tmp3q-32*3], m3
+ mova [tmp3q-32*2], m5
+ mova [tmp3q-32*1], m7
+ mova [tmp3q+32*0], m9
+ mova [tmp3q+32*1], m11
+ mova [tmp3q+32*2], m13
+ mova [tmp3q+32*3], m15
+ vpbroadcastd m9, [o(pw_8192)]
+ pmulhrsw m0, m9, [tmp1q-32*4]
+ pmulhrsw m1, m9, [tmp1q-32*3]
+ pmulhrsw m2, m9, [tmp1q-32*2]
+ pmulhrsw m3, m9, [tmp1q-32*1]
+ pmulhrsw m4, m9, [tmp1q+32*0]
+ pmulhrsw m5, m9, [tmp1q+32*1]
+ pmulhrsw m6, m9, [tmp1q+32*2]
+ pmulhrsw m7, m9, [tmp1q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q-32*4], m0
+ pmulhrsw m0, m9, [tmp2q-32*4]
+ mova [tmp2q-32*4], m1
+ pmulhrsw m1, m9, [tmp2q-32*3]
+ mova [tmp1q-32*3], m2
+ pmulhrsw m2, m9, [tmp2q-32*2]
+ mova [tmp2q-32*3], m3
+ pmulhrsw m3, m9, [tmp2q-32*1]
+ mova [tmp1q-32*2], m4
+ pmulhrsw m4, m9, [tmp2q+32*0]
+ mova [tmp2q-32*2], m5
+ pmulhrsw m5, m9, [tmp2q+32*1]
+ mova [tmp1q-32*1], m6
+ pmulhrsw m6, m9, [tmp2q+32*2]
+ mova [tmp2q-32*1], m7
+ pmulhrsw m7, m9, [tmp2q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add cq, 32
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add eobd, 0x80000000
+ jnc .pass1_loop
+ add tmp1q, 32*24
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ test tmp4d, tmp4d
+ jge .pass2_loop
+ add tmp1q, 32*16
+ add tmp2q, 32*16
+ add tmp3q, 32*16
+.pass2_loop:
+ LOAD_8ROWS tmp2q-32*4, 32
+ test tmp4d, tmp4d
+ jl .fast2
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ sub tmp3q, 32*8
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ sub tmp3q, 32*16
+ jmp .pass2_loop_end
+.fast2:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ sub tmp3q, 32*24
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+.pass2_loop_end:
+ LOAD_8ROWS tmp3q-32*4, 32
+ mova [rsp], m15
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+ lea tmp3q, [tmp1q-32*32]
+ cmp tmp2q, tmp3q
+ jb .ret
+ sub tmp2q, 32*32
+ sub dstq, r3
+ lea r2, [r2+r3+16]
+ add dstq, 16
+ jmp .pass2_loop
+.ret:
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob
+ %undef cmp
+ vpbroadcastd m9, [pw_8192]
+ sub eobd, 136 ; if (eob < 136)
+ shr eobd, 30 ; topleft 16x16 only
+ lea eobd, [eobq*2-8]
+ lea r4, [strideq*3]
+ mov r5, dstq
+ lea r6, [cq+32]
+.loop:
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+ WRITE_16X2 2, 3, 0, 1, strideq*2, r4
+ lea dstq, [dstq+strideq*4]
+ WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 0, 1, strideq*2, r4
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ inc eobd
+ jz .ret
+ test eobd, 3
+ jnz .loop
+ add cq, 64*15
+ lea dstq, [r5+16]
+ jmp .loop
+.ret:
+ pxor m0, m0
+ mov r0d, 16
+ cmp cq, r6
+ jne .zero_loop
+.zero_loop_topleft:
+ mova [r6-32*1], m0
+ mova [r6+32*1], m0
+ mova [r6+32*3], m0
+ mova [r6+32*5], m0
+ add r6, 64*4
+ sub r0d, 4
+ jg .zero_loop_topleft
+ RET
+.zero_loop:
+ mova [r6-32*1], m0
+ mova [r6+32*0], m0
+ mova [r6+32*1], m0
+ mova [r6+32*2], m0
+ add r6, 32*4
+ dec r0d
+ jg .zero_loop
+ RET
+
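+; Final butterfly stage of the 64-point idct: combine the buffered idct16 and
+; idct32 partial results with the idct64 second-half terms in the src
+; registers, yielding rows out0+n, out31-n, out32+n and out63-n. With 6
+; arguments (pass 1) the results go back to the temporary buffer; with 10
+; arguments (pass 2) they are rounded by m14, added to the destination
+; pixels at the given offsets and packed back to 8-bit.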
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+ mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
+ mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
+%else
+ mova m%5, [tmp1q-32*(45-%1)]
+ mova m%4, [tmp2q-32*(20+%1)]
+%endif
+ psubsw m%6, m%5, m%4 ; idct32 out31-n
+ paddsw m%5, m%4 ; idct32 out 0+n
+ psubsw m%4, m%6, m%3 ; out32+n
+ paddsw m%6, m%3 ; out31-n
+ psubsw m%3, m%5, m%2 ; out63-n
+ paddsw m%5, m%2 ; out 0+n
+%if %0 == 6 ; pass 1
+%if %1 & 1
+ mova [tmp2q-32*(19-%1)], m%4
+ mova [tmp1q-32*(14+%1)], m%6
+ mova [tmp1q+32*(18-%1)], m%3
+ mova [tmp2q-32*(51-%1)], m%5
+%else
+ mova [tmp1q-32*(13-%1)], m%4
+ mova [tmp2q-32*(20+%1)], m%6
+ mova [tmp2q+32*(12-%1)], m%3
+ mova [tmp1q-32*(45-%1)], m%5
+%endif
+%else ; pass 2
+ REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
+%if %1 & 1
+ %define %%d0 r2
+ %define %%d1 dstq
+%else
+ %define %%d0 dstq
+ %define %%d1 r2
+%endif
+ pmovzxbw m%2, [%%d0+%9 ]
+ paddw m%2, m%4
+ pmovzxbw m%4, [%%d1+%8 ]
+ paddw m%4, m%6
+ pmovzxbw m%6, [%%d1+%10]
+ paddw m%3, m%6
+ pmovzxbw m%6, [%%d0+%7 ]
+ paddw m%5, m%6
+ packuswb m%2, m%4
+ packuswb m%3, m%5
+ vpermq m%2, m%2, q3120
+ vpermq m%3, m%3, q3120
+ mova [%%d0+%9 ], xm%2
+ vextracti128 [%%d1+%8 ], m%2, 1
+ mova [%%d1+%10], xm%3
+ vextracti128 [%%d0+%7 ], m%3, 1
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.normal:
+ PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+ %undef cmp
+ lea tmp1q, [rsp+32*23]
+ lea tmp2q, [tmp1q+32*24]
+ sub eobd, 151
+ mov r7d, eobd
+.pass1_loop:
+ LOAD_16ROWS cq, 64
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ mova m15, [rsp+32*0]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m2
+ mova [tmp1q-32*2], m4
+ mova [tmp1q-32*1], m6
+ mova [tmp1q+32*0], m8
+ mova [tmp1q+32*1], m10
+ mova [tmp1q+32*2], m12
+ mova [tmp1q+32*3], m14
+ mova [tmp2q-32*4], m1
+ mova [tmp2q-32*3], m3
+ mova [tmp2q-32*2], m5
+ mova [tmp2q-32*1], m7
+ mova [tmp2q+32*0], m9
+ mova [tmp2q+32*1], m11
+ mova [tmp2q+32*2], m13
+ mova [tmp2q+32*3], m15
+ add cq, 32
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ add eobd, 0x80000000
+ jnc .pass1_loop
+ lea r2, [rsp+32*23]
+ mova xm0, [r2-32*4+ 0]
+ mova xm1, [r2-32*2+ 0]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m1, [r2+32*2+ 0], 1
+ mova xm2, [r2-32*4+16]
+ mova xm3, [r2-32*2+16]
+ vinserti128 m2, [r2+32*0+16], 1
+ vinserti128 m3, [r2+32*2+16], 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ test r7d, r7d
+ jl .fast
+ lea r3, [r2+32*8]
+ mova xm4, [r3-32*4+ 0]
+ mova xm5, [r3-32*2+ 0]
+ vinserti128 m4, [r3+32*0+ 0], 1
+ vinserti128 m5, [r3+32*2+ 0], 1
+ mova xm6, [r3-32*4+16]
+ mova xm7, [r3-32*2+16]
+ vinserti128 m6, [r3+32*0+16], 1
+ vinserti128 m7, [r3+32*2+16], 1
+.fast:
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*7]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova xm0, [r2-32*3+ 0]
+ mova xm1, [r2-32*1+ 0]
+ vinserti128 m0, [r2+32*1+ 0], 1
+ vinserti128 m1, [r2+32*3+ 0], 1
+ mova xm2, [r2-32*3+16]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m2, [r2+32*1+16], 1
+ vinserti128 m3, [r2+32*3+16], 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r7d, r7d
+ jl .fast2
+ mova xm4, [r3-32*3+ 0]
+ mova xm5, [r3-32*1+ 0]
+ vinserti128 m4, [r3+32*1+ 0], 1
+ vinserti128 m5, [r3+32*3+ 0], 1
+ mova xm6, [r3-32*3+16]
+ mova xm7, [r3-32*1+16]
+ vinserti128 m6, [r3+32*1+16], 1
+ vinserti128 m7, [r3+32*3+16], 1
+.fast2:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ add r2, 32*24
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova xm0, [r2-32*4+ 0]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m3, [r2+32*3+16], 1
+ mova xm4, [r2-32*4+16]
+ mova xm7, [r2-32*1+ 0]
+ vinserti128 m4, [r2+32*0+16], 1
+ vinserti128 m7, [r2+32*3+ 0], 1
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r7d, r7d
+ jl .fast3
+ add r3, 32*24
+ mova xm1, [r3-32*1+16]
+ mova xm2, [r3-32*4+ 0]
+ vinserti128 m1, [r3+32*3+16], 1
+ vinserti128 m2, [r3+32*0+ 0], 1
+ mova xm5, [r3-32*1+ 0]
+ mova xm6, [r3-32*4+16]
+ vinserti128 m5, [r3+32*3+ 0], 1
+ vinserti128 m6, [r3+32*0+16], 1
+.fast3:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova xm0, [r2-32*2+ 0]
+ mova xm3, [r2-32*3+16]
+ vinserti128 m0, [r2+32*2+ 0], 1
+ vinserti128 m3, [r2+32*1+16], 1
+ mova xm4, [r2-32*2+16]
+ mova xm7, [r2-32*3+ 0]
+ vinserti128 m4, [r2+32*2+16], 1
+ vinserti128 m7, [r2+32*1+ 0], 1
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r7d, r7d
+ jl .fast4
+ mova xm1, [r3-32*3+16]
+ mova xm2, [r3-32*2+ 0]
+ vinserti128 m1, [r3+32*1+16], 1
+ vinserti128 m2, [r3+32*2+ 0], 1
+ mova xm5, [r3-32*3+ 0]
+ mova xm6, [r3-32*2+16]
+ vinserti128 m5, [r3+32*1+ 0], 1
+ vinserti128 m6, [r3+32*2+16], 1
+.fast4:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ RET
+ALIGN function_align
+%define o_base idct64_mul - 8
+cglobal_label .main_part1
+ ; idct64 steps 1-5:
+ ; in1/31/17/15/ 9/23/25/ 7 ->
+ ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
+ ; in5/27/21/11/13/19/29/ 3 ->
+ ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
+ vpbroadcastd m11, [o(idct64_mul+4* 0)]
+ vpbroadcastd m13, [o(idct64_mul+4* 1)]
+ vpbroadcastd m10, [o(idct64_mul+4* 4)]
+ vpbroadcastd m12, [o(idct64_mul+4* 5)]
+ pmulhrsw m11, m0 ; t63a
+ pmulhrsw m0, m13 ; t32a
+ pmulhrsw m10, m1 ; t62a
+ pmulhrsw m1, m12 ; t33a
+ vpbroadcastd m9, [o(idct64_mul+4* 8)]
+ vpbroadcastd m13, [o(idct64_mul+4* 9)]
+ vpbroadcastd m8, [o(idct64_mul+4*12)]
+ vpbroadcastd m12, [o(idct64_mul+4*13)]
+ pmulhrsw m9, m2 ; t61a
+ pmulhrsw m2, m13 ; t34a
+ pmulhrsw m8, m3 ; t60a
+ pmulhrsw m3, m12 ; t35a
+ psubsw m12, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m8, m9 ; t61
+ paddsw m8, m9 ; t60
+ psubsw m9, m11, m10 ; t62
+ paddsw m11, m10 ; t63
+ ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a
+ vpbroadcastd m14, [o(pw_401_4076)]
+ ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
+ psubsw m10, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m11, m8 ; t60a
+ paddsw m11, m8 ; t63a
+ psubsw m8, m9, m2 ; t34
+ paddsw m9, m2 ; t33
+ psubsw m2, m12, m1 ; t61
+ paddsw m12, m1 ; t62
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m9
+ mova [tmp2q+32*2], m12
+ mova [tmp2q+32*3], m11
+ vpbroadcastd m13, [o(pw_m4017_799)]
+ vpbroadcastd m14, [o(pw_799_4017)]
+ ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a
+ ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp2q+32*0], m10
+ mova [tmp2q+32*1], m8
+ vpbroadcastd m3, [o(idct64_mul+4*16)]
+ vpbroadcastd m11, [o(idct64_mul+4*17)]
+ vpbroadcastd m2, [o(idct64_mul+4*20)]
+ vpbroadcastd m10, [o(idct64_mul+4*21)]
+ vpbroadcastd m1, [o(idct64_mul+4*24)]
+ vpbroadcastd m9, [o(idct64_mul+4*25)]
+ vpbroadcastd m0, [o(idct64_mul+4*28)]
+ vpbroadcastd m8, [o(idct64_mul+4*29)]
+ pmulhrsw m3, m4 ; t59a
+ pmulhrsw m4, m11 ; t36a
+ pmulhrsw m2, m5 ; t58a
+ pmulhrsw m5, m10 ; t37a
+ pmulhrsw m1, m6 ; t57a
+ pmulhrsw m6, m9 ; t38a
+ pmulhrsw m0, m7 ; t56a
+ pmulhrsw m7, m8 ; t39a
+ psubsw m8, m4, m5 ; t37
+ paddsw m4, m5 ; t36
+ psubsw m5, m7, m6 ; t38
+ paddsw m7, m6 ; t39
+ psubsw m6, m0, m1 ; t57
+ paddsw m0, m1 ; t56
+ psubsw m1, m3, m2 ; t58
+ paddsw m3, m2 ; t59
+ ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a
+ vpbroadcastd m10, [o(pw_3166_2598)]
+ ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a
+ psubsw m2, m7, m4 ; t36a
+ paddsw m7, m4 ; t39a
+ psubsw m4, m0, m3 ; t59a
+ paddsw m0, m3 ; t56a
+ psubsw m3, m6, m1 ; t37
+ paddsw m6, m1 ; t38
+ psubsw m1, m5, m8 ; t58
+ paddsw m5, m8 ; t57
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ mova [tmp2q-32*4], m0
+ mova [tmp2q-32*3], m5
+ vpbroadcastd m6, [o(pw_m799_m4017)]
+ vpbroadcastd m7, [o(pw_m4017_799)]
+ ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59
+ ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m1
+ mova [tmp2q-32*2], m3
+ mova [tmp2q-32*1], m2
+ ret
+%define o_base pw_5 + 128
+.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
+ sub r6, o_idct64_offset + 8
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ vpbroadcastd m14, [o(pw_m2896_2896)]
+.main_part2_pass1_loop:
+ call .main_part2_internal
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7
+ cmp tmp1q, tmp2q
+ jne .main_part2_pass1_loop
+ ret
+cglobal_label .main_part2_internal
+ mova m0, [tmp1q-32*12] ; t32a
+ mova m6, [tmp2q-32*13] ; t39a
+ mova m1, [tmp1q-32* 4] ; t40a
+ mova m5, [tmp2q+32* 3] ; t55a
+ add tmp1q, 32
+ sub tmp2q, 32
+ mova m2, [tmp1q+32* 3] ; t48a
+ mova m4, [tmp2q-32* 4] ; t47a
+ mova m3, [tmp1q+32*11] ; t56a
+ mova m7, [tmp2q+32*12] ; t63a
+ psubsw m8, m0, m6 ; t39
+ paddsw m0, m6 ; t32
+ psubsw m6, m4, m1 ; t40
+ paddsw m4, m1 ; t47
+ psubsw m1, m2, m5 ; t55
+ paddsw m2, m5 ; t48
+ psubsw m5, m7, m3 ; t56
+ paddsw m7, m3 ; t63
+ ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a
+ vpbroadcastd m9, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a
+ psubsw m3, m0, m4 ; t47a
+ paddsw m0, m4 ; t32a
+ psubsw m4, m7, m2 ; t48a
+ paddsw m7, m2 ; t63a
+ psubsw m2, m5, m1 ; t40
+ paddsw m5, m1 ; t39
+ psubsw m1, m8, m6 ; t55
+ paddsw m8, m6 ; t56
+ ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48
+ ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a
+ ret
+.main_part2_pass2:
+ sub r6, o_idct64_offset + 8
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ lea r9, [strideq*5] ; stride*5
+ lea r3, [r9+strideq*1] ; stride*6
+ lea r7, [r9+strideq*2] ; stride*7
+ lea r8, [r3+strideq*2] ; stride*8
+ lea r2, [dstq+r7]
+.main_part2_pass2_loop:
+ vpbroadcastd m14, [o(pw_m2896_2896)]
+ call .main_part2_internal
+ vpbroadcastd m14, [o(pw_2048)]
+ IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
+ IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8
+ IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
+ IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
+ add dstq, strideq
+ sub r2, strideq
+ cmp tmp1q, tmp2q
+ jne .main_part2_pass2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 16
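+; Shared DC-only path (also entered from the 64x32 and 64x64 functions):
+; finish scaling the DC value, broadcast it and add it to r3d rows of
+; 64 pixels each.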
+.dconly:
+ pmulhrsw xm0, xm2
+ movd xm2, [o(pw_2048)]
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ pxor m1, m1
+.dconly_loop:
+ mova m2, [dstq+32*0]
+ mova m3, [dstq+32*1]
+ punpckhbw m4, m2, m1
+ punpcklbw m2, m1
+ punpckhbw m5, m3, m1
+ punpcklbw m3, m1
+ paddw m4, m0
+ paddw m2, m0
+ paddw m5, m0
+ paddw m3, m0
+ packuswb m2, m4
+ packuswb m3, m5
+ mova [dstq+32*0], m2
+ mova [dstq+32*1], m3
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+ LOAD_8ROWS cq+32*0, 32*4
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*7]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+32*2, 32*4
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [cq+32* 1]
+ mova m1, [cq+32*31]
+ mova m2, [cq+32*17]
+ mova m3, [cq+32*15]
+ mova m4, [cq+32* 9]
+ mova m5, [cq+32*23]
+ mova m6, [cq+32*25]
+ mova m7, [cq+32* 7]
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [cq+32* 5]
+ mova m1, [cq+32*27]
+ mova m2, [cq+32*21]
+ mova m3, [cq+32*11]
+ mova m4, [cq+32*13]
+ mova m5, [cq+32*19]
+ mova m6, [cq+32*29]
+ mova m7, [cq+32* 3]
+ pxor m8, m8
+ REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*36
+ lea r2, [strideq*3]
+ mov tmp2d, 4
+.pass2_loop:
+ lea r3, [tmp1q-32*8]
+ mova xm0, [r3 -32*4]
+ mova xm1, [r3 -32*3]
+ vinserti128 m0, [tmp1q-32*4], 1
+ vinserti128 m1, [tmp1q-32*3], 1
+ mova xm2, [r3 -32*2]
+ mova xm3, [r3 -32*1]
+ vinserti128 m2, [tmp1q-32*2], 1
+ vinserti128 m3, [tmp1q-32*1], 1
+ mova xm4, [r3 +32*0]
+ mova xm5, [r3 +32*1]
+ vinserti128 m4, [tmp1q+32*0], 1
+ vinserti128 m5, [tmp1q+32*1], 1
+ mova xm6, [r3 +32*2]
+ mova xm7, [r3 +32*3]
+ vinserti128 m6, [tmp1q+32*2], 1
+ vinserti128 m7, [tmp1q+32*3], 1
+ mova xm8, [r3 -32*4+16]
+ mova xm9, [r3 -32*3+16]
+ vinserti128 m8, [tmp1q-32*4+16], 1
+ vinserti128 m9, [tmp1q-32*3+16], 1
+ mova xm10, [r3 -32*2+16]
+ mova xm11, [r3 -32*1+16]
+ vinserti128 m10, [tmp1q-32*2+16], 1
+ vinserti128 m11, [tmp1q-32*1+16], 1
+ mova xm12, [r3 +32*0+16]
+ mova xm13, [r3 +32*1+16]
+ vinserti128 m12, [tmp1q+32*0+16], 1
+ vinserti128 m13, [tmp1q+32*1+16], 1
+ mova xm14, [r3 +32*2+16]
+ mova xm15, [r3 +32*3+16]
+ vinserti128 m14, [tmp1q+32*2+16], 1
+ vinserti128 m15, [tmp1q+32*3+16], 1
+ mova [rsp+32*0], m6
+ mova [rsp+32*1], m7
+ vpbroadcastd m7, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ call m(idct_16x16_internal_8bpc).main
+ mova [rsp+32*0], m15
+ vpbroadcastd m15, [o(pw_2048)]
+ REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
+ WRITE_16X2 2, 3, 1, 2, strideq*2, r2
+ pmulhrsw m1, m15, [rsp+32*1]
+ WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1
+ lea r3, [dstq+strideq*4]
+ %define dstq r3
+ WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 6, 7, 2, 3, strideq*2, r2
+ REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
+ lea r3, [r3+strideq*4]
+ WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 10, 11, 2, 3, strideq*2, r2
+ pmulhrsw m15, [rsp+32*0]
+ lea r3, [r3+strideq*4]
+ WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1
+ WRITE_16X2 14, 15, 2, 3, strideq*2, r2
+ add tmp1q, 32*16
+ add r0, 16
+ dec tmp2d
+ jg .pass2_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+ PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
+ lea tmp1q, [rsp+32*7]
+ lea r10d, [eobq-136]
+ sar r10d, 31
+.pass1_loop:
+ lea tmp2q, [tmp1q+32*16]
+ LOAD_8ROWS cq+64*1, 64*2, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+ test r10b, r10b
+ jnz .fast
+ LOAD_8ROWS_H cq+64*17, 64*2, 2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ LOAD_8ROWS_H cq+64*16, 64*2, 1
+ mova [rsp], m15
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ LOAD_8ROWS cq+64*0, 64*2, 1
+ pxor m15, m15
+ REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+ vpbroadcastd m7, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+ lea r3, [tmp1q+32*48]
+ mova m15, [rsp]
+ mova [r3-32*4], m0
+ mova [r3-32*3], m2
+ mova [r3-32*2], m4
+ mova [r3-32*1], m6
+ mova [r3+32*0], m8
+ mova [r3+32*1], m10
+ mova [r3+32*2], m12
+ mova [r3+32*3], m14
+ add r3, 32*24
+ mova [r3-32*4], m1
+ mova [r3-32*3], m3
+ mova [r3-32*2], m5
+ mova [r3-32*1], m7
+ mova [r3+32*0], m9
+ mova [r3+32*1], m11
+ mova [r3+32*2], m13
+ mova [r3+32*3], m15
+ vpbroadcastd m9, [o(pw_16384)]
+ pmulhrsw m0, m9, [tmp1q-32*4]
+ pmulhrsw m1, m9, [tmp1q-32*3]
+ pmulhrsw m2, m9, [tmp1q-32*2]
+ pmulhrsw m3, m9, [tmp1q-32*1]
+ pmulhrsw m4, m9, [tmp1q+32*0]
+ pmulhrsw m5, m9, [tmp1q+32*1]
+ pmulhrsw m6, m9, [tmp1q+32*2]
+ pmulhrsw m7, m9, [tmp1q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q-32*4], m0
+ pmulhrsw m0, m9, [tmp2q-32*4]
+ mova [tmp2q-32*4], m1
+ pmulhrsw m1, m9, [tmp2q-32*3]
+ mova [tmp1q-32*3], m2
+ pmulhrsw m2, m9, [tmp2q-32*2]
+ mova [tmp2q-32*3], m3
+ pmulhrsw m3, m9, [tmp2q-32*1]
+ mova [tmp1q-32*2], m4
+ pmulhrsw m4, m9, [tmp2q+32*0]
+ mova [tmp2q-32*2], m5
+ pmulhrsw m5, m9, [tmp2q+32*1]
+ mova [tmp1q-32*1], m6
+ pmulhrsw m6, m9, [tmp2q+32*2]
+ mova [tmp2q-32*1], m7
+ pmulhrsw m7, m9, [tmp2q+32*3]
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add cq, 32
+ add tmp1q, 32*8
+ add r10d, 0x80000000
+ jnc .pass1_loop
+ lea r2, [rsp+32*55]
+ lea r7, [r2+32*24]
+.pass2_loop:
+ lea r3, [r2+32*8]
+ lea r8, [r7+32*8]
+ mova m0, [r2-32*4]
+ mova m1, [r2-32*2]
+ mova m2, [r2+32*0]
+ mova m3, [r2+32*2]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ test r10b, r10b
+ jnz .fast2
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*2]
+ mova m6, [r3+32*0]
+ mova m7, [r3+32*2]
+.fast2:
+ mova [rsp], m8
+ lea tmp1q, [rsp+32*39]
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova m0, [r2-32*3]
+ mova m1, [r2-32*1]
+ mova m2, [r2+32*1]
+ mova m3, [r2+32*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r10b, r10b
+ jnz .fast3
+ mova m4, [r3-32*3]
+ mova m5, [r3-32*1]
+ mova m6, [r3+32*1]
+ mova m7, [r3+32*3]
+.fast3:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [r7-32*4]
+ mova m3, [r7+32*3]
+ mova m4, [r7+32*0]
+ mova m7, [r7-32*1]
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10b, r10b
+ jnz .fast4
+ mova m1, [r8+32*3]
+ mova m2, [r8-32*4]
+ mova m5, [r8-32*1]
+ mova m6, [r8+32*0]
+.fast4:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [r7-32*2]
+ mova m3, [r7+32*1]
+ mova m4, [r7+32*2]
+ mova m7, [r7-32*3]
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10b, r10b
+ jnz .fast5
+ mova m1, [r8+32*1]
+ mova m2, [r8-32*2]
+ mova m5, [r8-32*3]
+ mova m6, [r8+32*2]
+.fast5:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ add r10d, 0x80000000
+ jc .ret
+ lea r2, [rsp+32*7]
+ lea r7, [r2+32*16]
+ sub dstq, r8
+ lea dstq, [dstq+strideq*4+16]
+ jmp .pass2_loop
+.ret:
+ RET
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_16384)]
+ mov [cq], eobd
+ pmulhrsw xm0, xm1
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+ PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
+ base, tmp3, tmp4
+ lea tmp1q, [rsp+32*7]
+ lea tmp4d, [eobq-136]
+.pass1_loop:
+ LOAD_8ROWS cq+64*0, 64*4, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+64*2, 64*4, 1
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ vpbroadcastd m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [cq+64* 1]
+ pmulhrsw m1, m7, [cq+64*31]
+ pmulhrsw m2, m7, [cq+64*17]
+ pmulhrsw m3, m7, [cq+64*15]
+ pmulhrsw m4, m7, [cq+64* 9]
+ pmulhrsw m5, m7, [cq+64*23]
+ pmulhrsw m6, m7, [cq+64*25]
+ pmulhrsw m7, [cq+64* 7]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))]
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ pmulhrsw m0, m7, [cq+64* 5]
+ pmulhrsw m1, m7, [cq+64*27]
+ pmulhrsw m2, m7, [cq+64*21]
+ pmulhrsw m3, m7, [cq+64*11]
+ pmulhrsw m4, m7, [cq+64*13]
+ pmulhrsw m5, m7, [cq+64*19]
+ pmulhrsw m6, m7, [cq+64*29]
+ pmulhrsw m7, [cq+64* 3]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*44
+ vpbroadcastd m10, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+ add cq, 32
+ add tmp4d, 0x80000000
+ jnc .pass1_loop
+ lea tmp1q, [rsp+32*15]
+ imul r2, strideq, 19
+ lea r3, [strideq*3]
+ add r2, dstq
+ mov tmp4b, 4
+.pass2_loop:
+ lea tmp2q, [tmp1q+32*64]
+ LOAD_8ROWS tmp1q-32*4, 32
+ test tmp4d, 0x40000000
+ jnz .fast
+ LOAD_8ROWS_H tmp2q-32*4, 32
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ lea tmp3q, [tmp2q-32*8]
+ LOAD_8ROWS_H tmp3q-32*4, 32
+ mova [rsp], m15
+ jmp .idct16
+.fast:
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ pxor m8, m8
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+.idct16:
+ lea tmp3q, [tmp1q-32*8]
+ LOAD_8ROWS tmp3q-32*4, 32
+ call m(idct_16x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+ add tmp1q, 32*16
+ sub dstq, r3
+ lea r2, [r2+r3+16]
+ add dstq, 16
+ dec tmp4b
+ jg .pass2_loop
+ RET
+ALIGN function_align
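+; Round the buffered coefficients by the factor in m10 and transpose them in
+; 8x8 blocks, interleaving the two row groups at tmp1q and tmp1q+32*8 in
+; place (four iterations).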
+.transpose_round_interleave:
+ mov tmp3d, 4
+.loop:
+ lea tmp2q, [tmp1q+32*8]
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp2q-32*4], 1
+ vinserti128 m1, [tmp2q-32*3], 1
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp2q-32*2], 1
+ vinserti128 m3, [tmp2q-32*1], 1
+ mova xm4, [tmp1q+32*0]
+ mova xm5, [tmp1q+32*1]
+ vinserti128 m4, [tmp2q+32*0], 1
+ vinserti128 m5, [tmp2q+32*1], 1
+ mova xm6, [tmp1q+32*2]
+ mova xm7, [tmp1q+32*3]
+ vinserti128 m6, [tmp2q+32*2], 1
+ vinserti128 m7, [tmp2q+32*3], 1
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova xm8, [tmp1q-32*4+16]
+ mova xm9, [tmp1q-32*3+16]
+ vinserti128 m8, [tmp2q-32*4+16], 1
+ vinserti128 m9, [tmp2q-32*3+16], 1
+ mova [tmp1q-32*4], m0
+ mova [tmp2q-32*4], m1
+ mova [tmp1q-32*3], m2
+ mova [tmp2q-32*3], m3
+ mova xm2, [tmp1q-32*2+16]
+ mova xm3, [tmp1q-32*1+16]
+ vinserti128 m2, [tmp2q-32*2+16], 1
+ vinserti128 m3, [tmp2q-32*1+16], 1
+ mova [tmp1q-32*2], m4
+ mova [tmp2q-32*2], m5
+ mova [tmp1q-32*1], m6
+ mova [tmp2q-32*1], m7
+ mova xm4, [tmp1q+32*0+16]
+ mova xm5, [tmp1q+32*1+16]
+ vinserti128 m4, [tmp2q+32*0+16], 1
+ vinserti128 m5, [tmp2q+32*1+16], 1
+ mova xm6, [tmp1q+32*2+16]
+ mova xm7, [tmp1q+32*3+16]
+ vinserti128 m6, [tmp2q+32*2+16], 1
+ vinserti128 m7, [tmp2q+32*3+16], 1
+ pmulhrsw m0, m8, m10
+ pmulhrsw m1, m9, m10
+ REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+ mova [tmp1q+32*0], m0
+ mova [tmp2q+32*0], m1
+ mova [tmp1q+32*1], m2
+ mova [tmp2q+32*1], m3
+ mova [tmp1q+32*2], m4
+ mova [tmp2q+32*2], m5
+ mova [tmp1q+32*3], m6
+ mova [tmp2q+32*3], m7
+ add tmp1q, 32*16
+ dec tmp3d
+ jg .loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
+ lea r6, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_8192)]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+ PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
+ lea tmp1q, [rsp+32*71]
+ lea r10d, [eobq-136]
+.pass1_loop:
+ LOAD_8ROWS cq+64*0, 64*4
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+ REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+ mova [rsp], m8
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ LOAD_8ROWS cq+64*2, 64*4
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ mova m4, [cq+64* 9]
+ mova m5, [cq+64*23]
+ mova m6, [cq+64*25]
+ mova m7, [cq+64* 7]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ mova m4, [cq+64*13]
+ mova m5, [cq+64*19]
+ mova m6, [cq+64*29]
+ mova m7, [cq+64* 3]
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+ sub tmp1q, 32*44
+ vpbroadcastd m10, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+ add cq, 32
+ add r10d, 0x80000000
+ jnc .pass1_loop
+ lea tmp1q, [rsp+32*7]
+ mov r10b, 4
+.pass2_loop:
+ lea r2, [tmp1q+32*64]
+ mova m0, [r2-32*4]
+ mova m1, [r2-32*2]
+ mova m2, [r2+32*0]
+ mova m3, [r2+32*2]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+ mova [rsp], m4
+ test r10d, 0x40000000
+ jnz .fast
+ lea r3, [r2+32*64]
+ mova m4, [r3-32*4]
+ mova m5, [r3-32*2]
+ mova m6, [r3+32*0]
+ mova m7, [r3+32*2]
+.fast:
+ call m(idct_16x16_internal_8bpc).main
+ mova m1, [rsp+32*1]
+ mova [tmp1q-32*4], m0
+ mova [tmp1q-32*3], m1
+ mova [tmp1q-32*2], m2
+ mova [tmp1q-32*1], m3
+ mova [tmp1q+32*0], m4
+ mova [tmp1q+32*1], m5
+ mova [tmp1q+32*2], m6
+ mova [tmp1q+32*3], m7
+ add tmp1q, 32*8
+ mova [tmp1q-32*4], m8
+ mova [tmp1q-32*3], m9
+ mova [tmp1q-32*2], m10
+ mova [tmp1q-32*1], m11
+ mova [tmp1q+32*0], m12
+ mova [tmp1q+32*1], m13
+ mova [tmp1q+32*2], m14
+ mova [tmp1q+32*3], m15
+ mova m0, [r2-32*3]
+ mova m1, [r2-32*1]
+ mova m2, [r2+32*1]
+ mova m3, [r2+32*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ test r10d, 0x40000000
+ jnz .fast2
+ mova m4, [r3-32*3]
+ mova m5, [r3-32*1]
+ mova m6, [r3+32*1]
+ mova m7, [r3+32*3]
+.fast2:
+ add tmp1q, 32*8
+ lea tmp2q, [tmp1q+32*8]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m15, [o(pd_2048)]
+ add r2, 32*8
+ add r3, 32*8
+ add tmp1q, 32*16
+ add tmp2q, 32*32
+ mova m0, [r2-32*4] ; 1
+ mova m3, [r2+32*3] ; 15
+ mova m4, [r2+32*0] ; 9
+ mova m7, [r2-32*1] ; 7
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10d, 0x40000000
+ jnz .fast3
+ mova m1, [r3+32*3] ; 31
+ mova m2, [r3-32*4] ; 17
+ mova m5, [r3-32*1] ; 23
+ mova m6, [r3+32*0] ; 25
+.fast3:
+ add r6, o_idct64_offset
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ add r6, 8
+ add tmp1q, 32*8
+ sub tmp2q, 32*8
+ mova m0, [r2-32*2] ; 5
+ mova m3, [r2+32*1] ; 11
+ mova m4, [r2+32*2] ; 13
+ mova m7, [r2-32*3] ; 3
+ pxor m1, m1
+ REPX {mova x, m1}, m2, m5, m6
+ test r10d, 0x40000000
+ jnz .fast4
+ mova m1, [r3+32*1] ; 27
+ mova m2, [r3-32*2] ; 21
+ mova m5, [r3-32*3] ; 19
+ mova m6, [r3+32*2] ; 29
+.fast4:
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+ sub tmp1q, 32*28
+ sub dstq, r8
+ lea dstq, [dstq+strideq*4+16]
+ dec r10b
+ jg .pass2_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm
new file mode 100644
index 0000000000..31c60fdd45
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx512.asm
@@ -0,0 +1,7389 @@
+; Copyright © 2020-2023, VideoLAN and dav1d authors
+; Copyright © 2020-2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+const \
+dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
+ db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
+ db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
+const \
+int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23
+ db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
+ db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
+ db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
+idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9
+ db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
+ db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
+ db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
+idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35
+ db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
+ db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51
+ db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37
+end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60
+ db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61
+ db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
+ db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
+
+; packed 4-bit qword shuffle indices
+permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
+ dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
+ dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
+ dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
+permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
+ dq 0xc824352d56128751, 0xd906171e74301e15
+ dq 0x6271604b03472d62, 0x735342782165b426
+ dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
+permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486
+ dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
+ dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
+ dq 0x5115049dd9045b79, 0x733726bffb263d1f
+permD: dq 0x0cda098800041504, 0x0edb09b2028c3726
+ dq 0x0f11fa9c01150415, 0x0988f326039d2637
+ dq 0x05640f1108269d8c, 0x05290edb0aaebfae
+ dq 0x0005000509378c9d, 0xffffffff0bbfaebf
+
+pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11
+gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13
+gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10
+gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16
+
+int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0
+
+pb_32: times 4 db 32
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+pw_8192: times 2 dw 8192
+pw_16384: times 2 dw 16384
+pw_1697x16: times 2 dw 1697*16
+pw_1697x8: times 2 dw 1697*8
+pw_2896x8: times 2 dw 2896*8
+pd_2048: dd 2048
+
+%define pw_5 (permD+52)
+%define pd_m1 (permD+60)
+%define pw_3803_1321 (permD+44)
+%define pw_2482_3803 (permD+12)
+%define pw_2440_3290 (permD+ 4)
+%define pw_m3290_2440 (permD+28)
+%define pw_3857_1380 (permD+36)
+%define pw_m1380_3857 (permD+20)
+
+pw_8192_m8192: dw 8192, -8192
+pw_m8192_8192: dw -8192, 8192
+pw_16384_m16384: dw 16384, -16384
+pw_m16384_16384: dw -16384, 16384
+
+pw_m1321_2482: dw -1321, 2482
+pw_m3344_3344: dw -3344, 3344
+pw_2482_3344: dw 2482, 3344
+pw_m3803_3344: dw -3803, 3344
+pd_3344: dd 3344
+pw_m1321_m3344: dw -1321, -3344
+pw_2896_m2896: dw 2896, -2896
+
+pw_1567_m3784: dw 1567, -3784
+pw_3784_m1567: dw 3784, -1567
+pw_4017_m799: dw 4017, -799
+pw_2276_m3406: dw 2276, -3406
+pw_m799_m4017: dw -799, -4017
+pw_m3406_m2276: dw -3406, -2276
+
+%macro COEF_PAIR 2-3 0
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%if %3
+pw_m%1_m%2: dw -%1, -%2
+%endif
+%endmacro
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784, 1
+COEF_PAIR 3784, 1567
+COEF_PAIR 201, 4091
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 4052, 601
+COEF_PAIR 3166, 2598, 1
+COEF_PAIR 3920, 1189, 1
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017, 799
+
+%macro COEF_X8 1-*
+%rep %0
+ dw %1*8, %1*8
+ %rotate 1
+%endrep
+%endmacro
+
+pw_m2276x8: COEF_X8 -2276
+pw_3406x8: COEF_X8 3406
+pw_4017x8: COEF_X8 4017
+pw_799x8: COEF_X8 799
+pw_3784x8: COEF_X8 3784
+pw_1567x8: COEF_X8 1567
+
+pw_4076x8: COEF_X8 4076
+pw_401x8: COEF_X8 401
+pw_m2598x8: COEF_X8 -2598
+pw_3166x8: COEF_X8 3166
+pw_3612x8: COEF_X8 3612
+pw_1931x8: COEF_X8 1931
+pw_m1189x8: COEF_X8 -1189
+pw_3920x8: COEF_X8 3920
+
+pw_4091x8: COEF_X8 4091
+pw_201x8: COEF_X8 201
+pw_m2751x8: COEF_X8 -2751
+pw_3035x8: COEF_X8 3035
+pw_3703x8: COEF_X8 3703
+pw_1751x8: COEF_X8 1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8: COEF_X8 3857
+pw_3973x8: COEF_X8 3973
+pw_995x8: COEF_X8 995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8: COEF_X8 3513
+pw_3290x8: COEF_X8 3290
+pw_2440x8: COEF_X8 2440
+pw_m601x8: COEF_X8 -601
+pw_4052x8: COEF_X8 4052
+
+pw_401_4076x8: dw 401*8, 4076*8
+pw_m2598_3166x8: dw -2598*8, 3166*8
+pw_1931_3612x8: dw 1931*8, 3612*8
+pw_m1189_3920x8: dw -1189*8, 3920*8
+pw_799_4017x8: dw 799*8, 4017*8
+pw_m2276_3406x8: dw -2276*8, 3406*8
+
+pw_201_4091x8: dw 201*8, 4091*8
+pw_m601_4052x8: dw -601*8, 4052*8
+pw_995_3973x8: dw 995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+pw_101_4095x8: dw 101*8, 4095*8
+pw_m2824_2967x8: dw -2824*8, 2967*8
+pw_1660_3745x8: dw 1660*8, 3745*8
+pw_m1474_3822x8: dw -1474*8, 3822*8
+pw_897_3996x8: dw 897*8, 3996*8
+pw_m2191_3461x8: dw -2191*8, 3461*8
+pw_2359_3349x8: dw 2359*8, 3349*8
+pw_m700_4036x8: dw -700*8, 4036*8
+pw_501_4065x8: dw 501*8, 4065*8
+pw_m2520_3229x8: dw -2520*8, 3229*8
+pw_2019_3564x8: dw 2019*8, 3564*8
+pw_m1092_3948x8: dw -1092*8, 3948*8
+pw_1285_3889x8: dw 1285*8, 3889*8
+pw_m1842_3659x8: dw -1842*8, 3659*8
+pw_2675_3102x8: dw 2675*8, 3102*8
+pw_m301_4085x8: dw -301*8, 4085*8
+
+idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474
+COEF_PAIR 401, 4076, 1
+COEF_PAIR 799, 4017
+ COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996
+dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017
+ COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092
+COEF_PAIR 1931, 3612, 1
+COEF_PAIR 3406, 2276
+ COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889
+dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276
+
+SECTION .text
+
+%define o_base int8_permA+64*18
+%define o(x) (r5 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
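+; Computes a pair of 16-bit rotations on packed coefficient pairs: each
+; vpdpwssd forms (a*coef1 +/- b*coef2) + rnd as a 32-bit dot product, the
+; results are shifted right by 12 and re-packed to words; the flags below
+; select how the two halves are ordered/interleaved and whether they are
+; packed at all.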
+; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
+; 16 = special_mul1, 32 = special_mul2
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+ mova m%2, m%4
+%if %7 & 16
+ vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
+ mova m%3, m%4
+%if %7 & 32
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%else
+ vpdpwssd m%3, m%1, m%6
+%endif
+%elif %7 & 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%elif %6 < 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, m%6
+%elif %7 & 1
+ vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
+%else
+ vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
+%endif
+%if %7 & 2
+ psrld m%2, 12
+ pslld m%3, 4
+ vpshrdd m%1, m%3, m%2, 16
+%elif %7 & 4
+ ; compared to using shifts (as above) this has better throughput,
+ ; but worse latency and requires setting up the opmask/index
+ ; registers, so only use this method for the larger transforms
+ pslld m%1, m%2, 4
+ vpmultishiftqb m%1{k7}, m13, m%3
+%else
+ psrad m%2, 12
+ psrad m%3, 12
+%if %7 & 8 == 0
+ packssdw m%1, m%3, m%2
+%endif
+%endif
+%endmacro
+
+; flags: same as ITX_MUL2X_PACK
+%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
+%if %11 & 1
+ vpbroadcastd m%4, [o(pw_%9_%10)]
+ vpbroadcastd m%4{k1}, [o(pw_%7_%8)]
+ vpbroadcastd m%5, [o(pw_m%10_%9)]
+ vpbroadcastd m%5{k1}, [o(pw_m%8_%7)]
+%else
+ vpbroadcastd m%4, [o(pw_m%10_%9)]
+ vpbroadcastd m%4{k1}, [o(pw_m%8_%7)]
+ vpbroadcastd m%5, [o(pw_%9_%10)]
+ vpbroadcastd m%5{k1}, [o(pw_%7_%8)]
+%endif
+ ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
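+; (coefficient arguments below 32 are register numbers holding pre-broadcast
+; coefficient pairs; larger values are literal coefficients loaded via
+; embedded broadcasts)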
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+ punpcklwd m%3, m%2, m%1
+ punpckhwd m%2, m%1
+%if %7 < 32
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, m%7
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, m%7
+%else
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
+%endif
+ psrad m%1, 12
+ psrad m%4, 12
+ packssdw m%1, m%4
+ mova m%4, m%5
+%if %7 < 32
+ vpdpwssd m%4, m%2, m%6
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, m%6
+%else
+ vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
+%endif
+ psrad m%4, 12
+ psrad m%2, 12
+%if %0 == 8
+ packssdw m%8, m%2, m%4
+%else
+ packssdw m%2, m%4
+%endif
+%endmacro
+
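+; Assemble the wrapped code with the mmregs temporarily mapped to xmm/ymm
+; width instead of zmm, then restore the zmm register mapping.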
+%macro WRAP_XMM 1+
+ %xdefine %%reset RESET_MM_PERMUTATION
+ INIT_XMM cpuname
+ DEFINE_MMREGS xmm
+ AVX512_MM_PERMUTATION
+ %1
+ %%reset
+%endmacro
+
+%macro WRAP_YMM 1+
+ INIT_YMM cpuname
+ %1
+ INIT_ZMM cpuname
+%endmacro
+
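+; Round the four 4-pixel output rows (by pw_<rnd>, default pw_2048), add
+; them to the destination pixels and store; the row arguments select the
+; output row order so that flipped variants can share the code.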
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ vpbroadcastd m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+ movd m2, [%%row_adr1]
+ pinsrd m2, [%%row_adr2], 1
+ movd m3, [%%row_adr3]
+ pinsrd m3, [%%row_adr4], 1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [%%row_adr1], m0
+ pextrd [%%row_adr2], m0, 1
+ pextrd [%%row_adr3], m0, 2
+ pextrd [%%row_adr4], m0, 3
+ ret
+%endmacro
+
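+; Shared entry point template: set up the constant base pointer and tx2q
+; (the jump target for the second transform's pass2), then dispatch to the
+; first transform's pass1; for dct_dct with eob == 0 execution falls through
+; to the DC-only code emitted by the size-specific wrapper macros below.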
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
+ %define %%p1 m(i%1_%3_internal_8bpc)
+ lea baseq, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [cq], eobd
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0
+ vpbroadcastd m4, [o(pd_2048)]
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+ punpcklwd m4, m1, m0 ; in2 in0
+ punpckhwd m5, m1, m0 ; in3 in1
+.main2:
+ vpbroadcastd m3, [o(pd_2048)]
+ mova m0, m3
+ vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd}
+ mova m2, m3
+ vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd}
+ mova m1, m3
+ vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd}
+ vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd}
+ vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd}
+ vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd}
+ vpdpwssd m1, m5, [o(pd_3344)] {bcstd}
+ vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd}
+ REPX {psrad x, 12}, m0, m2, m1, m3
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m3 ; out2 out3
+%endmacro
+
+INIT_XMM avx512icl
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ IDCT4_1D_PACKED
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 0, 1, 2, 3
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+ movd xmm1, [o(pw_2896x8)]
+ pmulhrsw xmm0, xmm1, [cq]
+ movd xmm2, [o(pw_2048)]
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm2
+ vpbroadcastw ym0, xmm0
+ mova ym1, ym0
+ jmp m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in6 in2
+ punpcklwd m2, m0 ; in4 in0
+.main2:
+ vpbroadcastd m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
+ psubsw m0, m5, m4 ; t5a t6a (interleaved)
+ paddsw m4, m5 ; t4 t7 (interleaved)
+ ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
+ ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
+%if mmsize > 16
+ vbroadcasti32x4 m1, [o(deint_shuf)]
+ pshufb m4, m1
+%else
+ pshufb m4, [o(deint_shuf)]
+%endif
+ psubsw m1, m2, m3 ; tmp3 tmp2
+ paddsw m3, m2 ; tmp0 tmp1
+ punpckhqdq m2, m4, m0 ; t7 t6
+ punpcklqdq m4, m0 ; t4 t5
+ paddsw m0, m3, m2 ; out0 out1
+ psubsw m3, m2 ; out7 out6
+ psubsw m2, m1, m4 ; out4 out5
+ paddsw m1, m4 ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 1 ; pass
+ vpbroadcastd m6, [o(pd_2048)]
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ vprord m1, 16
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ mova m2, m6
+ vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd}
+ mova m5, m6
+ vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd}
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ mova m5, m6
+ vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd}
+ mova m3, m6
+ vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd}
+ psrad m5, 12
+ psrad m3, 12
+ packssdw m1, m3, m5 ; out2 -out3
+%else
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ shufps m2, m5, m4, q1032
+ punpckhwd m4, m2
+ punpcklwd m5, m2
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t6 t7
+ paddsw m4, m5 ; -out1 out6
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckhqdq m3, m2, m1 ; t3 t7
+ punpcklqdq m2, m1 ; t2 t6
+ paddsw m1, m2, m3 ; t2+t3 t6+t7
+ psubsw m2, m3 ; t2-t3 t6-t7
+ punpckhqdq m3, m4, m0 ; out6 -out7
+ punpcklqdq m0, m4 ; out0 -out1
+ pmulhrsw m2, m5 ; out4 -out5
+ pshufd m1, m1, q1032
+ pmulhrsw m1, m5 ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ IDCT4_1D_PACKED
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, m0, xm2, 1
+ vinserti32x4 m1, m1, xm3, 1
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x8_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call .main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m5, m5
+ psubw m5, m4
+.end:
+ punpcklqdq m4, m5
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq], zmm20
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd xm0, xm4, xm3 ; 0 7
+ punpckhwd xm1, xm5, xm2 ; 2 5
+ punpcklwd xm2, xm5 ; 4 3
+ punpcklwd xm3, xm4 ; 6 1
+ WRAP_XMM IADST8_1D_PACKED 1
+ punpcklqdq xm3, xm4, xm0 ; out6 -out7
+ punpckhqdq xm0, xm4 ; out0 -out1
+ ret
+ALIGN function_align
+.main_pass2:
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m3
+ punpckhwd m1, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call m(iadst_4x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti32x4 m3, xm1, 1
+ vinserti32x4 m2, xm0, 1
+ pxor m4, m4
+ psubw m4, m5
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INIT_ZMM avx512icl
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m0, [cq]
+ mova m1, [o(int8_permB)]
+ vpbroadcastd m2, [o(pw_1697x8)]
+ vpermb m0, m1, m0
+ pmulhrsw m2, m0
+ paddsw m0, m2
+ vextracti32x8 ym1, m0, 1
+ jmp tx2q
+.pass2:
+ vpbroadcastd ym4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ vpbroadcastw m0, r6d
+ mova m1, m0
+ jmp m(iadst_4x16_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+cglobal_label .main2
+ vpbroadcastd m10, [o(pd_2048)]
+.main3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a
+.main4:
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m4, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
+ ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2
+ psubsw m0, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ psubsw m1, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+.main5:
+ ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a
+ ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
+%if mmsize > 16
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+%else
+ mova m5, [o(deint_shuf)]
+%endif
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ paddsw m3, m2, m4 ; t9 t14
+ psubsw m2, m4 ; t10 t13
+ pshufb m8, m5
+ pshufb m7, m5
+ pshufb m3, m5
+ ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6
+ ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12
+ ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a
+ punpckhqdq m2, m7, m1 ; t7 t6
+ punpcklqdq m7, m1 ; t4 t5
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ packssdw m5, m11 ; t12 t13a
+ packssdw m4, m0 ; t11 t10a
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova ym1, [cq+32*2]
+ vinserti32x8 m1, [cq+32*0], 1
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+32*3]
+ vinserti32x8 m2, [cq+32*1], 1
+ vpbroadcastd m4, [o(pd_2048)]
+ vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
+ vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2
+ vpbroadcastd m4, [o(pw_16384)]
+ psubsw m3, m1, m2
+ paddsw m1, m2 ; out0 out1
+ vprord m3, 16 ; out2 out3
+ punpckldq m0, m1, m3
+ punpckhdq m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, ym1, 1
+ vextracti32x4 xm4, m0, 2
+ vextracti32x4 xm5, m1, 2
+ vextracti32x4 xm6, m0, 3
+ vextracti32x4 xm7, m1, 3
+ call .main
+ vinserti32x4 ym0, xm2, 1
+ vinserti32x4 ym1, xm3, 1
+ vinserti32x4 ym4, xm6, 1
+ vinserti32x4 ym5, xm7, 1
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m1, ym5, 1
+ vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x16_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m3
+ pmulhrsw m0, m3
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m5, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m6, m8, m5
+.end:
+ vpbroadcastd m7, [o(pw_2896x8)]
+ paddsw ym1, ym2, ym4
+ psubsw ym2, ym4
+ vinserti32x8 m1, ym2, 1
+ pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10
+ psrlq m0, m10, 4
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpcklqdq m5, m6
+.end2:
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorw k1, k1, k1
+ kmovw k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main:
+ movu m3, [o(permB+1)]
+ psrlq m10, m3, 4
+.main2:
+ vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10
+ vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ punpckhwd m4, m3, m0 ; in12 in3 in14 in1
+ punpcklwd m0, m3 ; in0 in15 in2 in13
+ kshiftrb k1, k1, 4
+ vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5
+ vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9
+INIT_YMM avx512icl
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
+ ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m4, m1 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m4 ; t5 t4 t7 t6
+ paddsw m0, m4 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a
+ vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14
+ vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13
+ pshufd m2, m2, q1032 ; t6a t7a t14 t15
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+INIT_ZMM avx512icl
+ vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_8bpc).main
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 12
+ psubw m5, m8, m6
+ jmp m(iadst_4x16_internal_8bpc).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m2, [o(int16_perm)]
+ vpermb m1, m2, [cq+64*0]
+ vpermb m2, m2, [cq+64*1]
+ vpbroadcastd m4, [o(pw_1697x8)]
+ vpbroadcastd m0, [o(pd_m1)]
+ pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is
+ vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal
+ pmulhrsw m4, m2 ; it still works, but if the input is -1 the
+ vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
+ vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless
+ vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here.
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m5, [o(pw_2048)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x16_internal_8bpc).end2
+
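+; WRITE_8X4: loads four 8-pixel rows from dst (offsets 0, %5, %6, %7, which
+; default to strideq, strideq*2 and r6), zero-extends them to words, adds the
+; two packed residual operands, then packs with unsigned saturation and
+; stores the rows back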
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vextracti32x4 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_2048)]
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ jmp m(iadst_8x4_internal_8bpc).end3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm3, [o(pw_2896x8)]
+ pmulhrsw xm0, xm3, [cq+16*0]
+ pmulhrsw xm1, xm3, [cq+16*1]
+ pmulhrsw xm2, xm3, [cq+16*2]
+ pmulhrsw xm3, [cq+16*3]
+ call m(idct_4x8_internal_8bpc).main
+ vbroadcasti32x4 m4, [o(deint_shuf)]
+ vinserti32x4 m3, m1, xm3, 1
+ vinserti32x4 m1, m0, xm2, 1
+ shufps m0, m1, m3, q0220
+ shufps m1, m3, q1331
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m3, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ psubsw m3, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+.end2:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+.end3:
+ pxor m2, m2
+ mova [cq], zmm18
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m3, m3, xm1, 1
+ vinserti32x4 m2, m2, xm0, 1
+ punpckhwd m1, m3, m2
+ punpcklwd m3, m2
+ pxor m0, m0
+ psubsw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m2, m1
+ vpermq m1, m0, q2031
+ vpermq m0, m2, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti32x4 m2, [cq+16*2], 1
+ vinserti32x4 m0, [cq+16*3], 1
+ vpbroadcastd m3, [o(pw_2896x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ paddsw m0, m0
+ paddsw m1, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_8x4_internal_8bpc).end
+
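+; the dc-only helper below processes 8 rows per iteration: qword offsets
+; strided*[0..7] are built with pd_0to15, the 8-byte rows are gathered with
+; vpgatherdq, the broadcast dc is added in 16-bit and the rows are scattered
+; back; 8x16 and 8x32 reuse .dconly/.dconly2 for their own dc-only cases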
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+INIT_ZMM avx512icl
+ movsx r6d, word [cq]
+ mov [cq], eobd
+.dconly:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly2:
+ vpbroadcastd ym2, strided
+ imul r6d, 181
+ pmulld ym5, ym2, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m3, m3
+ vpbroadcastw m4, r6d
+.dconly_loop:
+ kmovb k2, k1
+ vpgatherdq m2{k1}, [dstq+ym5]
+ punpcklbw m0, m2, m3
+ punpckhbw m1, m2, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ kmovb k1, k2
+ vpscatterdq [dstq+ym5]{k2}, m0
+ lea dstq, [dstq+strideq*8]
+ sub r3d, 8
+ jg .dconly_loop
+ RET
+INIT_YMM avx512icl
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ vbroadcasti32x4 m0, [o(deint_shuf)]
+ vpbroadcastd m2, [o(pw_16384)]
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+ vinserti32x4 m0, m4, xm1, 1
+ vshufi32x4 m2, m4, m1, 0x03
+ vinserti32x4 m1, m5, xm3, 1
+ vshufi32x4 m3, m5, m3, 0x03
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ jmp m(iadst_8x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main_pass1
+ vpbroadcastd m5, [o(pw_16384_m16384)]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ REPX {pmulhrsw x, m5}, m3, m4, m0, m1
+ vshufi32x4 m2, m3, m0, 0x03
+ vinserti32x4 m0, m3, xm0, 1
+ vshufi32x4 m3, m4, m1, 0x03
+ vinserti32x4 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vpbroadcastd xm4, [o(pw_4096)]
+ psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+.end4:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ IADST8_1D_PACKED 1
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call m(iadst_8x8_internal_8bpc).main_pass1
+ vpbroadcastd m5, [o(pw_m16384_16384)]
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ REPX {pmulhrsw x, m5}, m0, m4, m3, m2
+ vinserti32x4 m1, m0, xm3, 1
+ vshufi32x4 m3, m0, m3, 0x03
+ vinserti32x4 m0, m4, xm2, 1
+ vshufi32x4 m2, m4, m2, 0x03
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vpbroadcastd xm5, [o(pw_4096)]
+ psubw m4, m5 ; lower half = -2048, upper half = 2048
+ vpermq m5, m3, q2031
+ vpermq m3, m0, q2031
+ vpermq m0, m2, q2031
+ vpermq m2, m1, q2031
+ pmulhrsw m1, m0, m4
+ pmulhrsw m0, m5, m4
+ jmp m(iadst_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti32x4 m3, [cq+16*4], 1
+ vinserti32x4 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti32x4 m4, [cq+16*6], 1
+ vinserti32x4 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+%endif
+%endmacro
+
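+; ITX_8X16_LOAD_COEFS pre-multiplies every coefficient row by 2896/4096
+; (~1/sqrt(2)) via pmulhrsw with pw_2896x8, the usual extra scaling step
+; applied to 2:1 rectangular transform sizes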
+%macro ITX_8X16_LOAD_COEFS 0
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m0, m4, [cq+32*0]
+ add cq, 32*4
+ pmulhrsw m7, m4, [cq+32*3]
+ pmulhrsw m1, m4, [cq-32*3]
+ pmulhrsw m6, m4, [cq+32*2]
+ pmulhrsw m2, m4, [cq-32*2]
+ pmulhrsw m5, m4, [cq+32*1]
+ pmulhrsw m3, m4, [cq-32*1]
+ pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m3, [o(permB)]
+ vpermq m0, m3, [cq+64*0]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m3, [cq+64*2]
+ vpermq m3, m3, [cq+64*3]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3
+ REPX {pmulhrsw x, m5}, m4, m0, m2, m1
+ punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
+ punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1
+ punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
+ punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1
+ punpckhdq m1, m0, m2 ; 1 5 9 13
+ punpckldq m0, m2 ; 0 4 8 12
+ punpckldq m2, m3, m4 ; 2 6 10 14
+ punpckhdq m3, m4 ; 3 7 11 15
+ jmp tx2q
+.pass2:
+ vprord m5, [o(int16_perm)], 16
+ vshufi32x4 m2, m2, q1320 ; 2 10 14 6
+ vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11
+ vshufi32x4 m1, m3, q0132 ; 9 13 7 3
+ vpermb m9, m5, m0
+ vpermb m7, m5, m2
+ vpermb m8, m5, m4
+ vpermb m0, m5, m1
+ vextracti32x8 ym6, m9, 1
+ vextracti32x8 ym3, m7, 1
+ vextracti32x8 ym5, m8, 1
+ vextracti32x8 ym1, m0, 1
+ call .main2
+ mova ym8, [o(gather8a)]
+ lea r3, [dstq+strideq*4]
+ pmovzxdq m9, ym8
+ pshufd ym8, ym8, q1230
+ vpermt2q m0, m9, m4
+ vpermt2q m1, m9, m5
+ vpermt2q m2, m9, m6
+ vpermt2q m3, m9, m7
+.end:
+ vpbroadcastd m7, [o(pw_2048)]
+.end2:
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+.end3:
+ pmulhrsw m2, m7
+ pmulhrsw m3, m7
+.end4:
+ vpbroadcastd ym6, strided
+ kxnorb k1, k1, k1
+ pxor m4, m4
+ pmulld ym8, ym6
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [dstq+ym8]
+ kmovb k1, k2
+ vpgatherdq m7{k2}, [r3+ym8]
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ kmovb k2, k1
+ mova [cq+64*2], m4
+ mova [cq+64*3], m4
+ punpcklbw m5, m6, m4
+ punpckhbw m6, m4
+ paddw m0, m5
+ paddw m1, m6
+ packuswb m0, m1
+ vpscatterdq [dstq+ym8]{k1}, m0
+ punpcklbw m6, m7, m4
+ punpckhbw m7, m4
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m2, m3
+ vpscatterdq [r3+ym8]{k2}, m2
+ RET
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ vpbroadcastd ym3, [o(pw_401_4076x8)]
+ vpbroadcastd ym5, [o(pw_799_4017x8)]
+ vpbroadcastd ym4, [o(pw_m1189_3920x8)]
+ pxor ym6, ym6
+ punpckhwd ym2, ym0, ym0
+ pmulhrsw ym2, ym3 ; t8a t15a
+ punpcklwd ym7, ym1, ym1
+ pmulhrsw ym7, ym5 ; t4a t7a
+ punpckhwd ym1, ym1
+ pmulhrsw ym4, ym1 ; t11a t12a
+ vpcmpub k7, ym13, ym10, 6
+ punpcklwd ym9, ym6, ym0
+ psubsw ym0, ym2, ym4 ; t11a t12a
+ paddsw ym8, ym2, ym4 ; t8a t15a
+ mova ym1, ym7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ pxor ym6, ym6
+ punpckhwd ym8, ym0, ym0
+ punpckhwd ym4, ym3, ym3
+ punpckhwd ym5, ym2, ym2
+ punpcklwd ym7, ym1, ym1
+ punpckhwd ym1, ym1
+ punpcklwd ym3, ym3
+ punpcklwd ym9, ym6, ym0
+ punpcklwd ym6, ym2
+ vpbroadcastd ym2, [o(pw_401_4076x8)]
+ vpbroadcastd ym0, [o(pw_m2598_3166x8)]
+ vpbroadcastd ym11, [o(pw_1931_3612x8)]
+ vpbroadcastd ym12, [o(pw_m1189_3920x8)]
+ pmulhrsw ym8, ym2 ; t8a t15a
+ vpbroadcastd ym2, [o(pw_799_4017x8)]
+ pmulhrsw ym0, ym4 ; t9a t14a
+ vpbroadcastd ym4, [o(pw_m2276_3406x8)]
+ pmulhrsw ym5, ym11 ; t10a t13a
+ pmulhrsw ym1, ym12 ; t11a t12a
+ pmulhrsw ym7, ym2 ; t4a t7a
+ pmulhrsw ym3, ym4 ; t5a t6a
+ vpcmpub k7, ym13, ym10, 6
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
+ WRAP_YMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf1)]
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+.pass1_end:
+ REPX {pmulhrsw x, m7}, m3, m5, m4, m2
+ punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m7, m8, m6
+.pass2_end:
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m5, m2 ; out8 -out11 -out9 out10
+ mova ym8, [o(gather8c)]
+ lea r3, [dstq+strideq]
+ psrlq m2, m10, 4
+ vpermi2q m2, m0, m3 ; 1 3 13 15
+ vpermt2q m0, m10, m3 ; 0 2 12 14
+ psrlq m3, m10, 8
+ vpermi2q m3, m1, m5 ; 5 7 9 11
+ psrlq m10, 12
+ vpermt2q m1, m10, m5 ; 4 6 8 10
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp m(idct_8x16_internal_8bpc).end3
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m5, m2, [cq+64*0]
+ pmulhrsw m3, m2, [cq+64*3]
+ pmulhrsw m1, m2, [cq+64*1]
+ pmulhrsw m2, [cq+64*2]
+ movu m4, [o(permA+3)]
+ psrlq m10, m4, 4
+ mova m6, m4
+ vpermi2q m4, m5, m3 ; in0 in12 in2 in14
+ vpermt2q m5, m10, m3 ; in15 in3 in13 in1
+ vpermi2q m6, m1, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m2 ; in11 in7 in9 in5
+ jmp .main
+ALIGN function_align
+.main_pass2:
+ mova m4, [o(permC)]
+ psrlq m5, m4, 4
+ vpermi2q m4, m0, m2 ; in0 in12 in2 in14
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; in15 in3 in13 in1
+ psrlq m10, m6, 4
+ vpermi2q m6, m0, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m3 ; in11 in7 in9 in5
+.main:
+ punpcklwd m0, m4, m5 ; in0 in15 in2 in13
+ punpckhwd m4, m5 ; in12 in3 in14 in1
+ punpcklwd m5, m6, m1 ; in4 in11 in6 in9
+ punpckhwd m6, m1 ; in8 in7 in10 in5
+cglobal_label .main2
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5
+ ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
+ psubsw m2, m0, m6 ; t9a t8a t11a t10a
+ paddsw m0, m6 ; t1a t0a t3a t2a
+ psubsw m3, m5, m4 ; t13a t12a t15a t14a
+ paddsw m5, m4 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m5 ; t5 t4 t7 t6
+ paddsw m0, m5 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a
+ vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15
+ vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12
+ pshufd m2, m2, q1032 ; t7a t6a t15 t14
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf2)]
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+ jmp m(iadst_8x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_8x16_internal_8bpc).main_pass2
+ vpbroadcastd m7, [o(pw_2048)]
+ psrlq m10, 36
+ psubw m6, m8, m7
+ jmp m(iadst_8x16_internal_8bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [o(int16_perm)]
+ vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m2, m4, m0 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3
+ REPX {pmulhrsw x, m5}, m1, m2, m3, m4
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1
+ punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2
+ punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m7, [o(pw_1697x16)]
+ mova ym8, [o(gather8b)]
+ lea r3, [dstq+strideq*2]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x16_internal_8bpc).end
+
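+; WRITE_16X2: adds two packed residual rows to two 16-pixel dst rows; pixels
+; are widened with pmovzxbw, and the q3120 vpermq fixes the 64-bit lane order
+; after packuswb before the rows are stored back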
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti32x4 [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+ mova xm4, [cq+16*4]
+ mova xm5, [cq+16*5]
+ mova xm6, [cq+16*6]
+ mova xm7, [cq+16*7]
+ call m(idct_4x16_internal_8bpc).main
+ vpbroadcastd m8, [o(pw_16384)]
+ vinserti32x4 ym1, xm3, 1 ; 3 2 7 6
+ vinserti32x4 ym5, xm7, 1 ; b a f e
+ vinserti32x4 ym0, xm2, 1 ; 0 1 4 5
+ vinserti32x4 ym4, xm6, 1 ; 8 9 c d
+ vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e
+ vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d
+ pmulhrsw m1, m8
+ pmulhrsw m0, m8
+ pshufd m1, m1, q1032
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ mova m2, [o(permA)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_16384_m16384)]
+ psrlq m0, m10, 4
+ psrlq m10, 8
+.pass1_end:
+ punpcklwd ym5, ym4, ym2
+ punpckhwd ym4, ym2
+ vinserti32x8 m5, ym4, 1
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m4, m9
+ vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp tx2q
+.pass2:
+ call .main
+ movu m2, [o(permA+1)]
+.end:
+ vpbroadcastd m3, [o(pw_2048)]
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+.end2:
+ psrlq m3, m2, 4
+ vpermi2q m2, m0, m1
+ vpermi2q m3, m0, m1
+.end3:
+ lea r3, [dstq+strideq*2]
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [r3 +strideq*0], 2
+ vinserti32x4 m1, [r3 +strideq*1], 3
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_m16384_16384)]
+ psrlq m0, m10, 12
+ psrlq m10, 16
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ movu m2, [o(permA+2)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ mova m2, [cq+64*1]
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m4, [o(pw_16384)]
+ mova m5, [o(idtx_16x4p)]
+ shufps m0, m1, m2, q2020
+ shufps m1, m2, q3131
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddsw m0, m2
+ paddsw m1, m3
+ vpermb m0, m5, m0
+ vpermb m1, m5, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ movu m2, [o(permA+1)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
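+; the dc-only entry points below apply three (.dconly), two (.dconly2) or one
+; (.dconly3) rounded 181/256 stages before the add/store loop, so 16x4 and
+; 16x16 can tail-jump in at whichever point matches their own scaling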
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 8
+.dconly:
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly3:
+ imul r6d, 181
+ lea r2, [strideq*3]
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [dstq+strideq*2], 2
+ vinserti32x4 m1, [dstq+r2 ], 3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub r3d, 4
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m0, [cq+32*0], q3120
+ add cq, 32*4
+ vpermq m7, [cq+32*3], q%1
+ vpermq m1, [cq-32*3], q%1
+ vpermq m6, [cq+32*2], q3120
+ vpermq m2, [cq-32*2], q3120
+ vpermq m5, [cq+32*1], q%1
+ vpermq m3, [cq-32*1], q%1
+ vpermq m4, [cq+32*0], q3120
+ REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m1, [o(pw_2896x8)]
+ vpermq m0, [cq+64*0], q3120
+ vpermq m2, [cq+64*1], q3120
+ vpermq m4, [cq+64*2], q3120
+ vpermq m6, [cq+64*3], q3120
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6
+ vextracti32x8 ym1, m0, 1
+ vextracti32x8 ym3, m2, 1
+ vextracti32x8 ym5, m4, 1
+ vextracti32x8 ym7, m6, 1
+ call m(idct_8x16_internal_8bpc).main
+ vbroadcasti32x4 m8, [o(int_shuf1)]
+ vbroadcasti32x4 m9, [o(int_shuf2)]
+ vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
+ vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
+ vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
+ vpbroadcastd m2, [o(pw_16384)]
+ pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3
+ pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
+ pshufb m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3
+ REPX {pmulhrsw x, m2}, m0, m1, m6, m7
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020 ; 0 1
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020 ; 2 3
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ call .main
+ movshdup m4, [o(permC)]
+ psrlq m6, m4, 4
+ vpermq m5, m4, q1032
+ vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3
+ vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3
+ vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1
+ vpbroadcastd m6, [o(pw_2048)]
+.end:
+ REPX {pmulhrsw x, m6}, m0, m4, m1, m5
+.end2:
+ lea r3, [dstq+strideq*4]
+ lea r4, [strideq*3]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r4 ], 1
+ vinserti32x4 m3, [r3 +strideq*0], 2
+ vinserti32x4 m6, [r3 +strideq*2], 2
+ vinserti32x4 m3, [r3 +strideq*1], 3
+ vinserti32x4 m6, [r3 +r4 ], 3
+ pxor m7, m7
+ mova [cq+64*0], m7
+ mova [cq+64*1], m7
+ mova [cq+64*2], m7
+ mova [cq+64*3], m7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m4, m3
+ packuswb m0, m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ punpcklbw m3, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m3
+ paddw m5, m6
+ packuswb m1, m5
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r4 ], ym1, 1
+ vextracti32x4 [r3 +strideq*2], m1, 2
+ vextracti32x4 [r3 +r4 ], m1, 3
+ RET
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ psrlq m10, 4
+.pass1_end:
+ punpcklwd m5, m4, m2
+ punpckhwd m4, m2
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m6, m9
+ vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16}
+ mova m2, m9
+ vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16}
+ vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m6, 12
+ packssdw m1, m6 ; out8 -out7 -out9 out6
+ psrad m2, 12
+ psrad m9, 12
+ packssdw m2, m9 ; -out11 out4 out10 -out5
+ psrlq m4, m10, 4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m10, m2
+ psrlq m5, m10, 8
+ vpermi2q m5, m1, m3
+ psrlq m10, 12
+ vpermt2q m1, m10, m3
+ punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3
+ REPX {pmulhrsw x, m7}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call .main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ psrlq m6, m4, 4
+ mova m5, m4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m6, m2
+ vpermi2q m5, m1, m3
+ vpermt2q m1, m6, m3
+ jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m3, m4, [cq+64*0]
+ pmulhrsw m1, m4, [cq+64*3]
+ pmulhrsw m2, m4, [cq+64*1]
+ pmulhrsw m4, [cq+64*2]
+ mova m5, [o(int16_perm)]
+ kxnorb k1, k1, k1
+ vpblendmd m0{k1}, m1, m3 ; 0 7
+ vmovdqa32 m3{k1}, m1 ; 6 1
+ vpblendmd m1{k1}, m4, m2 ; 2 5
+ vmovdqa32 m2{k1}, m4 ; 4 3
+ REPX {vpermb x, m5, x}, m0, m1, m2, m3
+ IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ pxor m5, m5
+ psubd m5, m6
+ packssdw m6, m5
+ pmulhrsw m2, m6
+ pmulhrsw m3, m6
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ psrlq m10, 20
+ jmp m(iadst_16x8_internal_8bpc).pass1_end
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m0, m6, m1
+ psrlq m1, m4, 12
+ psrlq m4, 8
+ mova m7, m4
+ vpermi2q m4, m0, m3
+ vpermt2q m0, m1, m3
+ vpermi2q m1, m5, m2
+ vpermt2q m5, m7, m2
+ jmp m(idct_16x8_internal_8bpc).end2
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*2]
+ pmulhrsw m0, [cq+64*3]
+ vpbroadcastd m7, [o(pw_1697x16)]
+ vpbroadcastd m8, [o(pw_16384)]
+ shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
+ shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7
+ shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
+ shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7
+ mova m9, [o(int8_permA)]
+ pmulhrsw m0, m7, m2
+ pmulhrsw m1, m7, m3
+ pmulhrsw m6, m7, m4
+ pmulhrsw m7, m5
+ REPX {pmulhrsw x, m8}, m0, m1, m6, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ REPX {vpermb x, m9, x}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ mova m7, [o(permB)]
+ vpbroadcastd m6, [o(pw_4096)]
+ vpermq m0, m7, m2
+ vpermq m4, m7, m4
+ vpermq m1, m7, m3
+ vpermq m5, m7, m5
+ jmp m(idct_16x8_internal_8bpc).end
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m7, [o(permB)]
+ vpermq m0, m7, [cq+64*0]
+ vpermq m1, m7, [cq+64*1]
+ vpermq m2, m7, [cq+64*2]
+ vpermq m3, m7, [cq+64*3]
+ vpermq m4, m7, [cq+64*4]
+ vpermq m5, m7, [cq+64*5]
+ vpermq m6, m7, [cq+64*6]
+ vpermq m7, m7, [cq+64*7]
+ call .main
+ vbroadcasti32x4 m12, [o(int_shuf1)]
+ vbroadcasti32x4 m11, [o(int_shuf2)]
+ vpbroadcastd m13, [o(pw_8192)]
+ pshufb m0, m12
+ pshufb m8, m1, m11
+ pshufb m2, m12
+ pshufb m9, m3, m11
+ pshufb m4, m12
+ pshufb m10, m5, m11
+ pshufb m6, m12
+ pshufb m11, m7, m11
+ REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
+ punpckhdq m1, m0, m8
+ punpckldq m0, m8
+ punpckhdq m3, m2, m9
+ punpckldq m2, m9
+ punpckhdq m5, m4, m10
+ punpckldq m4, m10
+ punpckhdq m7, m6, m11
+ punpckldq m6, m11
+ jmp tx2q
+.pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+ call .main
+ mova m8, [o(permD)]
+ psrlq m12, m8, 4
+ psrlq m9, m8, 8
+ psrlq m13, m8, 12
+ mova m10, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ mova m11, m9
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m13, m3
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m12, m6
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+.end:
+ vpbroadcastd m12, [o(pw_2048)]
+.end2:
+ REPX {pmulhrsw x, m12}, m0, m1, m4, m5
+.end3:
+ REPX {pmulhrsw x, m12}, m8, m9, m10, m11
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ lea r5, [dstq+strideq*8]
+ lea r6, [r4 +strideq*8]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r3 ], 1
+ vinserti32x4 m3, [r4+strideq*0], 2
+ vinserti32x4 m6, [r4+strideq*2], 2
+ vinserti32x4 m3, [r4+strideq*1], 3
+ vinserti32x4 m6, [r4+r3 ], 3
+ mova xm12, [r5+strideq*0]
+ mova xm13, [r5+strideq*2]
+ vinserti32x4 ym12, [r5+strideq*1], 1
+ vinserti32x4 ym13, [r5+r3 ], 1
+ vinserti32x4 m12, [r6+strideq*0], 2
+ vinserti32x4 m13, [r6+strideq*2], 2
+ vinserti32x4 m12, [r6+strideq*1], 3
+ vinserti32x4 m13, [r6+r3 ], 3
+ pxor m7, m7
+ REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m8, m3
+ packuswb m0, m8
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m2
+ paddw m9, m6
+ packuswb m1, m9
+ punpcklbw m2, m12, m7
+ punpckhbw m12, m7
+ paddw m2, m4
+ paddw m10, m12
+ packuswb m2, m10
+ punpcklbw m3, m13, m7
+ punpckhbw m13, m7
+ paddw m3, m5
+ paddw m11, m13
+ packuswb m3, m11
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r3 ], ym1, 1
+ vextracti32x4 [r4+strideq*0], m0, 2
+ vextracti32x4 [r4+strideq*1], m0, 3
+ vextracti32x4 [r4+strideq*2], m1, 2
+ vextracti32x4 [r4+r3 ], m1, 3
+ mova [r5+strideq*0], xm2
+ vextracti32x4 [r5+strideq*1], ym2, 1
+ mova [r5+strideq*2], xm3
+ vextracti32x4 [r5+r3 ], ym3, 1
+ vextracti32x4 [r6+strideq*0], m2, 2
+ vextracti32x4 [r6+strideq*1], m2, 3
+ vextracti32x4 [r6+strideq*2], m3, 2
+ vextracti32x4 [r6+r3 ], m3, 3
+ RET
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast4:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m1189_3920x8)]
+ vpbroadcastd m3, [o(pw_799_4017x8)]
+ pmulhrsw m2, m8 ; t8a t15a
+ pmulhrsw m4, m1 ; t11a t12a
+ pmulhrsw m7, m3 ; t4a t7a
+ pxor m6, m6
+ psubsw m0, m2, m4 ; t11a t12a
+ paddsw m8, m2, m4 ; t8a t15a
+ mova m1, m7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd m10, [o(pd_2048)]
+.main_fast3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast5:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m2598_3166x8)]
+ vpbroadcastd m11, [o(pw_1931_3612x8)]
+ vpbroadcastd m12, [o(pw_m1189_3920x8)]
+ pmulhrsw m8, m2 ; t8a t15a
+ vpbroadcastd m2, [o(pw_799_4017x8)]
+ pmulhrsw m0, m4 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m2276_3406x8)]
+ pmulhrsw m5, m11 ; t10a t13a
+ pmulhrsw m1, m12 ; t11a t12a
+ pmulhrsw m7, m2 ; t4a t7a
+ pmulhrsw m3, m4 ; t5a t6a
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
+ IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call .main_pass1
+ vpbroadcastd m10, [o(pw_8192_m8192)]
+ punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+.pass1_end:
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m12, m3
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0xff00ff00
+ mova m11, m10
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m13, m6
+ kmovd k1, r3d
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+ pxor m7, m7
+ vpsubw m12{k1}, m7, m12
+ jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ mova m4, [o(permB)]
+ psrlq m3, m4, 4
+ vpermq m0, m4, [cq+64*0]
+ vpermq m7, m3, [cq+64*7]
+ vpermq m6, m4, [cq+64*6]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m4, [cq+64*2]
+ vpermq m5, m3, [cq+64*5]
+ vpermq m4, m4, [cq+64*4]
+ vpermq m3, m3, [cq+64*3]
+ call .main
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ mova m2, m10
+ vpdpwssd m2, m5, m13 ; -out5
+ mova m8, m10
+ vpdpwssd m8, m11, m13 ; out4
+ mova m9, m10
+ vpdpwssd m9, m5, m12 ; out10
+ mova m5, m10
+ vpdpwssd m5, m11, m12 ; -out11
+ mova m11, m10
+ vpdpwssd m11, m3, m13 ; -out7
+ mova m14, m10
+ vpdpwssd m14, m4, m13 ; out6
+ mova m13, m10
+ vpdpwssd m13, m3, m12 ; out8
+ vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
+ REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
+ packssdw m2, m8 ; -out5 out4
+ packssdw m5, m9, m5 ; out10 -out11
+ packssdw m3, m11, m14 ; -out7 out6
+ packssdw m4, m13, m10 ; out8 -out9
+ ret
+ALIGN function_align
+.main_pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+cglobal_label .main_pass2b
+ REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+ call .main
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m9, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ shufps m3, m4, q3210 ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m9 ; -out5 out4
+ psubsw m5, m9 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
+ALIGN function_align
+.main:
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3
+ ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5
+ ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7
+ ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9
+ ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
+ ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
+ ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
+ ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9
+ ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13
+ ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11
+ ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
+ ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a
+ ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a
+ ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14
+ ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12
+ vbroadcasti32x4 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ shufps m0, m6, m0, q3210 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
+ ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x16_internal_8bpc).main_pass1
+ vpbroadcastd m10, [o(pw_m8192_8192)]
+ punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp m(iadst_16x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x16_internal_8bpc).main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m7, m5 ; 0 1 4 5
+ vpermt2q m7, m12, m5
+ vpermi2q m9, m6, m4 ; 2 3 6 7
+ vpermt2q m6, m12, m4
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0x00ff00ff
+ mova m11, m10
+ vpermi2q m10, m3, m1 ; 8 9 12 13
+ vpermt2q m3, m13, m1
+ kmovd k1, r3d
+ vpermi2q m11, m2, m0 ; 10 11 14 15
+ vpermt2q m2, m13, m0
+ pxor m0, m0
+ vpsubw m12{k1}, m0, m12
+ pmulhrsw m0, m7, m12
+ pmulhrsw m1, m6, m12
+ pmulhrsw m4, m3, m12
+ pmulhrsw m5, m2, m12
+ jmp m(idct_16x16_internal_8bpc).end3
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m8, [o(int16_perm)]
+ vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpbroadcastd m0, [o(pw_1697x16)]
+ vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
+ vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
+ vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
+ vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m9, m0, m1
+ pmulhrsw m10, m0, m2
+ pmulhrsw m11, m0, m3
+ pmulhrsw m12, m0, m4
+ pmulhrsw m13, m0, m5
+ pmulhrsw m14, m0, m6
+ pmulhrsw m15, m0, m7
+ pmulhrsw m0, m8
+ REPX {psraw x, 1}, m9, m10, m11, m12
+ pavgw m1, m9
+ pavgw m2, m10
+ pavgw m3, m11
+ pavgw m4, m12
+ REPX {psraw x, 1}, m13, m14, m15, m0
+ pavgw m5, m13
+ pavgw m6, m14
+ pavgw m7, m15
+ pavgw m8, m0
+ punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ jmp tx2q
+ALIGN function_align
+.pass2:
+ vpbroadcastd m11, [o(pw_1697x16)]
+ pmulhrsw m12, m11, m0
+ pmulhrsw m13, m11, m1
+ pmulhrsw m14, m11, m2
+ pmulhrsw m15, m11, m3
+ pmulhrsw m8, m11, m4
+ pmulhrsw m9, m11, m5
+ pmulhrsw m10, m11, m6
+ pmulhrsw m11, m7
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ paddsw m0, m12
+ paddsw m1, m13
+ paddsw m2, m14
+ paddsw m3, m15
+ paddsw m8, m4
+ movu m4, [o(permD+2)]
+ paddsw m9, m5
+ paddsw m6, m10
+ paddsw m7, m11
+ psrlq m12, m4, 4
+ mova m5, m4
+ mova m10, m4
+ mova m11, m4
+ vpermi2q m4, m0, m2 ; 8 9 12 13
+ vpermt2q m0, m12, m2 ; 0 1 4 5
+ vpermi2q m5, m1, m3 ; 10 11 14 15
+ vpermt2q m1, m12, m3 ; 2 3 6 7
+ vpermi2q m10, m8, m6
+ vpermt2q m8, m12, m6
+ vpermi2q m11, m9, m7
+ vpermt2q m9, m12, m7
+ jmp m(idct_16x16_internal_8bpc).end
+
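+; ITX_UNPACK_MULHRSW duplicates each source word (punpck{l,h}wd with itself)
+; and multiplies by two broadcast pw_A_Bx8 constant pairs; in the .main_fast
+; paths half of the butterfly inputs are known to be zero, so each rotation
+; reduces to this single scale of the nonzero input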
+%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
+ vpbroadcastd m%4, [o(pw_%5_%6x8)]
+ punpcklwd m%1, m%3, m%3
+ pmulhrsw m%1, m%4
+ vpbroadcastd m%4, [o(pw_%7_%8x8)]
+ punpckhwd m%2, m%3, m%3
+ pmulhrsw m%2, m%4
+%endmacro
+
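+; 8x32: eob selects the amount of work; below the threshold the right half of
+; the coefficient buffer is known to be zero, so only the left half is loaded
+; and the cheaper .fast / .main_fast variants are used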
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ cmp eobd, 107
+ jb .fast
+ mova m5, [cq+64*5]
+ mova m3, [cq+64*3]
+ mova m1, [cq+64*1]
+ mova m7, [cq+64*7]
+ mova m2, [cq+64*2]
+ mova m6, [cq+64*6]
+ mova m0, [cq+64*0]
+ mova m4, [cq+64*4]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m8, [o(idct_8x32p)]
+ vpbroadcastd m9, [o(pw_8192)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpckldq m8, m0, m1 ; ab
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3 ; cd
+ punpckhdq m2, m3
+ punpckldq m3, m4, m5 ; ef
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7 ; gh
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
+ punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9
+ punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21
+ punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13
+ punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17
+ punpcklqdq m20, m3, m5
+ punpckhqdq m16, m3, m5
+ punpcklqdq m19, m4, m6
+ punpckhqdq m17, m4, m6
+ vinserti32x4 ym8, ym18, xm20, 1
+ vshufi32x4 ym1, ym18, ym20, 0x03
+ vinserti32x4 ym9, ym14, xm16, 1
+ vshufi32x4 ym3, ym14, ym16, 0x03
+ vinserti32x4 ym0, ym21, xm19, 1
+ vshufi32x4 ym5, ym21, ym19, 0x03
+ vinserti32x4 ym7, ym15, xm17, 1
+ vshufi32x4 ym6, ym15, ym17, 0x03
+ call m(idct_8x16_internal_8bpc).main2
+ psrlq m12, [o(permB)], 60
+ vpermt2q m14, m12, m16
+ vpermt2q m21, m12, m19
+ vpermt2q m15, m12, m17
+ vpermi2q m12, m18, m20
+ vextracti32x8 ym16, m14, 1
+ vextracti32x8 ym19, m21, 1
+ vextracti32x8 ym17, m15, 1
+ vextracti32x8 ym20, m12, 1
+ call .main2
+ jmp .end
+.fast: ; right half is zero
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+64*4]
+ vinserti32x8 m2, [cq+64*0], 1
+ mova ym3, [cq+64*6]
+ vinserti32x8 m3, [cq+64*2], 1
+ mova ym4, [cq+64*3]
+ vinserti32x8 m4, [cq+64*5], 1
+ mova ym5, [cq+64*7]
+ vinserti32x8 m5, [cq+64*1], 1
+ REPX {vpermb x, m0, x}, m2, m3, m4, m5
+ call m(idct_16x8_internal_8bpc).main2
+ vbroadcasti32x4 m4, [o(int_shuf3)]
+ vbroadcasti32x4 m5, [o(int_shuf4)]
+ pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3
+ pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3
+ pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3
+ pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3
+ vpbroadcastd m4, [o(pw_8192)]
+ psrlq m5, [o(permB)], 60
+ punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
+ punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
+ punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
+ REPX {pmulhrsw x, m4}, m6, m17, m2, m16
+ vinserti32x4 ym0, ym2, xm6, 1 ; 0 2
+ vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6
+ vinserti32x4 ym14, ym16, xm17, 1 ; 1 3
+ vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7
+ vpermt2q m2, m5, m6 ; 8 10
+ vpermt2q m16, m5, m17 ; 9 11
+ vextracti32x8 ym3, m2, 1 ; 12 14
+ vextracti32x8 ym17, m16, 1 ; 13 15
+ call m(idct_8x16_internal_8bpc).main_fast
+ call .main_fast
+.end:
+ vpbroadcastd ym8, strided
+ pmulld ym8, [o(gather8d)]
+ call .main_end
+ lea r3, [dstq+strideq*4]
+ kxnorb k1, k1, k1
+ lea r4, [dstq+strideq*8]
+ pxor m9, m9
+ lea r1, [r3+strideq*8]
+ kmovb k2, k1
+ vpgatherdq m12{k1}, [r0+ym8]
+ kmovb k1, k2
+ vpgatherdq m13{k2}, [r3+ym8]
+ kmovb k2, k1
+ vpgatherdq m14{k1}, [r4+ym8]
+ kmovb k1, k2
+ vpgatherdq m15{k2}, [r1+ym8]
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m11, m12, m9
+ punpckhbw m12, m9
+ paddw m0, m11
+ paddw m1, m12
+ packuswb m0, m1
+ kmovb k2, k1
+ vpscatterdq [r0+ym8]{k1}, m0
+ punpcklbw m12, m13, m9
+ punpckhbw m13, m9
+ paddw m2, m12
+ paddw m3, m13
+ packuswb m2, m3
+ kmovb k1, k2
+ vpscatterdq [r3+ym8]{k2}, m2
+ punpcklbw m13, m14, m9
+ punpckhbw m14, m9
+ paddw m4, m13
+ paddw m5, m14
+ packuswb m4, m5
+ kmovb k2, k1
+ vpscatterdq [r4+ym8]{k1}, m4
+ punpcklbw m14, m15, m9
+ punpckhbw m15, m9
+ paddw m6, m14
+ paddw m7, m15
+ packuswb m6, m7
+ vpscatterdq [r1+ym8]{k2}, m6
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
+INIT_YMM avx512icl
+ALIGN function_align
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ mova m11, m12
+ mova m17, m20
+ mova m15, m21
+ mova m16, m14
+ jmp .main4
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+ jmp .main3
+ALIGN function_align
+cglobal_label .main
+ punpcklwd m12, m21, m14 ; in31 in1
+ punpckhwd m14, m21 ; in3 in29
+ punpcklwd m21, m20, m15 ; in27 in5
+ punpckhwd m15, m20 ; in7 in25
+ punpcklwd m20, m19, m16 ; in23 in9
+ punpckhwd m16, m19 ; in11 in21
+ punpcklwd m19, m18, m17 ; in19 in13
+ punpckhwd m17, m18 ; in15 in17
+.main2:
+ ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+ ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+.main3:
+ psubsw m11, m12, m17 ; t17 t30
+ paddsw m12, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m21, m16 ; t21 t26
+ paddsw m21, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+.main4:
+ ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m8, [o(pw_m3784_1567)]
+ psubsw m19, m12, m20 ; t19a t28a
+ paddsw m20, m12 ; t16a t31a
+ psubsw m12, m14, m21 ; t20a t27a
+ paddsw m14, m21 ; t23a t24a
+ psubsw m21, m11, m17 ; t18 t29
+ paddsw m11, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m16, m15 ; t22 t25
+ ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28
+ ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m18, [o(deint_shuf)]
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m9, [o(pw_2896_2896)]
+ psubsw m15, m20, m14 ; t23 t24
+ paddsw m20, m14 ; t16 t31
+ psubsw m14, m11, m16 ; t22a t25a
+ paddsw m11, m16 ; t17a t30a
+ psubsw m16, m21, m17 ; t21 t26
+ paddsw m21, m17 ; t18 t29
+ psubsw m17, m19, m12 ; t20a t27a
+ paddsw m19, m12 ; t19a t28a
+ REPX {pshufb x, m18}, m20, m11, m21, m19
+ ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a
+ ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25
+ packssdw m18, m13 ; t23a t22
+ packssdw m12, m15 ; t24a t25
+ ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a
+ ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27
+ packssdw m16, m13 ; t20 t21a
+ packssdw m14, m15 ; t27 t26a
+ punpcklqdq m13, m19, m21 ; t19a t18
+ punpckhqdq m19, m21 ; t28a t29
+ punpcklqdq m21, m20, m11 ; t16 t17a
+ punpckhqdq m20, m11 ; t31 t30a
+INIT_ZMM avx512icl
+ mova m15, [o(permA)]
+ ret
+cglobal_label .main_end
+ vpbroadcastd m10, [o(pw_2048)]
+ vpermt2q m0, m15, m1 ; t0 t1 t2 t3
+ vpermt2q m20, m15, m19 ; t31 t30a t29 t28a
+ vpermt2q m2, m15, m3 ; t4 t5 t6 t7
+ vpermt2q m14, m15, m12 ; t27 t26a t25 t24a
+ vpermt2q m4, m15, m5 ; t8 t9 t10 t11
+ vpermt2q m18, m15, m16 ; t23a t22 t21a t20
+ vpermt2q m6, m15, m7 ; t12 t13 t14 t15
+ vpermt2q m13, m15, m21 ; t19a t18 t17a t16
+ psubsw m7, m0, m20 ; out31 out30 out29 out28
+ paddsw m0, m20 ; out0 out1 out2 out3
+ psubsw m5, m2, m14 ; out27 out26 out25 out24
+ paddsw m2, m14 ; out4 out5 out6 out7
+ psubsw m3, m4, m18 ; out23 out22 out21 out20
+ paddsw m4, m18 ; out8 out9 out10 out11
+ psubsw m1, m6, m13 ; out19 out18 out17 out16
+ paddsw m6, m13 ; out12 out13 out14 out15
+ vzeroupper
+ ret
+
+%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
+ vbroadcasti32x4 ym%1, [cq+16*%2]
+ vbroadcasti32x4 ym8, [cq+16*%3]
+ shufpd ym%1, ym8, 0x0c
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ test eobd, eobd
+ jz .dconly
+ lea r5, [o_base]
+ LOAD_PACKED_16X2 0, 0, 2 ; in0 in2
+ LOAD_PACKED_16X2 1, 4, 6 ; in4 in6
+ LOAD_PACKED_16X2 2, 8, 10 ; in8 in10
+ LOAD_PACKED_16X2 3, 12, 14 ; in12 in14
+ LOAD_PACKED_16X2 14, 1, 3 ; in1 in3
+ LOAD_PACKED_16X2 15, 5, 7 ; in5 in7
+ LOAD_PACKED_16X2 16, 9, 11 ; in9 in11
+ LOAD_PACKED_16X2 17, 13, 15 ; in13 in15
+ pxor m4, m4
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ cmp eobd, 107
+ jb .fast
+ LOAD_PACKED_16X2 4, 16, 18 ; in16 in18
+ LOAD_PACKED_16X2 5, 20, 22 ; in20 in22
+ LOAD_PACKED_16X2 6, 24, 26 ; in24 in26
+ LOAD_PACKED_16X2 7, 28, 30 ; in28 in30
+ call m(idct_8x16_internal_8bpc).main
+ LOAD_PACKED_16X2 18, 19, 17 ; in19 in17
+ LOAD_PACKED_16X2 19, 23, 21 ; in23 in21
+ LOAD_PACKED_16X2 20, 27, 25 ; in27 in25
+ LOAD_PACKED_16X2 21, 31, 29 ; in31 in29
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .pass2
+.fast: ; bottom half is zero
+ mova ym5, ym4
+ mova ym6, ym4
+ mova ym7, ym4
+ call m(idct_8x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+.pass2:
+ vpbroadcastd m10, [o(pw_8192)]
+ vpermt2q m0, m15, m4 ; t0 t1 t9 t8
+ vpermt2q m20, m15, m18 ; t31 t30a t23a t22
+ vpermt2q m3, m15, m7 ; t7 t6 t14 t15
+ vpermt2q m12, m15, m21 ; t25 t24a t17a t16
+ vpermt2q m2, m15, m6 ; t4 t5 t13 t12
+ vpermt2q m14, m15, m13 ; t23a t22 t21a t20
+ vpermt2q m1, m15, m5 ; t3 t2 t10 t11
+ vpermt2q m19, m15, m16 ; t27 t26a t19a t18
+ psubsw m8, m0, m20 ; out31 out30 out22 out23
+ paddsw m0, m20 ; out0 out1 out9 out8
+ paddsw m6, m3, m12 ; out7 out6 out14 out15
+ psubsw m3, m12 ; out24 out25 out17 out16
+ psubsw m5, m2, m14 ; out27 out26 out18 out19
+ paddsw m4, m2, m14 ; out4 out5 out13 out12
+ psubsw m7, m1, m19 ; out28 out29 out21 out20
+ paddsw m2, m1, m19 ; out3 out2 out10 out11
+ vzeroupper
+ vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25
+ vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24
+ vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27
+ vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26
+ vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29
+ vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28
+ vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31
+ vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ call .main
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ lea r2, [strideq*3]
+ lea r3, [dstq+strideq*4]
+ movshdup m12, [o(permD)]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ paddw m4, m8
+ paddw m5, m9
+ paddw m6, m10
+ paddw m7, m11
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m12, m0
+ vpermq m2, m12, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ packuswb m4, m5
+ packuswb m6, m7
+ vpermq m4, m12, m4
+ vpermq m6, m12, m6
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly3:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova ym1, [dstq+strideq*0]
+ vinserti32x8 m1, [dstq+strideq*1], 1
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+cglobal_label .main
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
+ ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a
+ ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0
+.main3:
+ paddsw m8, m1, m5 ; t4
+ psubsw m1, m5 ; t5a
+ paddsw m9, m7, m3 ; t7
+ psubsw m7, m3 ; t6a
+ ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6
+ psubsw m5, m0, m2 ; dct4 out2
+ paddsw m2, m0 ; dct4 out1
+ paddsw m0, m4, m6 ; dct4 out0
+ psubsw m4, m6 ; dct4 out3
+ psubsw m6, m2, m1 ; out6
+ paddsw m1, m2 ; out1
+ paddsw m2, m5, m7 ; out2
+ psubsw m5, m7 ; out5
+ psubsw m7, m0, m9 ; out7
+ paddsw m0, m9 ; out0
+ paddsw m3, m4, m8 ; out3
+ psubsw m4, m8 ; out4
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m7, [pw_5]
+ paddsw m0, m7, [cq+64*0]
+ paddsw m1, m7, [cq+64*1]
+ vpbroadcastd ym9, strided
+ paddsw m2, m7, [cq+64*2]
+ paddsw m3, m7, [cq+64*3]
+ paddsw m4, m7, [cq+64*4]
+ paddsw m5, m7, [cq+64*5]
+ paddsw m6, m7, [cq+64*6]
+ paddsw m7, [cq+64*7]
+ pmulld ym14, ym9, [pd_0to15]
+ lea r3, [dstq+strideq*1]
+ lea r4, [dstq+strideq*2]
+ kxnorb k1, k1, k1
+ pxor m13, m13
+ add r1, r4 ; dstq+strideq*3
+ kmovb k2, k1
+ vpgatherdq m9{k1}, [r0+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m10{k2}, [r3+ym14*4]
+ kmovb k2, k1
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpgatherdq m11{k1}, [r4+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m12{k2}, [r1+ym14*4]
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m8, m9, m13 ; 0 8 16 24
+ punpckhbw m9, m13 ; 4 12 20 28
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ kmovb k2, k1
+ vpscatterdq [r0+ym14*4]{k1}, m0
+ punpcklbw m8, m10, m13 ; 1 9 17 25
+ punpckhbw m10, m13 ; 5 13 21 29
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ kmovb k1, k2
+ vpscatterdq [r3+ym14*4]{k2}, m1
+ punpcklbw m8, m11, m13 ; 2 10 18 26
+ punpckhbw m11, m13 ; 6 14 22 30
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ kmovb k2, k1
+ vpscatterdq [r4+ym14*4]{k1}, m2
+ punpcklbw m8, m12, m13 ; 3 11 19 27
+ punpckhbw m12, m13 ; 7 15 23 31
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ vpscatterdq [r1+ym14*4]{k2}, m3
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m0, [pw_4096]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*4]
+ pmulhrsw m6, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*5]
+ pmulhrsw m7, m0, [cq+64*2]
+ pmulhrsw m2, m0, [cq+64*6]
+ pmulhrsw m8, m0, [cq+64*3]
+ pmulhrsw m0, [cq+64*7]
+ mova m13, [int8_permA]
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ punpckldq m1, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m6, m5
+ punpckhdq m6, m5
+ punpckldq m5, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m0
+ punpckhdq m8, m0
+ mova ym9, [dstq+strideq*0]
+ vinserti32x8 m9, [dstq+strideq*2], 1
+ mova ym10, [dstq+strideq*1]
+ vinserti32x8 m10, [dstq+r3 ], 1
+ mova ym11, [r4+strideq*0]
+ vinserti32x8 m11, [r4+strideq*2], 1
+ mova ym12, [r4+strideq*1]
+ vinserti32x8 m12, [r4+r3 ], 1
+ REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
+ pxor m13, m13
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklqdq m0, m1, m4 ; a0 a2 c0 c2
+ punpckhqdq m1, m4 ; b0 b2 d0 d2
+ punpcklqdq m4, m5, m2 ; a1 a3 c1 c3
+ punpckhqdq m5, m2 ; b1 b3 d1 d3
+ punpcklqdq m2, m3, m6 ; e0 e2 g0 g2
+ punpckhqdq m3, m6 ; f0 f2 h0 h2
+ punpcklqdq m6, m7, m8 ; e1 e3 g1 g3
+ punpckhqdq m7, m8 ; f1 f3 h1 h3
+ punpcklbw m8, m9, m13
+ punpckhbw m9, m13
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*2], m0, 1
+ punpcklbw m8, m10, m13
+ punpckhbw m10, m13
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ mova [dstq+strideq*1], ym1
+ vextracti32x8 [dstq+r3 ], m1, 1
+ punpcklbw m8, m11, m13
+ punpckhbw m11, m13
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ mova [r4+strideq*0], ym2
+ vextracti32x8 [r4+strideq*2], m2, 1
+ punpcklbw m8, m12, m13
+ punpckhbw m12, m13
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ mova [r4+strideq*1], ym3
+ vextracti32x8 [r4+r3 ], m3, 1
+ RET
+
+%macro IDCT_16x32_END 3 ; src[1-2], row
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ mova xm9, [dstq+r3 ]
+ vinserti32x4 ym9, [dstq+strideq*2], 1
+ pmulhrsw m%1, m10
+ pmulhrsw m%2, m10
+ vpermb m8, m11, m8
+ vpermb m9, m11, m9
+ mova [cq+64*(%3*2+0)], m13
+ mova [cq+64*(%3*2+1)], m13
+ paddw m8, m%1
+ paddw m9, m%2
+ packuswb m8, m9
+ vpermd m8, m12, m8
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r3 ], m8, 3
+%if %1 != 20
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m15, [o(pw_2896x8)]
+ cmp eobd, 151
+ jb .fast
+ pmulhrsw m5, m15, [cq+64*10]
+ pmulhrsw m3, m15, [cq+64* 6]
+ pmulhrsw m1, m15, [cq+64* 2]
+ pmulhrsw m7, m15, [cq+64*14]
+ pmulhrsw m2, m15, [cq+64* 4]
+ pmulhrsw m6, m15, [cq+64*12]
+ pmulhrsw m0, m15, [cq+64* 0]
+ pmulhrsw m4, m15, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m15, [cq+64* 1]
+ pmulhrsw m21, m15, [cq+64*15]
+ pmulhrsw m18, m15, [cq+64* 9]
+ pmulhrsw m17, m15, [cq+64* 7]
+ pmulhrsw m16, m15, [cq+64* 5]
+ pmulhrsw m19, m15, [cq+64*11]
+ pmulhrsw m20, m15, [cq+64*13]
+ pmulhrsw m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m8, [o(idct_16x32p)]
+ vpbroadcastd m9, [o(pw_16384)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m14, m15, m16, m17, m18, m19, m20, m21
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2
+ punpckldq m3, m4, m5
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m3, m4, m5, m6
+ punpckldq m7, m14, m15
+ punpckhdq m14, m15
+ punpckldq m15, m16, m17
+ punpckhdq m16, m17
+ REPX {pmulhrsw x, m9}, m7, m14, m15, m16
+ punpckldq m17, m18, m19
+ punpckhdq m18, m19
+ punpckldq m19, m20, m21
+ punpckhdq m20, m21
+ REPX {pmulhrsw x, m9}, m17, m18, m19, m20
+ punpcklqdq m21, m8, m1
+ punpckhqdq m8, m1
+ punpcklqdq m1, m0, m2
+ punpckhqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ punpcklqdq m5, m4, m6
+ punpckhqdq m4, m6
+ punpcklqdq m6, m7, m15
+ punpckhqdq m7, m15
+ punpcklqdq m15, m14, m16
+ punpckhqdq m14, m16
+ punpcklqdq m16, m17, m19
+ punpckhqdq m17, m19
+ punpcklqdq m19, m18, m20
+ punpckhqdq m18, m20
+ vinserti32x8 m20, m21, ym2, 1
+ vshufi32x4 m21, m2, q3232
+ vinserti32x8 m2, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m1, ym5, 1
+ vshufi32x4 m1, m5, q3232
+ vinserti32x8 m5, m0, ym4, 1
+ vshufi32x4 m0, m4, q3232
+ vinserti32x8 m4, m6, ym16, 1
+ vshufi32x4 m6, m16, q3232
+ vinserti32x8 m16, m7, ym17, 1
+ vshufi32x4 m7, m17, q3232
+ vinserti32x8 m17, m15, ym19, 1
+ vshufi32x4 m15, m19, q3232
+ vinserti32x8 m19, m14, ym18, 1
+ vshufi32x4 m14, m18, q3232
+ vshufi32x4 m18, m21, m6, q3131 ; 27 5
+ vshufi32x4 m21, m6, q2020 ; 31 1
+ vshufi32x4 m6, m8, m7, q2020 ; 24 8
+ vshufi32x4 m8, m7, q3131 ; 30 2
+ vshufi32x4 m7, m1, m15, q2020 ; 28 4
+ vshufi32x4 m1, m15, q3131 ; 6 26
+ vshufi32x4 m15, m0, m14, q2020 ; 7 25
+ vshufi32x4 m0, m14, q3131 ; 14 18
+ vshufi32x4 m14, m20, m4, q2020 ; 3 29
+ vshufi32x4 m20, m4, q3131 ; 23 9
+ vshufi32x4 m9, m3, m17, q2020 ; 16 0
+ vshufi32x4 m3, m17, q3131 ; 12 20
+ vshufi32x4 m17, m5, m19, q2020 ; 15 17
+ vshufi32x4 m5, m19, q3131 ; 22 10
+ vshufi32x4 m19, m2, m16, q2020 ; 19 13
+ vshufi32x4 m16, m2, m16, q3131 ; 11 21
+ call m(idct_16x16_internal_8bpc).main3
+ call .main_oddhalf
+ jmp .pass2
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_16384)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m17, m0, m1
+ punpckldq m0, m1
+ punpckhdq m16, m2, m3
+ punpckldq m2, m3
+ punpckhdq m18, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m1, m0, ym2, 1
+ vshufi32x4 m3, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vinserti32x8 m15, m17, ym16, 1
+ vshufi32x4 m17, m16, q3232
+ vinserti32x8 m16, m18, ym5, 1
+ vshufi32x4 m18, m5, q3232
+ vshufi32x4 m0, m1, m2, q2020 ; 0 2
+ vshufi32x4 m1, m2, q3131 ; 4 6
+ vshufi32x4 m2, m3, m4, q2020 ; 8 10
+ vshufi32x4 m3, m4, q3131 ; 12 14
+ vshufi32x4 m14, m15, m16, q2020 ; 1 3
+ vshufi32x4 m15, m16, q3131 ; 5 7
+ vshufi32x4 m16, m17, m18, q2020 ; 9 11
+ vshufi32x4 m17, m18, q3131 ; 13 15
+ pxor m6, m6
+ punpckhwd m8, m0, m0
+ punpcklwd m9, m6, m0
+ punpckhwd m0, m3, m3
+ punpckhwd m5, m2, m2
+ punpcklwd m7, m1, m1
+ punpckhwd m1, m1
+ punpcklwd m3, m3
+ punpcklwd m6, m2
+ call m(idct_16x16_internal_8bpc).main_fast5
+ punpcklwd m21, m14, m14
+ punpckhwd m14, m14
+ punpcklwd m18, m15, m15
+ punpckhwd m15, m15
+ punpcklwd m20, m16, m16
+ punpckhwd m16, m16
+ punpcklwd m19, m17, m17
+ punpckhwd m17, m17
+ call .main_oddhalf_fast
+.pass2:
+ vpbroadcastd m10, [o(pw_2048)]
+ mova m11, [o(end_16x32p)]
+ lea r3, [strideq*3]
+ pxor m13, m13
+ psrld m12, m11, 8
+ IDCT_16x32_END 0, 1, 0
+ IDCT_16x32_END 2, 3, 1
+ IDCT_16x32_END 4, 5, 2
+ IDCT_16x32_END 6, 7, 3
+ IDCT_16x32_END 14, 15, 4
+ IDCT_16x32_END 16, 17, 5
+ IDCT_16x32_END 18, 19, 6
+ IDCT_16x32_END 20, 21, 7
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m20, [o(pw_m1380_3857x8)]
+ vpbroadcastd m9, [o(pw_995_3973x8)]
+ vpbroadcastd m16, [o(pw_m601_4052x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ pmulhrsw m20, m15 ; t19a, t28a
+ pmulhrsw m18, m9 ; t20a, t27a
+ pmulhrsw m14, m16 ; t23a, t24a
+ mova m8, m21
+ mova m17, m20
+ mova m15, m18
+ mova m16, m14
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m9, [o(pw_m2751_3035x8)]
+ vpbroadcastd m11, [o(pw_1751_3703x8)]
+ vpbroadcastd m12, [o(pw_m1380_3857x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ vpbroadcastd m8, [o(pw_995_3973x8)]
+ pmulhrsw m17, m9 ; t17a, t30a
+ vpbroadcastd m9, [o(pw_m2106_3513x8)]
+ pmulhrsw m20, m11 ; t18a, t29a
+ vpbroadcastd m11, [o(pw_2440_3290x8)]
+ pmulhrsw m15, m12 ; t19a, t28a
+ vpbroadcastd m12, [o(pw_m601_4052x8)]
+ pmulhrsw m18, m8 ; t20a, t27a
+ pmulhrsw m16, m9 ; t21a, t26a
+ pmulhrsw m19, m11 ; t22a, t25a
+ pmulhrsw m14, m12 ; t23a, t24a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+.main2:
+ psubsw m8, m21, m17 ; t17 t30
+ paddsw m21, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m18, m16 ; t21 t26
+ paddsw m18, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+.main3:
+ ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m11, [o(pw_m3784_1567)]
+ psubsw m19, m21, m20 ; t19a t28a
+ paddsw m21, m20 ; t16a t31a
+ psubsw m20, m14, m18 ; t20a t27a
+ paddsw m14, m18 ; t23a t24a
+ psubsw m18, m8, m17 ; t18 t29
+ paddsw m8, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m15, m16 ; t22 t25
+ ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28
+ ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m9, [o(deint_shuf)]
+ psubsw m16, m21, m14 ; t23 t24
+ paddsw m14, m21 ; t16 t31
+ psubsw m21, m8, m15 ; t22a t25a
+ paddsw m15, m8 ; t17a t30a
+ psubsw m8, m18, m17 ; t21 t26
+ paddsw m18, m17 ; t18 t29
+ paddsw m17, m19, m20 ; t19a t28a
+ psubsw m19, m20 ; t20a t27a
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ REPX {pshufb x, m9}, m14, m15, m18, m17
+ mova m9, m10
+ vpdpwssd m9, m16, m11
+ mova m20, m10
+ vpdpwssd m20, m21, m11
+ psrad m9, 12
+ psrad m20, 12
+ packssdw m9, m20 ; t23a t22
+ mova m20, m10
+ vpdpwssd m20, m16, m12
+ mova m16, m10
+ vpdpwssd m16, m21, m12
+ psrad m20, 12
+ psrad m16, 12
+ packssdw m16, m20, m16 ; t24a t25
+ ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a
+ ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27
+ packssdw m11, m20 ; t27 t26a
+ packssdw m8, m21 ; t20 t21a
+ punpcklqdq m20, m14, m15 ; t16 t17a
+ punpckhqdq m14, m15 ; t31 t30a
+ punpckhqdq m15, m17, m18 ; t28a t29
+ punpcklqdq m17, m18 ; t19a t18
+ psubsw m21, m0, m14 ; out31 out30
+ paddsw m0, m14 ; out0 out1
+ psubsw m14, m7, m20 ; out16 out17
+ paddsw m7, m20 ; out15 out14
+ psubsw m20, m1, m15 ; out28 out29
+ paddsw m1, m15 ; out3 out2
+ psubsw m15, m6, m17 ; out19 out18
+ paddsw m6, m17 ; out12 out13
+ psubsw m17, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m18, m3, m16 ; out24 out25
+ paddsw m3, m16 ; out7 out6
+ psubsw m16, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m19, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m21, [o(permB)]
+ vpermq m1, m21, [cq+64* 0] ; 0 1
+ vpermq m14, m21, [cq+64* 1] ; 2 3
+ vpermq m20, m21, [cq+64* 2] ; 4 5
+ vpermq m15, m21, [cq+64* 3] ; 6 7
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m2, m21, [cq+64* 4] ; 8 9
+ vpermq m16, m21, [cq+64* 5] ; 10 11
+ vpermq m3, m21, [cq+64* 6] ; 12 13
+ vpermq m17, m21, [cq+64* 7] ; 14 15
+ REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
+ pxor m12, m12
+ REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7
+ cmp eobd, 151
+ jb .fast
+ vpermq m9, m21, [cq+64* 8] ; 16 17
+ vpermq m19, m21, [cq+64* 9] ; 18 19
+ vpermq m4, m21, [cq+64*10] ; 20 21
+ vpermq m5, m21, [cq+64*11] ; 22 23
+ vpermq m6, m21, [cq+64*12] ; 24 25
+ vpermq m18, m21, [cq+64*13] ; 26 27
+ vpermq m7, m21, [cq+64*14] ; 28 29
+ vpermq m21, m21, [cq+64*15] ; 30 31
+ REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
+ REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
+ punpcklwd m8, m21, m14 ; 30 2
+ punpckhwd m21, m1 ; 31 1
+ punpcklwd m0, m17, m19 ; 14 18
+ punpckhwd m17, m9 ; 15 17
+ punpcklwd m9, m1 ; 16 0
+ punpckhwd m14, m7 ; 3 29
+ punpcklwd m1, m15, m18 ; 6 26
+ punpckhwd m15, m6 ; 7 25
+ punpcklwd m6, m2 ; 24 8
+ punpckhwd m19, m3 ; 19 13
+ punpcklwd m3, m4 ; 12 20
+ punpckhwd m18, m20 ; 27 5
+ punpcklwd m7, m20 ; 28 4
+ punpckhwd m20, m5, m2 ; 23 9
+ punpcklwd m5, m16 ; 22 10
+ punpckhwd m16, m4 ; 11 21
+ call m(idct_16x16_internal_8bpc).main2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ jmp .pass2
+.fast: ; bottom half zero
+ punpcklwd m8, m14, m14 ; 2
+ punpcklwd m0, m17, m17 ; 14
+ punpcklwd m5, m16, m16 ; 10
+ punpcklwd m9, m12, m1 ; __ 0
+ punpckhwd m21, m1, m1 ; 1
+ punpcklwd m1, m15, m15 ; 6
+ punpcklwd m7, m20, m20 ; 4
+ punpckhwd m19, m3, m3 ; 13
+ punpcklwd m3, m3 ; 12
+ punpcklwd m6, m12, m2 ; __ 8
+ punpckhwd m18, m20, m20 ; 5
+ punpckhwd m20, m2, m2 ; 9
+ call m(idct_16x16_internal_8bpc).main_fast
+ punpckhwd m15, m15 ; 7
+ punpckhwd m14, m14 ; 3
+ punpckhwd m16, m16 ; 11
+ punpckhwd m17, m17 ; 15
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+.pass2:
+ vpbroadcastd m9, [o(pw_16384)]
+ call .transpose_round
+ vshufi32x4 m16, m14, m2, q3131 ; 5
+ vshufi32x4 m14, m2, q2020 ; 1
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m18, q2020 ; 2
+ vshufi32x4 m18, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m6, m21, m4, q3131 ; 12
+ vshufi32x4 m4, m21, m4, q2020 ; 8
+ vshufi32x4 m21, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m7, m5, m15, q3131 ; 14
+ vshufi32x4 m5, m15, q2020 ; 10
+ vshufi32x4 m15, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ call .main_oddhalf
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r2, [strideq*3]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3
+ lea r3, [dstq+strideq*4]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ REPX {pmulhrsw x, m12}, m4, m5, m6, m7
+ lea r4, [dstq+strideq*8]
+ packuswb m0, m1
+ paddw m4, m8
+ paddw m5, m9
+ packuswb m2, m3
+ paddw m6, m10
+ paddw m7, m11
+ pmovzxbw m8, [r4+strideq*0]
+ pmovzxbw m9, [r4+strideq*1]
+ pmovzxbw m10, [r4+strideq*2]
+ pmovzxbw m11, [r4+r2 ]
+ REPX {pmulhrsw x, m12}, m14, m15, m16, m17
+ lea r5, [r3+strideq*8]
+ packuswb m4, m5
+ paddw m14, m8
+ paddw m15, m9
+ packuswb m6, m7
+ paddw m16, m10
+ paddw m17, m11
+ pmovzxbw m8, [r5+strideq*0]
+ pmovzxbw m9, [r5+strideq*1]
+ pmovzxbw m10, [r5+strideq*2]
+ pmovzxbw m11, [r5+r2 ]
+ REPX {pmulhrsw x, m12}, m18, m19, m20, m21
+ packuswb m14, m15
+ paddw m18, m8
+ paddw m19, m9
+ packuswb m16, m17
+ paddw m20, m10
+ paddw m21, m11
+ packuswb m18, m19
+ packuswb m20, m21
+ REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ mova [r4+strideq*0], ym14
+ vextracti32x8 [r4+strideq*1], m14, 1
+ mova [r4+strideq*2], ym16
+ vextracti32x8 [r4+r2 ], m16, 1
+ mova [r5+strideq*0], ym18
+ vextracti32x8 [r5+strideq*1], m18, 1
+ mova [r5+strideq*2], ym20
+ vextracti32x8 [r5+r2 ], m20, 1
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m2, [o(pw_4017x8)]
+ vpbroadcastd m3, [o(pw_799x8)]
+ vpbroadcastd m18, [o(pw_4076x8)]
+ vpbroadcastd m19, [o(pw_401x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m16, [o(pw_3920x8)]
+ pmulhrsw m9, m0 ; t0
+ pmulhrsw m2, m1 ; t7a
+ pmulhrsw m1, m3 ; t4a
+ pmulhrsw m18, m14 ; t15a
+ pmulhrsw m14, m19 ; t8a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m16 ; t12a
+ psubsw m7, m9, m2 ; idct8 out7
+ paddsw m0, m9, m2 ; idct8 out0
+ psubsw m4, m9, m1 ; idct8 out4
+ paddsw m3, m9, m1 ; idct8 out3
+ ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
+ mova m21, m18
+ mova m19, m14
+ mova m16, m15
+ mova m8, m20
+ psubsw m6, m9, m1 ; idct8 out6
+ paddsw m1, m9 ; idct8 out1
+ psubsw m5, m9, m2 ; idct8 out5
+ paddsw m2, m9 ; idct8 out2
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m5, [o(pw_m2276x8)]
+ vpbroadcastd m11, [o(pw_3406x8)]
+ vpbroadcastd m7, [o(pw_4017x8)]
+ vpbroadcastd m12, [o(pw_799x8)]
+ vpbroadcastd m6, [o(pw_3784x8)]
+ vpbroadcastd m10, [o(pw_1567x8)]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m5, m3 ; t5a
+ pmulhrsw m3, m11 ; t6a
+ pmulhrsw m7, m1 ; t7a
+ pmulhrsw m1, m12 ; t4a
+ pmulhrsw m6, m2 ; t3
+ pmulhrsw m2, m10 ; t2
+ pmulhrsw m4, m0 ; t0
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m10, [o(pd_2048)]
+ mova m0, m4 ; t1
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
+ vpbroadcastd m21, [o(pw_4076x8)]
+ vpbroadcastd m8, [o(pw_401x8)]
+ vpbroadcastd m18, [o(pw_m2598x8)]
+ vpbroadcastd m9, [o(pw_3166x8)]
+ vpbroadcastd m19, [o(pw_3612x8)]
+ vpbroadcastd m11, [o(pw_1931x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m12, [o(pw_3920x8)]
+ pmulhrsw m21, m14 ; t15a
+ pmulhrsw m14, m8 ; t8a
+ pmulhrsw m18, m17 ; t9a
+ pmulhrsw m17, m9 ; t14a
+ pmulhrsw m19, m16 ; t13a
+ pmulhrsw m16, m11 ; t10a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m12 ; t12a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
+.main2:
+ paddsw m8, m20, m16 ; t11
+ psubsw m20, m16 ; t10
+ paddsw m16, m15, m19 ; t12
+ psubsw m15, m19 ; t13
+ psubsw m19, m14, m18 ; t9
+ paddsw m14, m18 ; t8
+ psubsw m18, m21, m17 ; t14
+ paddsw m21, m17 ; t15
+.main3:
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ psubsw m17, m14, m8 ; t11a
+ paddsw m8, m14 ; t8a
+ paddsw m14, m18, m15 ; t9
+ psubsw m18, m15 ; t10
+ psubsw m15, m19, m20 ; t13
+ paddsw m19, m20 ; t14
+ paddsw m20, m21, m16 ; t15a
+ psubsw m16, m21, m16 ; t12a
+ ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
+ ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12
+ psubsw m21, m0, m20 ; out15
+ paddsw m0, m20 ; out0
+ psubsw m20, m1, m19 ; out14
+ paddsw m1, m19 ; out1
+ psubsw m19, m2, m18 ; out13
+ paddsw m2, m18 ; out2
+ psubsw m18, m3, m17 ; out12
+ paddsw m3, m17 ; out3
+ psubsw m17, m4, m16 ; out11
+ paddsw m4, m16 ; out4
+ psubsw m16, m5, m15 ; out10
+ paddsw m5, m15 ; out5
+ psubsw m15, m6, m14 ; out9
+ paddsw m6, m14 ; out6
+ psubsw m14, m7, m8 ; out8
+ paddsw m7, m8 ; out7
+ ret
+.transpose_round:
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m14, m16
+ punpckhwd m14, m16
+ punpcklwd m16, m15, m17
+ punpckhwd m15, m17
+ punpcklwd m17, m19, m21
+ punpckhwd m19, m21
+ punpckhwd m21, m18, m20
+ punpcklwd m18, m20
+ punpcklwd m20, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ REPX {pmulhrsw x, m9}, m20, m8, m1, m0
+ punpcklwd m6, m7, m15
+ punpckhwd m7, m15
+ punpcklwd m15, m14, m16
+ punpckhwd m14, m16
+ REPX {pmulhrsw x, m9}, m2, m3, m5, m4
+ punpckhwd m16, m18, m19
+ punpcklwd m18, m19
+ punpcklwd m19, m21, m17
+ punpckhwd m21, m17
+ REPX {pmulhrsw x, m9}, m6, m7, m15, m14
+ punpcklwd m17, m8, m0 ; a2 a6 aa ae
+ punpckhwd m8, m0 ; a3 a7 ab af
+ punpcklwd m0, m20, m1 ; a0 a4 a8 ac
+ punpckhwd m20, m1 ; a1 a5 a9 ad
+ REPX {pmulhrsw x, m9}, m16, m18, m19, m21
+ punpcklwd m1, m2, m5 ; b0 b4 b8 bc
+ punpckhwd m2, m5 ; b1 b5 b9 bd
+ punpcklwd m5, m3, m4 ; b2 b6 ba be
+ punpckhwd m3, m4 ; b3 b7 bb bf
+ punpcklwd m4, m6, m15 ; c0 c4 c8 cc
+ punpckhwd m6, m15 ; c1 c5 c9 cd
+ punpcklwd m15, m7, m14 ; c2 c6 ca ce
+ punpckhwd m7, m14 ; c3 c7 cb cf
+ punpcklwd m14, m18, m19 ; d0 d4 d8 dc
+ punpckhwd m18, m19 ; d1 d5 d9 dd
+ punpcklwd m9, m16, m21 ; d2 d6 da de
+ punpckhwd m16, m21 ; d3 d7 db df
+ vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
+ vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4
+ vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
+ vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
+ vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
+ vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
+ vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
+ vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc
+ vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
+ vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd
+ vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
+ vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd
+ vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
+ vshufi32x4 m15, m9, q3232 ; ca ce da de
+ vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
+ vshufi32x4 m7, m16, q3232 ; cb cf db df
+ ret
+
+%macro IDTX_16x32 4 ; src/dst[1-4]
+ pmulhrsw m%1, m15, [cq+64*%1]
+ pmulhrsw m%2, m15, [cq+64*%2]
+ pmulhrsw m%3, m15, [cq+64*%3]
+ pmulhrsw m%4, m15, [cq+64*%4]
+ pmulhrsw m18, m16, m%1
+ pmulhrsw m19, m16, m%2
+ pmulhrsw m20, m16, m%3
+ pmulhrsw m21, m16, m%4
+ REPX {pmulhrsw x, m17}, m18, m19, m20, m21
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
+%macro IDTX_16x32_STORE 2 ; src[1-2]
+ mova xm17, [dstq+r3*0]
+ vinserti128 ym17, [dstq+r3*4], 1
+ vinserti32x4 m17, [dstq+r3*8], 2
+ vinserti32x4 m17, [dstq+r4*8], 3
+ mova [cq+64*(%1*2+0)], m18
+ mova [cq+64*(%1*2+1)], m18
+ punpcklbw m16, m17, m18
+ punpckhbw m17, m18
+ paddw m16, m%1
+ paddw m17, m%2
+ packuswb m16, m17
+ mova [dstq+r3*0], xm16
+ vextracti128 [dstq+r3*4], ym16, 1
+ vextracti32x4 [dstq+r3*8], m16, 2
+ vextracti32x4 [dstq+r4*8], m16, 3
+%if %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
+ vpbroadcastd m15, [pw_2896x8]
+ vpbroadcastd m16, [pw_1697x16]
+ vpbroadcastd m17, [pw_16384]
+ IDTX_16x32 0, 1, 2, 3
+ IDTX_16x32 4, 5, 6, 7
+ IDTX_16x32 8, 9, 10, 11
+ IDTX_16x32 12, 13, 14, 15
+ vpbroadcastd m16, [pw_8192]
+ call .transpose_2x8x8_round
+ lea r3, [strideq*2]
+ lea r4, [strideq*3]
+ pxor m18, m18
+ IDTX_16x32_STORE 0, 8
+ IDTX_16x32_STORE 1, 9
+ IDTX_16x32_STORE 2, 10
+ IDTX_16x32_STORE 3, 11
+ IDTX_16x32_STORE 4, 12
+ IDTX_16x32_STORE 5, 13
+ IDTX_16x32_STORE 6, 14
+ IDTX_16x32_STORE 7, 15
+ RET
+ALIGN function_align
+.transpose_2x8x8_round:
+ punpckhwd m17, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m17, m1
+ punpckhdq m17, m1
+ REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m17
+ punpcklqdq m6, m17
+ punpckhwd m17, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m17, m9
+ punpckhdq m17, m9
+ REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m17
+ punpcklqdq m14, m17
+ ret
+
+%macro IDTX_32x16 4 ; dst[1-4]
+ pmulhrsw m%2, m12, [cq+32*(%1+ 0)]
+ pmulhrsw m18, m12, [cq+32*(%1+16)]
+ pmulhrsw m%4, m12, [cq+32*(%3+ 0)]
+ pmulhrsw m19, m12, [cq+32*(%3+16)]
+ REPX {paddsw x, x}, m%2, m18, m%4, m19
+ mova m%1, m14
+ vpermi2q m%1, m%2, m18
+ vpermt2q m%2, m16, m18
+%if %3 != 14
+ mova m%3, m14
+%endif
+ vpermi2q m%3, m%4, m19
+ vpermt2q m%4, m16, m19
+ pmulhrsw m18, m17, m%1
+ pmulhrsw m19, m17, m%2
+ pmulhrsw m20, m17, m%3
+ pmulhrsw m21, m17, m%4
+ REPX {paddsw x, x}, m%1, m%2, m%3, m%4
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
+%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
+ mova ym19, [dstq+strideq*0]
+ vinserti32x8 m19, [dstq+strideq*8], 1
+%if %3 == 0
+ mova [cq+64*(%1*2+0)], m20
+ mova [cq+64*(%1*2+1)], m20
+%endif
+ punpcklbw m18, m19, m20
+ punpckhbw m19, m20
+ paddw m18, m%1
+ paddw m19, m%2
+ packuswb m18, m19
+ mova [dstq+strideq*0], ym18
+ vextracti32x8 [dstq+strideq*8], m18, 1
+%if %3 || %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
+ vpbroadcastd m12, [pw_2896x8]
+ movu m14, [permB+7]
+ vpbroadcastd m17, [pw_1697x16]
+ psrlq m16, m14, 4
+ IDTX_32x16 0, 1, 2, 3
+ IDTX_32x16 4, 5, 6, 7
+ IDTX_32x16 8, 9, 10, 11
+ IDTX_32x16 12, 13, 14, 15
+ vpbroadcastd m16, [pw_2048]
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ pxor m20, m20
+ IDTX_32x16_STORE 0, 8
+ IDTX_32x16_STORE 1, 9
+ IDTX_32x16_STORE 2, 10
+ IDTX_32x16_STORE 3, 11
+ IDTX_32x16_STORE 4, 12
+ IDTX_32x16_STORE 5, 13
+ IDTX_32x16_STORE 6, 14
+ IDTX_32x16_STORE 7, 15
+ RET
+
+%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
+ pmovzxbw m10, [dstq+%3]
+ pmovzxbw m11, [r3 +%4]
+%if %2 < 8
+ paddsw m8, m%2, m%1
+ psubsw m9, m%2, m%1
+%else
+ mova m9, [cq+64*(%2*2-16)]
+ paddsw m8, m9, m%1
+ psubsw m9, m%1
+%endif
+ pmulhrsw m8, m12
+ pmulhrsw m9, m12
+%if %2 >= 8
+%if %2 == 8
+ pxor m0, m0
+%endif
+ mova [cq+64*(%2*2-16)], m0
+ mova [cq+64*(%2*2-15)], m0
+%endif
+ paddw m8, m10
+ paddw m9, m11
+ packuswb m8, m9
+ vpermq m8, m13, m8
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 136
+ jb .fast
+ mova m5, [cq+64*20]
+ mova m3, [cq+64*12]
+ mova m1, [cq+64* 4]
+ mova m7, [cq+64*28]
+ mova m2, [cq+64* 8]
+ mova m6, [cq+64*24]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 2]
+ mova m21, [cq+64*30]
+ mova m18, [cq+64*18]
+ mova m17, [cq+64*14]
+ mova m16, [cq+64*10]
+ mova m19, [cq+64*22]
+ mova m20, [cq+64*26]
+ mova m15, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 1]
+ mova m21, [cq+64*31]
+ mova m14, [cq+64*17]
+ mova m29, [cq+64*15]
+ mova m26, [cq+64* 9]
+ mova m17, [cq+64*23]
+ mova m18, [cq+64*25]
+ mova m25, [cq+64* 7]
+ mova m24, [cq+64* 5]
+ mova m19, [cq+64*27]
+ mova m16, [cq+64*21]
+ mova m27, [cq+64*11]
+ mova m28, [cq+64*13]
+ mova m15, [cq+64*19]
+ mova m20, [cq+64*29]
+ mova m23, [cq+64* 3]
+ call .main_oddhalf
+ vpbroadcastd m10, [o(pw_8192)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
+ REPX {pmulhrsw x, m10}, m0, m4, m8, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m10}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ punpcklqdq m5, m23, m27 ; d00 d08 d16 d24
+ punpckhqdq m23, m27 ; d01 d09 d17 d25
+ punpckhqdq m27, m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m25, m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m26, m9, m24 ; d07 d15 d23 d31
+ punpcklqdq m9, m24 ; d06 d14 d22 d30
+ REPX {pmulhrsw x, m10}, m25, m3, m26
+ mova [cq+64* 9], m23
+ mova [cq+64*11], m27
+ mova [cq+64*13], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m12, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m12, m19 ; 21
+ paddsw m12, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ paddsw m16, m4, m15 ; 14
+ psubsw m4, m15 ; 17
+ pmulhrsw m15, m6, m10
+ psubsw m6, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ pmulhrsw m14, m7, m10
+ punpcklwd m7, m6, m4
+ punpckhwd m6, m4
+ punpckhwd m4, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m23, m10
+ pmulhrsw m25, m10
+ punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m10}, m28, m2, m12, m27
+ punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m10}, m16, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m6, m4
+ punpckldq m6, m4
+ REPX {pmulhrsw x, m10}, m26, m19, m21, m6
+ punpckhdq m4, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m7, m17
+ punpckldq m7, m17
+ REPX {pmulhrsw x, m10}, m4, m18, m20, m7
+ punpcklqdq m17, m28, m12 ; b02 b10 b18 b26
+ punpckhqdq m28, m12 ; b03 b11 b19 b27
+ punpckhqdq m12, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpckhqdq m27, m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m29, m16, m11 ; b07 b15 b23 b31
+ punpcklqdq m16, m11 ; b06 b14 b22 b30
+ mova [cq+64* 1], m12
+ mova [cq+64* 3], m28
+ mova [cq+64* 5], m27
+ mova [cq+64* 7], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m7, m19 ; c01 c09 c17 c25
+ punpcklqdq m7, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m6, m18 ; c05 c13 c21 c29
+ punpcklqdq m6, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m4 ; c07 c15 c23 c31
+ punpcklqdq m21, m4 ; c06 c14 c22 c30
+ pmulhrsw m19, m9, m10
+ vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08
+ vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08
+ vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12
+ vshufi32x4 m3, m1, m6, q3131 ; 12
+ vshufi32x4 m1, m6, q2020 ; 4
+ vshufi32x4 m6, m4, m2, q3131 ; 24
+ vshufi32x4 m4, m2, q2020 ; 16
+ vshufi32x4 m2, m0, m7, q3131 ; 8
+ vshufi32x4 m0, m7, q2020 ; 0
+ vshufi32x4 m7, m5, m8, q3131 ; 28
+ vshufi32x4 m5, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10
+ vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10
+ vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14
+ vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14
+ vshufi32x4 m16, m14, m20, q3131 ; 10
+ vshufi32x4 m14, m20, q2020 ; 2
+ vshufi32x4 m20, m18, m17, q3131 ; 26
+ vshufi32x4 m18, m17, q2020 ; 18
+ vshufi32x4 m17, m15, m21, q3131 ; 14
+ vshufi32x4 m15, m21, q2020 ; 6
+ vshufi32x4 m21, m19, m13, q3131 ; 30
+ vshufi32x4 m19, m13, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m15, [cq+64* 1]
+ mova m16, [cq+64* 3]
+ mova m17, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m13, [cq+64*13]
+ mova m18, [cq+64*15]
+ vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15
+ vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31
+ vshufi32x4 m18, m14, m26, q3131 ; 25
+ vshufi32x4 m14, m26, q2020 ; 17
+ vshufi32x4 m19, m15, m27, q3131 ; 27
+ vshufi32x4 m15, m27, q2020 ; 19
+ vshufi32x4 m20, m16, m28, q3131 ; 29
+ vshufi32x4 m16, m28, q2020 ; 21
+ vshufi32x4 m21, m17, m29, q3131 ; 31
+ vshufi32x4 m17, m29, q2020 ; 23
+ vshufi32x4 m26, m22, m8, q3131 ; 9
+ vshufi32x4 m22, m8, q2020 ; 1
+ vshufi32x4 m27, m23, m9, q3131 ; 11
+ vshufi32x4 m23, m9, q2020 ; 3
+ vshufi32x4 m28, m24, m11, q3131 ; 13
+ vshufi32x4 m24, m11, q2020 ; 5
+ vshufi32x4 m29, m25, m12, q3131 ; 15
+ vshufi32x4 m25, m12, q2020 ; 7
+ call .main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m14, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ pmovzxwd m6, [cq+64* 8]
+ vpermb m8, m14, [cq+64* 2]
+ vpermb ym0, ym14, [cq+64*14]
+ vpermb ym5, ym14, [cq+64*10]
+ vpermb m1, m14, [cq+64* 6]
+ vpermb m7, m14, [cq+64* 4]
+ vpermb ym3, ym14, [cq+64*12]
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m21, m14, [cq+64* 1]
+ vpermb ym17, ym14, [cq+64*15]
+ vpermb ym20, ym14, [cq+64* 9]
+ vpermb m15, m14, [cq+64* 7]
+ vpermb m18, m14, [cq+64* 5]
+ vpermb ym16, ym14, [cq+64*11]
+ vpermb ym19, ym14, [cq+64*13]
+ vpermb m14, m14, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m22, m14, m2, q2020 ; 1
+ vshufi32x4 m24, m14, m2, q3131 ; 5
+ vshufi32x4 m23, m17, m9, q2020 ; 3
+ vshufi32x4 m25, m17, m9, q3131 ; 7
+ vshufi32x4 m16, m5, m15, q2020 ; 10
+ vshufi32x4 m17, m5, m15, q3131 ; 14
+ vshufi32x4 m14, m1, m18, q2020 ; 2
+ vshufi32x4 m15, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m21, m4, q3131 ; 12
+ vshufi32x4 m2, m21, m4, q2020 ; 8
+ vshufi32x4 m26, m20, m6, q2020 ; 9
+ vshufi32x4 m28, m20, m6, q3131 ; 13
+ vshufi32x4 m27, m19, m7, q2020 ; 11
+ vshufi32x4 m29, m19, m7, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ call .main_oddhalf_fast
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r3, [dstq+r4*8]
+ lea r5, [strideq+r4] ; stride*4
+ add r3, r5 ; dst+stride*28
+ IDCT_32x32_END 29, 0, strideq*0, r4
+ IDCT_32x32_END 28, 1, strideq*1, strideq*2
+ IDCT_32x32_END 27, 2, strideq*2, strideq*1
+ IDCT_32x32_END 26, 3, r4 , strideq*0
+ IDCT_32x32_END 25, 4, strideq*0, r4
+ IDCT_32x32_END 24, 5, strideq*1, strideq*2
+ IDCT_32x32_END 23, 6, strideq*2, strideq*1
+ IDCT_32x32_END 22, 7, r4 , strideq*0
+ IDCT_32x32_END 21, 8, strideq*0, r4
+ IDCT_32x32_END 20, 9, strideq*1, strideq*2
+ IDCT_32x32_END 19, 10, strideq*2, strideq*1
+ IDCT_32x32_END 18, 11, r4 , strideq*0
+ IDCT_32x32_END 17, 12, strideq*0, r4
+ IDCT_32x32_END 16, 13, strideq*1, strideq*2
+ IDCT_32x32_END 15, 14, strideq*2, strideq*1
+ IDCT_32x32_END 14, 15, r4 , strideq*0
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m9, [o(pw_3857x8)]
+ vpbroadcastd m19, [o(pw_3973x8)]
+ vpbroadcastd m11, [o(pw_995x8)]
+ vpbroadcastd m28, [o(pw_m601x8)]
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m21, m22 ; t31a
+ pmulhrsw m22, m8 ; t16a
+ pmulhrsw m18, m25 ; t19a
+ pmulhrsw m25, m9 ; t28a
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m11 ; t20a
+ pmulhrsw m28, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ mova m15, m21
+ mova m8, m22
+ mova m14, m18
+ mova m27, m25
+ mova m29, m19
+ mova m26, m24
+ mova m16, m28
+ mova m20, m23
+ jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m14, [o(pw_m2751x8)]
+ vpbroadcastd m9, [o(pw_3035x8)]
+ vpbroadcastd m17, [o(pw_3703x8)]
+ vpbroadcastd m11, [o(pw_1751x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m12, [o(pw_3857x8)]
+ pmulhrsw m21, m22 ; t31a
+ vpbroadcastd m19, [o(pw_3973x8)]
+ pmulhrsw m22, m8 ; t16a
+ vpbroadcastd m8, [o(pw_995x8)]
+ pmulhrsw m14, m29 ; t30a
+ vpbroadcastd m16, [o(pw_m2106x8)]
+ pmulhrsw m29, m9 ; t17a
+ vpbroadcastd m9, [o(pw_3513x8)]
+ pmulhrsw m17, m26 ; t29a
+ vpbroadcastd m15, [o(pw_3290x8)]
+ pmulhrsw m26, m11 ; t18a
+ vpbroadcastd m11, [o(pw_2440x8)]
+ pmulhrsw m18, m25 ; t19a
+ vpbroadcastd m20, [o(pw_m601x8)]
+ pmulhrsw m25, m12 ; t28a
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m8 ; t20a
+ pmulhrsw m16, m27 ; t21a
+ pmulhrsw m27, m9 ; t26a
+ pmulhrsw m15, m28 ; t25a
+ pmulhrsw m28, m11 ; t22a
+ pmulhrsw m20, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a
+ ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a
+.main2:
+ psubsw m8, m22, m14 ; t17
+ paddsw m22, m14 ; t16
+ paddsw m14, m18, m26 ; t19
+ psubsw m18, m26 ; t18
+ psubsw m26, m24, m16 ; t21
+ paddsw m24, m16 ; t20
+ psubsw m16, m20, m28 ; t22
+ paddsw m28, m20 ; t23
+ psubsw m20, m23, m15 ; t25
+ paddsw m23, m15 ; t24
+ psubsw m15, m21, m29 ; t30
+ paddsw m21, m29 ; t31
+ psubsw m29, m19, m27 ; t26
+ paddsw m19, m27 ; t27
+ paddsw m27, m25, m17 ; t28
+ psubsw m25, m17 ; t29
+.main3:
+ ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a
+ ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ psubsw m17, m21, m27 ; t28a
+ paddsw m21, m27 ; t31a
+ psubsw m27, m15, m25 ; t18
+ paddsw m15, m25 ; t17
+ psubsw m25, m20, m29 ; t21
+ paddsw m20, m29 ; t22
+ psubsw m29, m8, m18 ; t29
+ paddsw m8, m18 ; t30
+ psubsw m18, m22, m14 ; t19a
+ paddsw m22, m14 ; t16a
+ psubsw m14, m28, m24 ; t20a
+ paddsw m24, m28 ; t23a
+ paddsw m28, m16, m26 ; t25
+ psubsw m16, m26 ; t26
+ psubsw m26, m23, m19 ; t27a
+ paddsw m23, m19 ; t24a
+ ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
+ ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
+ ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ psubsw m19, m27, m25 ; t26
+ paddsw m27, m25 ; t29
+ psubsw m25, m17, m26 ; t20a
+ paddsw m17, m26 ; t19a
+ paddsw m26, m18, m14 ; t28a
+ psubsw m18, m14 ; t27a
+ paddsw m14, m22, m24 ; t16
+ psubsw m22, m24 ; t23
+ psubsw m24, m29, m16 ; t21
+ paddsw m16, m29 ; t18
+ paddsw m29, m21, m23 ; t31
+ psubsw m21, m23 ; t24
+ psubsw m23, m15, m20 ; t22a
+ paddsw m15, m20 ; t17a
+ psubsw m20, m8, m28 ; t25a
+ paddsw m28, m8 ; t30a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27
+ ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
+ ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25
+ ret
+
+%macro IDTX_32x32 2 ; dst[1-2]
+ vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
+ vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to
+ vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements
+ vmovdqa32 ym18, [cq+64*(%2+16)]
+ vpermt2q m%1, m21, m17
+ vpermt2q m%2, m21, m18
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
+ movu m21, [permB+7]
+ vpbroadcastd m16, [pw_8192]
+ pxor m20, m20
+.loop:
+ IDTX_32x32 0, 1
+ IDTX_32x32 2, 3
+ IDTX_32x32 4, 5
+ IDTX_32x32 6, 7
+ IDTX_32x32 8, 9
+ IDTX_32x32 10, 11
+ IDTX_32x32 12, 13
+ IDTX_32x32 14, 15
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ IDTX_32x16_STORE 0, 8, 1
+ IDTX_32x16_STORE 1, 9, 1
+ IDTX_32x16_STORE 2, 10, 1
+ IDTX_32x16_STORE 3, 11, 1
+ IDTX_32x16_STORE 4, 12, 1
+ IDTX_32x16_STORE 5, 13, 1
+ IDTX_32x16_STORE 6, 14, 1
+ IDTX_32x16_STORE 7, 15, 1
+ lea dstq, [dstq+strideq*8]
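+ ; cq is 64-byte aligned, so btc first sets bit 5 (selecting the upper
+ ; 32 bytes of each 64-byte coefficient row) with CF=0, then clears it
+ ; with CF=1 on the second pass, ending the loop after two iterations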
+ btc cq, 5
+ jnc .loop
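+ ; clear the entire 32x32 coefficient buffer (8 iterations of 4*64 bytes)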
+ mov r0d, 8
+.zero_loop:
+ mova [cq+64*0], m20
+ mova [cq+64*1], m20
+ mova [cq+64*2], m20
+ mova [cq+64*3], m20
+ add cq, 64*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 151
+ jb .fast
+ mova m5, [cq+64*10]
+ mova m3, [cq+64* 6]
+ mova m1, [cq+64* 2]
+ mova m7, [cq+64*14]
+ mova m2, [cq+64* 4]
+ mova m6, [cq+64*12]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 1]
+ mova m21, [cq+64*15]
+ mova m18, [cq+64* 9]
+ mova m17, [cq+64* 7]
+ mova m16, [cq+64* 5]
+ mova m19, [cq+64*11]
+ mova m20, [cq+64*13]
+ mova m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ vpbroadcastd m9, [o(pw_8192)]
+%macro TRANSPOSE_8x4_ROUND 4
+ punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7
+ REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
+%endmacro
+ TRANSPOSE_8x4_ROUND 0, 1, 2, 3
+ TRANSPOSE_8x4_ROUND 4, 5, 6, 7
+ TRANSPOSE_8x4_ROUND 14, 15, 16, 17
+ TRANSPOSE_8x4_ROUND 18, 19, 20, 21
+ vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4
+ vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12
+ vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5
+ vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13
+ vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6
+ vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14
+ vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7
+ vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15
+ vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4
+ vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12
+ vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5
+ vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13
+ vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6
+ vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14
+ vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7
+ vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15
+ vshufi32x4 m22, m26, m4, q2020 ; 0 1
+ vshufi32x4 m26, m4, q3131 ; 8 9
+ vshufi32x4 m23, m27, m5, q2020 ; 2 3
+ vshufi32x4 m27, m5, q3131 ; 10 11
+ vshufi32x4 m24, m28, m6, q2020 ; 4 5
+ vshufi32x4 m28, m6, q3131 ; 12 13
+ vshufi32x4 m25, m29, m7, q2020 ; 6 7
+ vshufi32x4 m29, m7, q3131 ; 14 15
+ vshufi32x4 m4, m0, m14, q2020 ; 16 17
+ vshufi32x4 m3, m0, m14, q3131 ; 24 25
+ vshufi32x4 m20, m1, m15, q2020 ; 18 19
+ vshufi32x4 m19, m1, m15, q3131 ; 26 27
+ vshufi32x4 m5, m2, m16, q2020 ; 20 21
+ vshufi32x4 m0, m2, m16, q3131 ; 28 29
+ vshufi32x4 m16, m8, m17, q2020 ; 22 23
+ vshufi32x4 m17, m8, m17, q3131 ; 30 31
+ pxor m6, m6
+ mova [cq+64* 0], m4
+ mova [cq+64* 2], m5
+ mova [cq+64* 4], m3
+ mova [cq+64* 6], m0
+ punpcklwd m8, m24, m24 ; 4
+ punpcklwd m0, m0 ; 28
+ punpcklwd m5, m5 ; 20
+ punpcklwd m1, m28, m28 ; 12
+ punpcklwd m7, m26, m26 ; 8
+ punpcklwd m3, m3 ; 24
+ punpcklwd m9, m6, m22 ; __ 0
+ punpcklwd m6, m4 ; __ 16
+ call m(idct_16x16_internal_8bpc).main_fast3
+ mova [cq+64* 1], m20
+ mova [cq+64* 3], m16
+ mova [cq+64* 5], m19
+ mova [cq+64* 7], m17
+ punpcklwd m21, m23, m23 ; 2
+ punpcklwd m17, m17 ; 30
+ punpcklwd m20, m20 ; 18
+ punpcklwd m15, m29, m29 ; 14
+ punpcklwd m18, m27, m27 ; 10
+ punpcklwd m16, m16 ; 22
+ punpcklwd m19, m19 ; 26
+ punpcklwd m14, m25, m25 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ mova m21, [cq+64* 7]
+ mova m14, [cq+64* 0]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 4]
+ mova m19, [cq+64* 5]
+ mova m16, [cq+64* 2]
+ mova m15, [cq+64* 1]
+ mova m20, [cq+64* 6]
+ REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+ m24, m19, m16, m27, m28, m15, m20, m23
+ call .main_oddhalf
+ jmp .end
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_8192)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m28, m0, m1
+ punpckldq m0, m1
+ punpckhdq m27, m2, m3
+ punpckldq m2, m3
+ punpckhdq m22, m4, m5
+ punpckldq m4, m5
+ punpckhdq m23, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m14, m0, ym2, 1
+ vshufi32x4 m15, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vshufi32x4 m21, m14, m2, q2020 ; 0 2
+ vshufi32x4 m14, m2, q3131 ; 4 6
+ vshufi32x4 m18, m15, m4, q2020 ; 8 10
+ vshufi32x4 m15, m4, q3131 ; 12 14
+ pxor m9, m9
+ punpcklwd m8, m14, m14 ; 4
+ punpcklwd m1, m15, m15 ; 12
+ punpcklwd m7, m18, m18 ; 8
+ punpcklwd m9, m21 ; __ 0
+ call m(idct_16x16_internal_8bpc).main_fast4
+ punpckhwd m21, m21 ; 2
+ punpckhwd m15, m15 ; 14
+ punpckhwd m18, m18 ; 10
+ punpckhwd m14, m14 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vinserti32x8 m24, m28, ym27, 1
+ vshufi32x4 m28, m27, q3232
+ vinserti32x8 m27, m22, ym23, 1
+ vshufi32x4 m22, m23, q3232
+ vshufi32x4 m23, m24, m27, q2020 ; 1 3
+ vshufi32x4 m24, m27, q3131 ; 5 7
+ vshufi32x4 m27, m28, m22, q2020 ; 9 11
+ vshufi32x4 m28, m22, q3131 ; 13 15
+ punpcklwd m22, m23, m23 ; 1
+ punpckhwd m29, m28, m28 ; 15
+ punpcklwd m26, m27, m27 ; 9
+ punpckhwd m25, m24, m24 ; 7
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ punpcklwd m24, m24 ; 5
+ punpckhwd m27, m27 ; 11
+ punpcklwd m28, m28 ; 13
+ punpckhwd m23, m23 ; 3
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call .main_oddhalf_fast
+.end:
+ imul r6, strideq, 60
+ mova m10, [o(end_16x32p)]
+ vpbroadcastd m11, [o(pw_2048)]
+ lea r3, [strideq*3]
+ pxor m12, m12
+ add r6, dstq ; dst+stride*60
+ psrldq m13, m10, 1
+ lea r4, [strideq+r3] ; stride*4
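+ ; each IDCT_16x64_END writes two output rows at the top of the block
+ ; (via dstq) and the two mirrored rows at the bottom (via r6)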
+%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
+%if %1 & 1
+ %define %%s0 r3
+ %define %%s1 strideq*2
+ %define %%s2 strideq*1
+ %define %%s3 strideq*0
+%else
+ %define %%s0 strideq*0
+ %define %%s1 strideq*1
+ %define %%s2 strideq*2
+ %define %%s3 r3
+%if %1
+ add dstq, r4
+ sub r6, r4
+%endif
+%endif
+%if %1 < 8
+ pmulhrsw m8, m11, m%1
+ pmulhrsw m9, m11, m%2
+%else
+ mova m9, [cq+64*%1]
+ paddsw m8, m9, m%2 ; out 0+n, 1+n
+ psubsw m9, m%2 ; out 63-n, 62-n
+ pmulhrsw m8, m11
+ pmulhrsw m9, m11
+%endif
+ mova xm29, [dstq+%%s0]
+ vinserti128 ym29, [dstq+%%s1], 1
+ mova xm%3, [r6 +%%s3]
+ vinserti128 ym%3, [r6 +%%s2], 1
+ vpermb m29, m10, m29
+ vpermb m%3, m10, m%3
+ mova [cq+64*%1], m12
+ paddw m29, m8
+ paddw m%3, m9
+ packuswb m29, m%3
+ vpermd m29, m13, m29
+ mova [dstq+%%s0], xm29
+ vextracti128 [dstq+%%s1], ym29, 1
+ vextracti32x4 [r6 +%%s2], m29, 2
+ vextracti32x4 [r6 +%%s3], m29, 3
+%endmacro
+ IDCT_16x64_END 0, 29, 0
+ IDCT_16x64_END 1, 28, 28
+ IDCT_16x64_END 2, 27, 28
+ IDCT_16x64_END 3, 26, 28
+ IDCT_16x64_END 4, 25, 28
+ IDCT_16x64_END 5, 24, 28
+ IDCT_16x64_END 6, 23, 28
+ IDCT_16x64_END 7, 22, 28
+ IDCT_16x64_END 8, 21, 28
+ IDCT_16x64_END 9, 20, 28
+ IDCT_16x64_END 10, 19, 28
+ IDCT_16x64_END 11, 18, 28
+ IDCT_16x64_END 12, 17, 28
+ IDCT_16x64_END 13, 16, 28
+ IDCT_16x64_END 14, 15, 28
+ IDCT_16x64_END 15, 14, 28
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m21, [o(pw_m1474_3822x8)]
+ vpbroadcastd m14, [o(pw_897_3996x8)]
+ vpbroadcastd m17, [o(pw_m700_4036x8)]
+ vpbroadcastd m18, [o(pw_501_4065x8)]
+ vpbroadcastd m19, [o(pw_m1092_3948x8)]
+ vpbroadcastd m16, [o(pw_1285_3889x8)]
+ vpbroadcastd m15, [o(pw_m301_4085x8)]
+ pmulhrsw m8, m22 ; t32a t63a
+ pmulhrsw m21, m29 ; t35a t60a
+ pmulhrsw m14, m26 ; t36a t59a
+ pmulhrsw m17, m25 ; t39a t56a
+ pmulhrsw m18, m24 ; t40a t55a
+ pmulhrsw m19, m27 ; t43a t52a
+ pmulhrsw m16, m28 ; t44a t51a
+ pmulhrsw m15, m23 ; t47a t48a
+ mova m22, m8
+ mova m29, m21
+ mova m26, m14
+ mova m25, m17
+ mova m24, m18
+ mova m27, m19
+ mova m28, m16
+ mova m20, m15
+ jmp .main_oddhalf2
+ALIGN function_align
+cglobal_label .main_oddhalf
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m9, [o(pw_m2824_2967x8)]
+ vpbroadcastd m11, [o(pw_1660_3745x8)]
+ vpbroadcastd m12, [o(pw_m1474_3822x8)]
+ pmulhrsw m22, m8 ; t32a t63a
+ vpbroadcastd m8, [o(pw_897_3996x8)]
+ pmulhrsw m21, m9 ; t33a t62a
+ vpbroadcastd m9, [o(pw_m2191_3461x8)]
+ pmulhrsw m14, m11 ; t34a t61a
+ vpbroadcastd m11, [o(pw_2359_3349x8)]
+ pmulhrsw m29, m12 ; t35a t60a
+ vpbroadcastd m12, [o(pw_m700_4036x8)]
+ pmulhrsw m26, m8 ; t36a t59a
+ vpbroadcastd m8, [o(pw_501_4065x8)]
+ pmulhrsw m17, m9 ; t37a t58a
+ vpbroadcastd m9, [o(pw_m2520_3229x8)]
+ pmulhrsw m18, m11 ; t38a t57a
+ vpbroadcastd m11, [o(pw_2019_3564x8)]
+ pmulhrsw m25, m12 ; t39a t56a
+ vpbroadcastd m12, [o(pw_m1092_3948x8)]
+ pmulhrsw m24, m8 ; t40a t55a
+ vpbroadcastd m8, [o(pw_1285_3889x8)]
+ pmulhrsw m19, m9 ; t41a t54a
+ vpbroadcastd m9, [o(pw_m1842_3659x8)]
+ pmulhrsw m16, m11 ; t42a t53a
+ vpbroadcastd m11, [o(pw_2675_3102x8)]
+ pmulhrsw m27, m12 ; t43a t52a
+ vpbroadcastd m12, [o(pw_m301_4085x8)]
+ pmulhrsw m28, m8 ; t44a t51a
+ pmulhrsw m15, m9 ; t45a t50a
+ pmulhrsw m20, m11 ; t46a t49a
+ pmulhrsw m23, m12 ; t47a t48a
+ psubsw m8, m22, m21 ; t33 t62
+ paddsw m22, m21 ; t32 t63
+ psubsw m21, m29, m14 ; t34 t61
+ paddsw m29, m14 ; t35 t60
+ psubsw m14, m26, m17 ; t37 t58
+ paddsw m26, m17 ; t36 t59
+ psubsw m17, m25, m18 ; t38 t57
+ paddsw m25, m18 ; t39 t56
+ psubsw m18, m24, m19 ; t41 t54
+ paddsw m24, m19 ; t40 t55
+ psubsw m19, m27, m16 ; t42 t53
+ paddsw m27, m16 ; t43 t52
+ psubsw m16, m28, m15 ; t45 t50
+ paddsw m28, m15 ; t44 t51
+ psubsw m15, m23, m20 ; t46 t49
+ paddsw m20, m23 ; t47 t48
+.main_oddhalf2:
+ ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a
+ ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a
+ ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a
+ ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
+ ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a
+ ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
+ ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a
+ ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
+ vpbroadcastd m11, [o(pw_m4017_799)]
+ psubsw m23, m25, m26 ; t36a t59a
+ paddsw m25, m26 ; t39a t56a
+ psubsw m26, m24, m27 ; t43a t52a
+ paddsw m27, m24 ; t40a t55a
+ psubsw m24, m20, m28 ; t44a t51a
+ paddsw m20, m28 ; t47a t48a
+ psubsw m28, m8, m21 ; t34 t61
+ paddsw m8, m21 ; t33 t62
+ psubsw m21, m17, m14 ; t37 t58
+ paddsw m17, m14 ; t38 t57
+ psubsw m14, m18, m19 ; t42 t53
+ paddsw m18, m19 ; t41 t54
+ psubsw m19, m15, m16 ; t45 t50
+ paddsw m15, m16 ; t46 t49
+ psubsw m16, m22, m29 ; t35a t60a
+ paddsw m22, m29 ; t32a t63a
+ ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60
+ ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a
+ ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59
+ ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m2276_3406)]
+ ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52
+ ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a
+ ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51
+ ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ psubsw m29, m22, m25 ; t39 t56
+ paddsw m22, m25 ; t32 t63
+ psubsw m25, m20, m27 ; t40 t55
+ paddsw m20, m27 ; t47 t48
+ psubsw m27, m8, m17 ; t38a t57a
+ paddsw m8, m17 ; t33a t62a
+ psubsw m17, m15, m18 ; t41a t54a
+ paddsw m15, m18 ; t46a t49a
+ paddsw m18, m16, m23 ; t35a t60a
+ psubsw m16, m23 ; t36a t59a
+ psubsw m23, m24, m26 ; t43a t52a
+ paddsw m24, m26 ; t44a t51a
+ paddsw m26, m28, m21 ; t34 t61
+ psubsw m28, m21 ; t37 t58
+ psubsw m21, m19, m14 ; t42 t53
+ paddsw m19, m14 ; t45 t50
+ ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a
+ ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57
+ ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59
+ ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a
+ ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54
+ ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52
+ ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a
+ vbroadcasti32x4 m13, [o(deint_shuf)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ paddsw m14, m22, m20 ; t32a t63a
+ psubsw m22, m20 ; t47a t48a
+ psubsw m20, m8, m15 ; t46 t49
+ paddsw m8, m15 ; t33 t62
+ paddsw m15, m18, m24 ; t35 t60
+ psubsw m18, m24 ; t44 t51
+ psubsw m24, m26, m19 ; t45a t50a
+ paddsw m26, m19 ; t34a t61a
+ REPX {pshufb x, m13}, m14, m8, m15, m26
+ psubsw m19, m29, m25 ; t40 t55
+ paddsw m25, m29 ; t39 t56
+ psubsw m29, m27, m17 ; t41a t54a
+ paddsw m27, m17 ; t38a t57a
+ psubsw m17, m16, m23 ; t43a t52a
+ paddsw m16, m23 ; t36a t59a
+ psubsw m9, m28, m21 ; t42 t53
+ paddsw m28, m21 ; t37 t58
+ REPX {pshufb x, m13}, m25, m27, m16, m28
+ ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48
+ ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a
+ packssdw m21, m22 ; t47 t46a
+ packssdw m13, m23 ; t48 t49a
+ ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a
+ ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50
+ packssdw m20, m18 ; t44a t45
+ packssdw m22, m23 ; t51a t50
+ ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a
+ ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54
+ packssdw m18, m19 ; t40a t41
+ packssdw m24, m23 ; t55a t54
+ ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52
+ ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a
+ packssdw m19, m17 ; t43 t42a
+ packssdw m23, m29 ; t52 t53a
+ punpcklqdq m17, m25, m27 ; t39 t38a
+ punpckhqdq m25, m27 ; t56 t57a
+ punpckhqdq m27, m15, m26 ; t60 t61a
+ punpcklqdq m15, m26 ; t35 t34a
+ punpckhqdq m26, m16, m28 ; t59a t58
+ punpcklqdq m16, m28 ; t36a t37
+ punpckhqdq m28, m14, m8 ; t63a t62
+ punpcklqdq m14, m8 ; t32a t33
+ psubsw m29, m0, m28 ; out63 out62
+ paddsw m0, m28 ; out0 out1
+ psubsw m28, m1, m27 ; out60 out61
+ paddsw m1, m27 ; out3 out2
+ psubsw m27, m2, m26 ; out59 out58
+ paddsw m2, m26 ; out4 out5
+ psubsw m26, m3, m25 ; out56 out57
+ paddsw m3, m25 ; out7 out6
+ psubsw m25, m4, m24 ; out55 out54
+ paddsw m4, m24 ; out8 out9
+ psubsw m24, m5, m23 ; out52 out53
+ paddsw m5, m23 ; out11 out10
+ psubsw m23, m6, m22 ; out51 out50
+ paddsw m6, m22 ; out12 out13
+ psubsw m22, m7, m13 ; out48 out49
+ paddsw m7, m13 ; out15 out14
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 16
+.dconly:
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
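+ ; add the rounded DC value to all 64 pixels of each of the 16 rows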
+.dconly_loop:
+ mova m1, [dstq]
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ WIN64_SPILL_XMM 31
+ mova m19, [o(dup16_perm)]
+ mova m24, [cq+64* 2]
+ mova m28, [cq+64* 6]
+ mova m26, [cq+64* 4]
+ mova m22, [cq+64* 0]
+ mova m23, [cq+64* 1]
+ mova m29, [cq+64* 7]
+ mova m27, [cq+64* 5]
+ mova m25, [cq+64* 3]
+ vpermb m8, m19, m24 ; 4
+ vpermb m1, m19, m28 ; 12
+ vpermb m7, m19, m26 ; 8
+ vpermb m9, m19, m22 ; __ 0
+ vpermb m21, m19, m23 ; 2
+ vpermb m15, m19, m29 ; 14
+ vpermb m18, m19, m27 ; 10
+ vpermb m14, m19, m25 ; 6
+ pslld m9, 16
+ vpord m30, m19, [o(pb_32)] {1to16}
+ REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
+ cmp eobd, 151
+ jb .fast
+ vpermb m0, m19, [cq+64*14] ; 28
+ vpermb m5, m19, [cq+64*10] ; 20
+ vpermb m3, m19, [cq+64*12] ; 24
+ vpermb m6, m19, [cq+64* 8] ; __ 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m17, m19, [cq+64*15] ; 30
+ vpermb m20, m19, [cq+64* 9] ; 18
+ vpermb m16, m19, [cq+64*11] ; 22
+ vpermb m19, m19, [cq+64*13] ; 26
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ vpermb m21, m30, [cq+64*15]
+ vpermb m14, m30, [cq+64* 8]
+ vpermb m17, m30, [cq+64*11]
+ vpermb m18, m30, [cq+64*12]
+ vpermb m19, m30, [cq+64*13]
+ vpermb m16, m30, [cq+64*10]
+ vpermb m15, m30, [cq+64* 9]
+ vpermb m20, m30, [cq+64*14]
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom half is zero
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+.end:
+ mova [cq+64* 8], m4
+ mova [cq+64* 9], m5
+ mova [cq+64*10], m6
+ mova [cq+64*11], m7
+ mova [cq+64*12], m26
+ mova [cq+64*13], m27
+ mova [cq+64*14], m28
+ mova [cq+64*15], m29
+ vpbroadcastd m13, [o(pw_8192)]
+ call .pass1_end
+ call .pass2
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [cq+64* 8]
+ pmulhrsw m1, m13, [cq+64* 9]
+ pmulhrsw m2, m13, [cq+64*10]
+ pmulhrsw m3, m13, [cq+64*11]
+ vpbroadcastd m30, [o(pw_2048)]
+ pmulhrsw m4, m13, m22
+ pmulhrsw m5, m13, m23
+ pmulhrsw m6, m13, m24
+ pmulhrsw m7, m13, m25
+ pmulhrsw m22, m30, m14
+ pmulhrsw m14, m13, m26
+ pmulhrsw m23, m30, m15
+ pmulhrsw m15, m13, m27
+ pmulhrsw m24, m30, m16
+ pmulhrsw m16, m13, m28
+ pmulhrsw m25, m30, m17
+ pmulhrsw m17, m13, m29
+ pmulhrsw m26, m30, m18
+ pmulhrsw m18, m13, [cq+64*12]
+ pmulhrsw m27, m30, m19
+ pmulhrsw m19, m13, [cq+64*13]
+ pmulhrsw m28, m30, m20
+ pmulhrsw m20, m13, [cq+64*14]
+ pmulhrsw m29, m30, m21
+ pmulhrsw m21, m13, [cq+64*15]
+ call .transpose_round
+ call .pass2
+ pxor m10, m10
+ lea r3, [strideq*3]
+%macro IDCT_64x16_END 4 ; row, src[1-2], dst_off
+ mova m9, [dstq+%4]
+%if %1 < 8
+ pmulhrsw m%3, m30, [cq+64*%1]
+%endif
+ pmulhrsw m%2, m30
+ mova [cq+64*%1], m10
+ punpcklbw m8, m9, m10
+ punpckhbw m9, m10
+ paddw m8, m%3
+ paddw m9, m%2
+ packuswb m8, m9
+ mova [dstq+%4], m8
+%if %1 == 3 || %1 == 7 || %1 == 11
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+ IDCT_64x16_END 0, 0, 11, strideq*0
+ IDCT_64x16_END 1, 1, 11, strideq*1
+ IDCT_64x16_END 2, 2, 11, strideq*2
+ IDCT_64x16_END 3, 3, 11, r3
+ IDCT_64x16_END 4, 4, 11, strideq*0
+ IDCT_64x16_END 5, 5, 11, strideq*1
+ IDCT_64x16_END 6, 6, 11, strideq*2
+ IDCT_64x16_END 7, 7, 11, r3
+ IDCT_64x16_END 8, 14, 22, strideq*0
+ IDCT_64x16_END 9, 15, 23, strideq*1
+ IDCT_64x16_END 10, 16, 24, strideq*2
+ IDCT_64x16_END 11, 17, 25, r3
+ IDCT_64x16_END 12, 18, 26, strideq*0
+ IDCT_64x16_END 13, 19, 27, strideq*1
+ IDCT_64x16_END 14, 20, 28, strideq*2
+ IDCT_64x16_END 15, 21, 29, r3
+ RET
+ALIGN function_align
+.pass1_end:
+ mova m4, [cq+64* 0]
+ mova m5, [cq+64* 1]
+ mova m6, [cq+64* 2]
+ mova m7, [cq+64* 3]
+ mova m8, [cq+64* 4]
+ mova m9, [cq+64* 5]
+ mova m11, [cq+64* 6]
+ mova m12, [cq+64* 7]
+ psubsw m29, m4, m21 ; out47 out46
+ paddsw m4, m21 ; out16 out17
+ psubsw m28, m5, m20 ; out44 out45
+ paddsw m5, m20 ; out19 out18
+ REPX {pmulhrsw x, m13}, m0, m1, m2, m3
+ psubsw m27, m6, m19 ; out43 out42
+ paddsw m6, m19 ; out20 out21
+ psubsw m26, m7, m18 ; out40 out41
+ paddsw m7, m18 ; out23 out22
+ pmulhrsw m18, m13, m22
+ pmulhrsw m19, m13, m23
+ pmulhrsw m20, m13, m24
+ pmulhrsw m21, m13, m25
+ paddsw m25, m12, m14 ; out31 out30
+ psubsw m14, m12, m14 ; out32 out33
+ paddsw m24, m11, m15 ; out28 out29
+ psubsw m15, m11, m15 ; out35 out34
+ REPX {pmulhrsw x, m13}, m4, m5, m6, m7
+ paddsw m23, m9, m16 ; out27 out26
+ psubsw m16, m9, m16 ; out36 out37
+ paddsw m22, m8, m17 ; out24 out25
+ psubsw m17, m8, m17 ; out39 out38
+ REPX {pmulhrsw x, m13}, m14, m15, m16, m17
+.transpose_round:
+%macro TRANSPOSE_8x4_PACKED 4
+ punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
+ punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
+ punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1
+ punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3
+ punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1
+ punpcklwd m%3, m%4, m%2 ; 2
+ punpckhwd m%4, m%2 ; 3
+ punpckhwd m%2, m%1, m8 ; 1
+ punpcklwd m%1, m8 ; 0
+%endmacro
+ TRANSPOSE_8x4_PACKED 0, 1, 2, 3
+ TRANSPOSE_8x4_PACKED 18, 19, 20, 21
+ TRANSPOSE_8x4_PACKED 4, 5, 6, 7
+ TRANSPOSE_8x4_PACKED 14, 15, 16, 17
+ vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03
+ vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01
+ vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13
+ vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11
+ vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23
+ vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21
+ vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33
+ vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31
+ vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03
+ vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01
+ vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13
+ vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11
+ vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23
+ vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21
+ vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33
+ vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31
+ ret
+.pass2:
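+ ; gather rows 0-15 from the transposed register pairs before the
+ ; second-pass transform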
+ vshufi32x4 m7, m5, m19, q3131 ; 14
+ vshufi32x4 m5, m19, q2020 ; 10
+ vshufi32x4 m21, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m20, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ vshufi32x4 m6, m8, m2, q3131 ; 12
+ vshufi32x4 m4, m8, m2, q2020 ; 8
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m16, q3131 ; 6
+ vshufi32x4 m1, m16, q2020 ; 2
+ vshufi32x4 m16, m9, m15, q3131 ; 5
+ vshufi32x4 m14, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m11, m17, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob
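+ ; the 64*32-byte stack buffer holds the 32 intermediate idct64 rows
+ ; written by .main_part1 (addressed via r4, starting at rsp)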
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m5, m23, [cq+64*20]
+ pmulhrsw m3, m23, [cq+64*12]
+ pmulhrsw m1, m23, [cq+64* 4]
+ pmulhrsw m7, m23, [cq+64*28]
+ pmulhrsw m2, m23, [cq+64* 8]
+ pmulhrsw m6, m23, [cq+64*24]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m4, m23, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m23, [cq+64* 2]
+ pmulhrsw m21, m23, [cq+64*30]
+ pmulhrsw m18, m23, [cq+64*18]
+ pmulhrsw m17, m23, [cq+64*14]
+ pmulhrsw m16, m23, [cq+64*10]
+ pmulhrsw m19, m23, [cq+64*22]
+ pmulhrsw m20, m23, [cq+64*26]
+ pmulhrsw m15, m23, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ pmulhrsw m22, m23, [cq+64* 1]
+ pmulhrsw m21, m23, [cq+64*31]
+ pmulhrsw m14, m23, [cq+64*17]
+ pmulhrsw m29, m23, [cq+64*15]
+ pmulhrsw m26, m23, [cq+64* 9]
+ pmulhrsw m17, m23, [cq+64*23]
+ pmulhrsw m18, m23, [cq+64*25]
+ pmulhrsw m25, m23, [cq+64* 7]
+ pmulhrsw m24, m23, [cq+64* 5]
+ pmulhrsw m19, m23, [cq+64*27]
+ pmulhrsw m16, m23, [cq+64*21]
+ pmulhrsw m27, m23, [cq+64*11]
+ pmulhrsw m28, m23, [cq+64*13]
+ pmulhrsw m15, m23, [cq+64*19]
+ pmulhrsw m20, m23, [cq+64*29]
+ pmulhrsw m23, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m12, [o(pw_16384)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ REPX {pmulhrsw x, m12}, m7, m0, m2, m4
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h4 e5 f5 g5 h5
+ REPX {pmulhrsw x, m12}, m6, m8, m1, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m12}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ REPX {pmulhrsw x, m12}, m9, m3, m24, m26
+ punpckhqdq m5, m23, m27 ; d01 d09 d17 d25
+ punpcklqdq m23, m27 ; d00 d08 d16 d24
+ punpcklqdq m27, m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m25, m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m26, m9, m24 ; d06 d14 d22 d30
+ punpckhqdq m9, m24 ; d07 d15 d23 d31
+ mova [cq+64* 3], m23
+ mova [cq+64*13], m27
+ mova [cq+64* 7], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova [cq+64* 1], m0
+ mova [cq+64* 9], m7
+ mova [cq+64* 5], m8
+ mova [cq+64*11], m6
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m8, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m8, m19 ; 21
+ paddsw m8, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ psubsw m16, m4, m15 ; 17
+ paddsw m4, m15 ; 14
+ psubsw m15, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ punpcklwd m14, m15, m16
+ punpckhwd m15, m16
+ punpckhwd m16, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j0 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m12}, m28, m2, m8, m27
+ punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m12}, m4, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m15, m16
+ punpckldq m15, m16
+ REPX {pmulhrsw x, m12}, m26, m19, m21, m15
+ punpckhdq m16, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m14, m17
+ punpckldq m14, m17
+ REPX {pmulhrsw x, m12}, m16, m18, m20, m14
+ punpckhqdq m17, m28, m8 ; b03 b11 b19 b27
+ punpcklqdq m28, m8 ; b02 b10 b18 b26
+ punpckhqdq m8, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpcklqdq m27, m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m29, m4, m11 ; b06 b14 b22 b30
+ punpckhqdq m4, m11 ; b07 b15 b23 b31
+ mova [cq+64* 0], m2
+ mova [cq+64* 8], m28
+ mova [cq+64* 4], m27
+ mova [cq+64*10], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m14, m19 ; c01 c09 c17 c25
+ punpcklqdq m14, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m15, m18 ; c05 c13 c21 c29
+ punpcklqdq m15, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m16 ; c07 c15 c23 c31
+ punpcklqdq m21, m16 ; c06 c14 c22 c30
+ mova [cq+64* 2], m14
+ mova [cq+64*12], m20
+ mova [cq+64* 6], m15
+ mova [cq+64*14], m21
+ vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15
+ vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31
+ mov r4, rsp
+ vshufi32x4 m0, m22, m19, q2020 ; 1
+ vshufi32x4 m1, m17, m29, q3131 ; 31
+ vshufi32x4 m2, m14, m26, q2020 ; 17
+ vshufi32x4 m3, m25, m18, q3131 ; 15
+ call .main_part1
+ vshufi32x4 m0, m25, m18, q2020 ; 7
+ vshufi32x4 m1, m14, m26, q3131 ; 25
+ vshufi32x4 m2, m17, m29, q2020 ; 23
+ vshufi32x4 m3, m22, m19, q3131 ; 9
+ call .main_part1
+ vshufi32x4 m0, m24, m21, q2020 ; 5
+ vshufi32x4 m1, m15, m27, q3131 ; 27
+ vshufi32x4 m2, m16, m28, q2020 ; 21
+ vshufi32x4 m3, m23, m20, q3131 ; 11
+ call .main_part1
+ vshufi32x4 m0, m23, m20, q2020 ; 3
+ vshufi32x4 m1, m16, m28, q3131 ; 29
+ vshufi32x4 m2, m15, m27, q2020 ; 19
+ vshufi32x4 m3, m24, m21, q3131 ; 13
+ call .main_part1
+ call .main_part2
+ mova m0, [cq+64* 1] ; a0
+ mova m15, [cq+64* 0] ; b0
+ mova m3, [cq+64* 2] ; c0
+ mova m16, [cq+64* 3] ; d0
+ mova m14, [cq+64* 5] ; a4
+ mova m8, [cq+64* 4] ; b4
+ mova m17, [cq+64* 6] ; c4
+ mova m1, [cq+64* 7] ; d4
+ vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08
+ vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08
+ vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12
+ vshufi32x4 m1, m0, m3, q3131 ; 8
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m2, m15, q3131 ; 24
+ vshufi32x4 m2, m15, q2020 ; 16
+ vshufi32x4 m15, m14, m17, q3131 ; 12
+ vshufi32x4 m14, m17, q2020 ; 4
+ vshufi32x4 m17, m16, m8, q3131 ; 28
+ vshufi32x4 m16, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64*12]
+ mova m11, [cq+64*10]
+ mova m12, [cq+64*14]
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 9]
+ mova m27, [cq+64*13]
+ mova m23, [cq+64*11]
+ mova m24, [cq+64*15]
+ vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10
+ vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10
+ vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14
+ vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14
+ vshufi32x4 m28, m26, m8, q3131 ; 26
+ vshufi32x4 m26, m8, q2020 ; 18
+ vshufi32x4 m24, m22, m9, q3131 ; 10
+ vshufi32x4 m22, m9, q2020 ; 2
+ vshufi32x4 m29, m27, m11, q3131 ; 30
+ vshufi32x4 m27, m11, q2020 ; 22
+ vshufi32x4 m25, m23, m12, q3131 ; 14
+ vshufi32x4 m23, m12, q2020 ; 6
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ jmp .end
+.fast: ; bottom/right halves are zero
+ pmulhrsw ym9, ym23, [cq+64* 0]
+ pmulhrsw ym6, ym23, [cq+64* 8]
+ mova m14, [o(dup16_perm)]
+ pmulhrsw ym8, ym23, [cq+64* 2]
+ pmulhrsw xm0, xm23, [cq+64*14]
+ pmulhrsw xm5, xm23, [cq+64*10]
+ pmulhrsw ym1, ym23, [cq+64* 6]
+ pmulhrsw ym7, ym23, [cq+64* 4]
+ pmulhrsw xm3, xm23, [cq+64*12]
+ pmovzxwd m9, ym9
+ pmovzxwd m6, ym6
+ vpermb m8, m14, m8
+ punpcklwd xm0, xm0
+ vpermb ym5, ym14, ym5
+ vpermb m1, m14, m1
+ vpermb m7, m14, m7
+ punpcklwd xm3, xm3
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpmulhrsw ym21, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which
+ {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to
+ {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements
+ {evex}vpmulhrsw ym18, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm16, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm19, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m21, m14, m21
+ punpcklwd xm17, xm17
+ vpermb ym20, ym14, ym20
+ vpermb m15, m14, m15
+ vpermb m18, m14, m18
+ vpermb ym16, ym14, ym16
+ punpcklwd xm19, xm19
+ vpermb m14, m14, m23
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m16, m0, m3, q2020 ; 0
+ vshufi32x4 m26, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m14, m2, q2020 ; 1
+ vshufi32x4 m14, m2, q3131 ; 5
+ vshufi32x4 m3, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m27, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ vshufi32x4 m28, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m22, m1, m18, q2020 ; 2
+ vshufi32x4 m23, m1, m18, q3131 ; 6
+ vshufi32x4 m24, m5, m15, q2020 ; 10
+ vshufi32x4 m25, m5, m15, q3131 ; 14
+ vshufi32x4 m15, m21, m4, q3131 ; 12
+ vshufi32x4 m21, m21, m4, q2020 ; 8
+ mov r4, rsp
+ call .main_part1_fast
+ mova m0, m17
+ mova m3, m28
+ call .main_part1_fast
+ mova m0, m14
+ mova m3, m19
+ call .main_part1_fast
+ mova m0, m27
+ mova m3, m20
+ call .main_part1_fast
+ call .main_part2
+ mova m0, m16
+ mova m1, m21
+ mova m14, m26
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [cq+64*14], m21
+ mova [cq+64* 0], m14
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64* 4], m16
+ mova [cq+64* 2], m15
+ mova [cq+64*12], m20
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r5, [r4+strideq] ; stride*4
+ lea r3, [dstq+r4*8]
+ lea r6, [strideq+r5*8] ; stride*33
+ lea r8, [r4+r5*8] ; stride*35
+ add r3, r5 ; dst+stride*28
+ lea r7, [r6+strideq] ; stride*34
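+ ; IDCT_32x64_END combines the idct32 outputs (in registers, with rows
+ ; 8-15 spilled to cq) with the mirrored idct64 half rows stored on the
+ ; stack by .main_part1/.main_part2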
+%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
+%if %2 < 8
+ paddsw m10, m%2, m%1
+ psubsw m11, m%2, m%1
+%else
+ mova m11, [cq+64*(%2*2-16)]
+ paddsw m10, m11, m%1
+ psubsw m11, m%1
+%endif
+ mova m9, [rsp+64*(31-%2)]
+ mova m%1, [rsp+64*%2]
+ paddsw m8, m10, m9
+ psubsw m10, m9
+ paddsw m9, m11, m%1
+ pmovzxbw m0, [dstq+%3]
+ psubsw m11, m%1
+ pmovzxbw m%1, [r3 +%4]
+ REPX {pmulhrsw x, m12}, m8, m10, m9, m11
+ paddw m8, m0
+ pmovzxbw m0, [r3 +%5]
+ paddw m10, m%1
+ pmovzxbw m%1, [dstq+%6]
+ paddw m9, m0
+ paddw m11, m%1
+%if %2 >= 8
+%if %2 == 8
+ pxor m1, m1
+%endif
+ mova [cq+64*(%2*2-16)], m1
+ mova [cq+64*(%2*2-15)], m1
+%endif
+ packuswb m8, m10
+ packuswb m9, m11
+ vpermq m8, m13, m8
+ vpermq m9, m13, m9
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+ mova [r3 +%5], ym9
+ vextracti32x8 [dstq+%6], m9, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+ IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align ; bottom three-quarters are zero
+.main_part1_fast:
+ vpbroadcastd m1, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m2, [o(idct64_mul+4*6)]
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m1, m0 ; t63a
+ pmulhrsw m0, m8 ; t32a
+ pmulhrsw m2, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ mova m8, m0
+ mova m7, m1
+ mova m6, m3
+ mova m5, m2
+ jmp .main_part1b
+.main_part1:
+ ; idct64 steps 1-5:
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
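+ ; each call consumes one 4-row input group, stores 8 intermediate rows
+ ; to [r4] (which then advances by 64*8), and moves the constant pointer
+ ; r5 on to the next idct64_mul group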
+ vpbroadcastd m7, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m6, [o(idct64_mul+4*2)]
+ vpbroadcastd m9, [o(idct64_mul+4*3)]
+ pmulhrsw m7, m0 ; t63a
+ vpbroadcastd m5, [o(idct64_mul+4*4)]
+ pmulhrsw m0, m8 ; t32a
+ vpbroadcastd m8, [o(idct64_mul+4*5)]
+ pmulhrsw m6, m1 ; t62a
+ vpbroadcastd m4, [o(idct64_mul+4*6)]
+ pmulhrsw m1, m9 ; t33a
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m5, m2 ; t61a
+ pmulhrsw m2, m8 ; t34a
+ pmulhrsw m4, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ psubsw m8, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m7, m6 ; t62
+ paddsw m7, m6 ; t63
+ psubsw m6, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m4, m5 ; t61
+ paddsw m5, m4 ; t60
+.main_part1b:
+ vpbroadcastd m11, [o(idct64_mul+4*8)]
+ vpbroadcastd m12, [o(idct64_mul+4*9)]
+ ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
+ vpbroadcastd m11, [o(idct64_mul+4*10)]
+ ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
+ vpbroadcastd m11, [o(idct64_mul+4*11)]
+ vpbroadcastd m12, [o(idct64_mul+4*12)]
+ psubsw m4, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m7, m5 ; t60a
+ paddsw m7, m5 ; t63a
+ psubsw m5, m1, m2 ; t34
+ paddsw m1, m2 ; t33
+ psubsw m2, m8, m6 ; t61
+ paddsw m6, m8 ; t62
+ add r5, 4*13
+ ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60
+ ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
+ mova [r4+64*0], m0
+ mova [r4+64*7], m7
+ mova [r4+64*1], m1
+ mova [r4+64*6], m6
+ mova [r4+64*3], m3
+ mova [r4+64*4], m4
+ mova [r4+64*2], m2
+ mova [r4+64*5], m5
+ add r4, 64*8
+ ret
+.main_part2:
+ vpbroadcastd m11, [o(pw_1567_3784 -16*13)]
+ vpbroadcastd m12, [o(pw_m3784_1567 -16*13)]
+ lea r6, [r4+64*7]
+ vpbroadcastd m17, [o(pw_m1567_m3784-16*13)]
+ vpbroadcastd m18, [o(pw_2896_2896 -16*13)]
+ vpbroadcastd m19, [o(pw_m2896_2896 -16*13)]
+ sub r5, 16*13
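+ ; walk the stored rows from both ends (r4 ascending, r6 descending),
+ ; finishing the idct64 butterflies in place over four 8-row iterations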
+.main_part2_loop:
+ mova m0, [r4-64*32] ; t32a
+ mova m1, [r6-64*24] ; t39a
+ mova m2, [r6-64*32] ; t63a
+ mova m3, [r4-64*24] ; t56a
+ mova m4, [r4-64*16] ; t40a
+ mova m5, [r6-64* 8] ; t47a
+ mova m6, [r6-64*16] ; t55a
+ mova m7, [r4-64* 8] ; t48a
+ psubsw m8, m0, m1 ; t39
+ paddsw m0, m1 ; t32
+ psubsw m1, m2, m3 ; t56
+ paddsw m2, m3 ; t63
+ psubsw m3, m5, m4 ; t40
+ paddsw m5, m4 ; t47
+ psubsw m4, m7, m6 ; t55
+ paddsw m7, m6 ; t48
+ ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
+ ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
+ psubsw m6, m2, m7 ; t48a
+ paddsw m2, m7 ; t63a
+ psubsw m7, m0, m5 ; t47a
+ paddsw m0, m5 ; t32a
+ psubsw m5, m8, m3 ; t55
+ paddsw m8, m3 ; t56
+ psubsw m3, m1, m4 ; t40
+ paddsw m1, m4 ; t39
+ ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48
+ ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
+ mova [r6-64* 8], m2
+ mova [r4-64*32], m0
+ mova [r4-64* 8], m8
+ mova [r6-64*32], m1
+ mova [r6-64*24], m6
+ mova [r4-64*16], m7
+ mova [r4-64*24], m5
+ mova [r6-64*16], m3
+ add r4, 64
+ sub r6, 64
+ cmp r4, r6
+ jb .main_part2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m0, m23, [cq+64* 1]
+ pmulhrsw m1, m23, [cq+64*31]
+ pmulhrsw m2, m23, [cq+64*17]
+ pmulhrsw m3, m23, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 7]
+ pmulhrsw m1, m23, [cq+64*25]
+ pmulhrsw m2, m23, [cq+64*23]
+ pmulhrsw m3, m23, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 5]
+ pmulhrsw m1, m23, [cq+64*27]
+ pmulhrsw m2, m23, [cq+64*21]
+ pmulhrsw m3, m23, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 3]
+ pmulhrsw m1, m23, [cq+64*29]
+ pmulhrsw m2, m23, [cq+64*19]
+ pmulhrsw m3, m23, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ pmulhrsw m3, m23, [cq+64*24]
+ pmulhrsw m1, m23, [cq+64* 8]
+ pmulhrsw m2, m23, [cq+64*16]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m14, m23, [cq+64* 4]
+ pmulhrsw m17, m23, [cq+64*28]
+ pmulhrsw m16, m23, [cq+64*20]
+ pmulhrsw m15, m23, [cq+64*12]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ pmulhrsw m22, m23, [cq+64* 2]
+ pmulhrsw m29, m23, [cq+64*30]
+ pmulhrsw m26, m23, [cq+64*18]
+ pmulhrsw m25, m23, [cq+64*14]
+ pmulhrsw m24, m23, [cq+64*10]
+ pmulhrsw m27, m23, [cq+64*22]
+ pmulhrsw m28, m23, [cq+64*26]
+ pmulhrsw m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_16384)]
+ call .pass1_end_part1
+ mova [cq+64*16], m1
+ mova [cq+64*17], m3
+ mova [cq+64*18], m5
+ mova [cq+64*19], m7
+ mova [cq+64*24], m23
+ mova [cq+64*25], m25
+ mova [cq+64*26], m27
+ mova [cq+64*27], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
+ call .pass1_end_part2
+ mova [cq+64*20], m15
+ mova [cq+64*21], m17
+ mova [cq+64*22], m19
+ mova [cq+64*23], m21
+ mova [cq+64*28], m1
+ mova [cq+64*29], m3
+ mova [cq+64*30], m5
+ mova [cq+64*31], m7
+ REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
+ REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6
+ vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03
+ vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63
+ vshufi32x4 m2, m3, m15, q3131 ; 8
+ vshufi32x4 m0, m3, m15, q2020 ; 0
+ vshufi32x4 m6, m23, m22, q3131 ; 24
+ vshufi32x4 m4, m23, m22, q2020 ; 16
+ vshufi32x4 m3, m1, m18, q3131 ; 12
+ vshufi32x4 m1, m18, q2020 ; 4
+ vshufi32x4 m7, m27, m26, q3131 ; 28
+ vshufi32x4 m5, m27, m26, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m17, q3131 ; 10
+ vshufi32x4 m14, m17, q2020 ; 2
+ vshufi32x4 m17, m19, m20, q3131 ; 14
+ vshufi32x4 m15, m19, m20, q2020 ; 6
+ vshufi32x4 m20, m25, m24, q3131 ; 26
+ vshufi32x4 m18, m25, m24, q2020 ; 18
+ vshufi32x4 m21, m29, m28, q3131 ; 30
+ vshufi32x4 m19, m29, m28, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ pmulhrsw m22, m13, [cq+64*16] ; a1
+ pmulhrsw m23, m13, [cq+64*20] ; c1
+ pmulhrsw m24, m13, [cq+64*24] ; e1
+ pmulhrsw m25, m13, [cq+64*28] ; g1
+ pmulhrsw m26, m13, [cq+64*17] ; a3
+ pmulhrsw m27, m13, [cq+64*21] ; c3
+ pmulhrsw m28, m13, [cq+64*25] ; e3
+ pmulhrsw m29, m13, [cq+64*29] ; g3
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ pmulhrsw m14, m13, [cq+64*18] ; a5
+ pmulhrsw m15, m13, [cq+64*22] ; c5
+ pmulhrsw m16, m13, [cq+64*26] ; e5
+ pmulhrsw m17, m13, [cq+64*30] ; g5
+ pmulhrsw m18, m13, [cq+64*19] ; a7
+ pmulhrsw m19, m13, [cq+64*23] ; c7
+ pmulhrsw m20, m13, [cq+64*27] ; e7
+ pmulhrsw m21, m13, [cq+64*31] ; g7
+ vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11
+ vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13
+ vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11
+ vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13
+ vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31
+ vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33
+ vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31
+ vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51
+ vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53
+ vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51
+ vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53
+ vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71
+ vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73
+ vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71
+ vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73
+ vshufi32x4 m27, m23, m11, q3131 ; 11 m27
+ vshufi32x4 m23, m11, q2020 ; 3 m23
+ vshufi32x4 m19, m26, m28, q3131 ; 27 m19
+ vshufi32x4 m15, m26, m28, q2020 ; 19 m15
+ vshufi32x4 m29, m25, m17, q3131 ; 15 m29
+ vshufi32x4 m25, m17, q2020 ; 7 m25
+ vshufi32x4 m21, m18, m20, q3131 ; 31 m21
+ vshufi32x4 m17, m18, m20, q2020 ; 23 m17
+ vshufi32x4 m20, m14, m16, q3131 ; 29 m20
+ vshufi32x4 m16, m14, m16, q2020 ; 21 m16
+ vshufi32x4 m18, m22, m24, q3131 ; 25 m18
+ vshufi32x4 m14, m22, m24, q2020 ; 17 m14
+ vshufi32x4 m26, m8, m9, q3131 ; 9 m26
+ vshufi32x4 m22, m8, m9, q2020 ; 1 m22
+ vshufi32x4 m28, m12, m13, q3131 ; 13 m28
+ vshufi32x4 m24, m12, m13, q2020 ; 5 m24
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m13, [o(pw_16384)]
+ pmulhrsw m0, m13, [r4-64*21]
+ pmulhrsw m1, m13, [r4-64*22]
+ pmulhrsw m2, m13, [r4-64*23]
+ pmulhrsw m3, m13, [r4-64*24]
+ pmulhrsw m4, m13, [r4-64*25]
+ pmulhrsw m5, m13, [r4-64*26]
+ pmulhrsw m6, m13, [r4-64*27]
+ pmulhrsw m7, m13, [r4-64*28]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4-64*12]
+ pmulhrsw m15, m13, [r4-64*11]
+ pmulhrsw m16, m13, [r4-64*10]
+ pmulhrsw m17, m13, [r4-64* 9]
+ pmulhrsw m18, m13, [r4-64* 8]
+ pmulhrsw m19, m13, [r4-64* 7]
+ pmulhrsw m20, m13, [r4-64* 6]
+ pmulhrsw m21, m13, [r4-64* 5]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call .transpose_2x8x8_lo
+ mova [r4-64*12], m1
+ mova [r4-64*11], m3
+ mova [r4-64*10], m5
+ mova [r4-64* 9], m7
+ mova [r4-64* 8], m15
+ mova [r4-64* 7], m17
+ mova [r4-64* 6], m19
+ mova [r4-64* 5], m21
+ vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01
+ vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03
+ vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21
+ vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23
+ vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41
+ vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43
+ vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61
+ vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63
+ pmulhrsw m0, m13, [r4-64*20]
+ pmulhrsw m1, m13, [r4-64*19]
+ pmulhrsw m2, m13, [r4-64*18]
+ pmulhrsw m3, m13, [r4-64*17]
+ pmulhrsw m4, m13, [r4-64*16]
+ pmulhrsw m5, m13, [r4-64*15]
+ pmulhrsw m6, m13, [r4-64*14]
+ pmulhrsw m7, m13, [r4-64*13]
+ pmulhrsw m14, m13, [r4-64*29]
+ pmulhrsw m15, m13, [r4-64*30]
+ pmulhrsw m16, m13, [r4-64*31]
+ pmulhrsw m17, m13, [r4-64*32]
+ pmulhrsw m18, m13, [r4-64*33]
+ pmulhrsw m19, m13, [r4-64*34]
+ pmulhrsw m20, m13, [r4-64*35]
+ pmulhrsw m21, m13, [r4-64*36]
+ call .transpose_2x8x8_lo
+ mova [r4-64*20], m1
+ mova [r4-64*19], m3
+ mova [r4-64*18], m5
+ mova [r4-64*17], m7
+ mova [r4-64*16], m15
+ mova [r4-64*15], m17
+ mova [r4-64*14], m19
+ mova [r4-64*13], m21
+ vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41
+ vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43
+ vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03
+ vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01
+ vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21
+ vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23
+ vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61
+ vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63
+ vshufi32x4 m2, m0, m22, q3131 ; 8
+ vshufi32x4 m0, m22, q2020 ; 0
+ vshufi32x4 m3, m1, m26, q3131 ; 12
+ vshufi32x4 m1, m26, q2020 ; 4
+ vshufi32x4 m6, m4, m23, q3131 ; 24
+ vshufi32x4 m4, m23, q2020 ; 16
+ vshufi32x4 m7, m5, m27, q3131 ; 28
+ vshufi32x4 m5, m27, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m24, q3131 ; 10
+ vshufi32x4 m14, m24, q2020 ; 2
+ vshufi32x4 m17, m15, m28, q3131 ; 14
+ vshufi32x4 m15, m28, q2020 ; 6
+ vshufi32x4 m20, m18, m25, q3131 ; 26
+ vshufi32x4 m18, m25, q2020 ; 18
+ vshufi32x4 m21, m19, m29, q3131 ; 30
+ vshufi32x4 m19, m29, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m22, [r4-64*20]
+ mova m26, [r4-64*16]
+ mova m23, [r4-64*19]
+ mova m27, [r4-64*15]
+ mova m24, [r4-64*18]
+ mova m28, [r4-64*14]
+ mova m25, [r4-64*17]
+ mova m29, [r4-64*13]
+ mova [r4-64*20], m14
+ mova [r4-64*19], m15
+ mova [r4-64*18], m16
+ mova [r4-64*17], m17
+ mova [r4-64*16], m18
+ mova [r4-64*15], m19
+ mova [r4-64*14], m20
+ mova [r4-64*13], m21
+ mova m19, [r4-64*12]
+ mova m11, [r4-64* 8]
+ mova m20, [r4-64*11]
+ mova m12, [r4-64* 7]
+ mova m21, [r4-64*10]
+ mova m8, [r4-64* 6]
+ mova m9, [r4-64* 9]
+ mova m18, [r4-64* 5]
+ vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13
+ vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11
+ vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33
+ vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31
+ vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53
+ vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51
+ vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73
+ vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71
+ vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11
+ vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13
+ vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31
+ vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33
+ vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51
+ vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53
+ vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71
+ vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73
+ vshufi32x4 m26, m22, m27, q3131 ; 9
+ vshufi32x4 m22, m27, q2020 ; 1
+ vshufi32x4 m27, m23, m28, q3131 ; 11
+ vshufi32x4 m23, m28, q2020 ; 3
+ vshufi32x4 m28, m24, m29, q3131 ; 13
+ vshufi32x4 m24, m29, q2020 ; 5
+ vshufi32x4 m29, m25, m8, q3131 ; 15
+ vshufi32x4 m25, m8, q2020 ; 7
+ vshufi32x4 m18, m14, m19, q3131 ; 25
+ vshufi32x4 m14, m19, q2020 ; 17
+ vshufi32x4 m19, m15, m20, q3131 ; 27
+ vshufi32x4 m15, m20, q2020 ; 19
+ vshufi32x4 m20, m16, m21, q3131 ; 29
+ vshufi32x4 m16, m21, q2020 ; 21
+ vshufi32x4 m21, m17, m9, q3131 ; 31
+ vshufi32x4 m17, m9, q2020 ; 23
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ {evex}vpmulhrsw ym8, ym23, [cq+64* 4]
+ {evex}vpmulhrsw xm1, xm23, [cq+64*12]
+ mova m28, [o(dup16_perm)]
+ {evex}vpmulhrsw ym7, ym23, [cq+64* 8]
+ vpmulhrsw ym22, ym23, [cq+64* 0]
+ vpermb m8, m28, m8
+ vpermb ym1, ym28, ym1
+ vpermb m7, m28, m7
+ pmovzxwd m9, ym22
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ {evex}vpmulhrsw ym21, ym23, [cq+64* 2]
+ {evex}vpmulhrsw xm15, xm23, [cq+64*14]
+ {evex}vpmulhrsw xm18, xm23, [cq+64*10]
+ {evex}vpmulhrsw ym14, ym23, [cq+64* 6]
+ vpermb m21, m28, m21
+ punpcklwd xm15, xm15
+ vpermb ym18, ym28, ym18
+ vpermb m14, m28, m14
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpmulhrsw ym22, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm29, xm23, [cq+64*15]
+ {evex}vpmulhrsw xm26, xm23, [cq+64* 9]
+ {evex}vpmulhrsw ym25, ym23, [cq+64* 7]
+ {evex}vpmulhrsw ym24, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm27, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm8, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m22, m28, m22
+ punpcklwd xm29, xm29
+ vpermb ym26, ym28, ym26
+ vpermb m25, m28, m25
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ REPX {vpermb x, m28, x}, m24, m27, m23
+ punpcklwd xm28, xm8, xm8
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ mov r4, rsp
+ vpbroadcastd m13, [o(pw_16384)]
+ mova [r4+64*16], m4
+ mova [r4+64*17], m5
+ mova [r4+64*18], m6
+ mova [r4+64*19], m7
+ mova [r4+64*28], m26
+ mova [r4+64*29], m27
+ mova [r4+64*30], m28
+ mova [r4+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [r4+64*20], m22
+ mova [r4+64*21], m23
+ mova [r4+64*22], m24
+ mova [r4+64*23], m25
+ mova [r4+64*24], m26
+ mova [r4+64*25], m27
+ mova [r4+64*26], m28
+ mova [r4+64*27], m29
+ call .pass2_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [r4+64*16]
+ pmulhrsw m1, m13, [r4+64*17]
+ pmulhrsw m2, m13, [r4+64*18]
+ pmulhrsw m3, m13, [r4+64*19]
+ pmulhrsw m4, m13, [r4+64*20]
+ pmulhrsw m5, m13, [r4+64*21]
+ pmulhrsw m6, m13, [r4+64*22]
+ pmulhrsw m7, m13, [r4+64*23]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4+64*24]
+ pmulhrsw m15, m13, [r4+64*25]
+ pmulhrsw m16, m13, [r4+64*26]
+ pmulhrsw m17, m13, [r4+64*27]
+ pmulhrsw m18, m13, [r4+64*28]
+ pmulhrsw m19, m13, [r4+64*29]
+ pmulhrsw m20, m13, [r4+64*30]
+ pmulhrsw m21, m13, [r4+64*31]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ call .pass2_fast
+ mova [r4+64*16], m14
+ mova [r4+64*17], m15
+ mova [r4+64*18], m16
+ mova [r4+64*19], m17
+ mova [r4+64*20], m18
+ mova [r4+64*21], m19
+ mova [r4+64*22], m20
+ mova [r4+64*23], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+.end:
+ vpbroadcastd m13, [o(pw_2048)]
+ lea r5, [strideq*3]
+ pxor m12, m12
+ lea r3, [dstq+r5*8]
+ lea r6, [strideq+r5] ; stride*4
+ add r3, r6 ; dst+stride*28
+%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
+ mova m11, [cq+64*( %3)] ; 0
+ mova m9, [cq+64*(31-%3)] ; 31
+%if %3 >= 8
+ mova m%1, [rsp+64*(%1+16)]
+%endif
+ mova m10, [dstq+%4]
+ paddsw m8, m11, m9
+ psubsw m11, m9
+ paddsw m9, m%1, m%2
+ psubsw m%1, m%2
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ pmulhrsw m8, m13
+ pmulhrsw m9, m13
+ paddw m8, m%2
+ paddw m9, m10
+ mova m10, [r3+%5]
+ pmulhrsw m11, m13
+ pmulhrsw m%1, m13
+ mova [cq+64*( %3)], m12
+ mova [cq+64*(31-%3)], m12
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ packuswb m8, m9
+ paddw m11, m%2
+ paddw m%1, m10
+ packuswb m11, m%1
+ mova [dstq+%4], m8
+ mova [r3 +%5], m11
+%if %3 == 3 || %3 == 7 || %3 == 11
+ add dstq, r6
+ sub r3, r6
+%endif
+%endmacro
+ IDCT_64x32_END 0, 29, 0, strideq*0, r5
+ IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2
+ IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1
+ IDCT_64x32_END 3, 26, 3, r5 , strideq*0
+ IDCT_64x32_END 4, 25, 4, strideq*0, r5
+ IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2
+ IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1
+ IDCT_64x32_END 7, 22, 7, r5 , strideq*0
+ IDCT_64x32_END 0, 21, 8, strideq*0, r5
+ IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2
+ IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1
+ IDCT_64x32_END 3, 18, 11, r5 , strideq*0
+ IDCT_64x32_END 4, 17, 12, strideq*0, r5
+ IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2
+ IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1
+ IDCT_64x32_END 7, 14, 15, r5 , strideq*0
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 32
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
+ALIGN function_align
+.pass1_end_part1:
+%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
+%if %1 != %3
+ mova m%1, [cq+64*%1]
+%endif
+ mova m9, [r4+64*(%3-36)] ; idct64 32+n
+ mova m11, [r4+64*(-5-%3)] ; idct64 63-n
+ psubsw m8, m%1, m%2 ; idct32 31-n
+ paddsw m%1, m%2 ; idct32 0+n
+%if %1 == %3
+ psubsw m%2, m8, m9 ; out 32+n e
+ paddsw m8, m9 ; out 31-n d
+ psubsw m9, m%1, m11 ; out 63-n h
+ paddsw m%1, m11 ; out 0+n a
+%else
+ paddsw m%2, m8, m9 ; out 23-n c
+ psubsw m8, m9 ; out 40+n f
+ paddsw m9, m%1, m11 ; out 8+n b
+ psubsw m%1, m11 ; out 55-n g
+%endif
+ mova [r4+64*(%3-36)], m8
+ mova [r4+64*(-5-%3)], m9
+%endmacro
+ IDCT_64x32_PASS1_END 0, 29, 0
+ IDCT_64x32_PASS1_END 1, 28, 1
+ IDCT_64x32_PASS1_END 2, 27, 2
+ IDCT_64x32_PASS1_END 3, 26, 3
+ IDCT_64x32_PASS1_END 4, 25, 4
+ IDCT_64x32_PASS1_END 5, 24, 5
+ IDCT_64x32_PASS1_END 6, 23, 6
+ IDCT_64x32_PASS1_END 7, 22, 7
+.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
+ punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
+ punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
+ punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckhqdq m23, m22, m27 ; 1 23
+ punpcklqdq m22, m27 ; 0 22
+ punpckhqdq m27, m26, m28 ; 5 27
+ punpcklqdq m26, m28 ; 4 26
+ punpcklqdq m28, m29, m25 ; 6 28
+ punpckhqdq m29, m25 ; 7 29
+ punpckhqdq m25, m24, m8 ; 3 25
+ punpcklqdq m24, m8 ; 2 24
+.transpose_8x8:
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+.pass1_end_part2:
+ IDCT_64x32_PASS1_END 0, 21, 8
+ IDCT_64x32_PASS1_END 1, 20, 9
+ IDCT_64x32_PASS1_END 2, 19, 10
+ IDCT_64x32_PASS1_END 3, 18, 11
+ IDCT_64x32_PASS1_END 4, 17, 12
+ IDCT_64x32_PASS1_END 5, 16, 13
+ IDCT_64x32_PASS1_END 6, 15, 14
+ IDCT_64x32_PASS1_END 7, 14, 15
+.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
+ punpcklwd m8, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m7, m6
+ punpckhwd m7, m6
+ punpcklwd m6, m5, m4
+ punpckhwd m5, m4
+ punpckldq m4, m7, m5
+ punpckhdq m7, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m6
+ punpckldq m0, m6
+ punpckldq m6, m3, m1
+ punpckhdq m3, m1
+ punpckhqdq m1, m0, m5
+ punpcklqdq m0, m5
+ punpckhqdq m5, m4, m6
+ punpcklqdq m4, m6
+ punpcklqdq m6, m7, m3
+ punpckhqdq m7, m3
+ punpckhqdq m3, m2, m8
+ punpcklqdq m2, m8
+ punpckhwd m8, m18, m19
+ punpcklwd m18, m19
+ punpckhwd m19, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m20, m21
+ punpcklwd m20, m21
+ punpckhwd m21, m16, m17
+ punpcklwd m16, m17
+ punpckhdq m17, m14, m16
+ punpckldq m14, m16
+ punpckldq m16, m18, m20
+ punpckhdq m18, m20
+ punpckhdq m20, m19, m21
+ punpckldq m19, m21
+ punpckldq m21, m8, m15
+ punpckhdq m8, m15
+ punpckhqdq m15, m14, m16
+ punpcklqdq m14, m16
+ punpcklqdq m16, m17, m18
+ punpckhqdq m17, m18
+ punpcklqdq m18, m19, m21
+ punpckhqdq m19, m21
+ punpckhqdq m21, m20, m8
+ punpcklqdq m20, m8
+ ret
+.pass2_fast:
+ vshufi32x4 m24, m9, m15, q3131 ; 5
+ vshufi32x4 m22, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m1, m16, q3131 ; 6
+ vshufi32x4 m14, m1, m16, q2020 ; 2
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m8, m2, q3131 ; 12
+ vshufi32x4 m2, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m11, m17, q3131 ; 7
+ vshufi32x4 m23, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m5, m19, q3131 ; 14
+ vshufi32x4 m16, m5, m19, q2020 ; 10
+ vshufi32x4 m29, m6, m20, q3131 ; 15
+ vshufi32x4 m27, m6, m20, q2020 ; 11
+ vshufi32x4 m28, m4, m18, q3131 ; 13
+ vshufi32x4 m26, m4, m18, q2020 ; 9
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ mova m14, [cq+64* 4]
+ mova m15, [cq+64*12]
+ mova m16, [cq+64*20]
+ mova m17, [cq+64*28]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m22, [cq+64* 2]
+ mova m29, [cq+64*30]
+ mova m26, [cq+64*18]
+ mova m25, [cq+64*14]
+ mova m24, [cq+64*10]
+ mova m27, [cq+64*22]
+ mova m28, [cq+64*26]
+ mova m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
+ mova [r4+64*36], m1
+ mova [r4+64*37], m3
+ mova [r4+64*38], m5
+ mova [r4+64*39], m7
+ mova [r4+64*44], m23
+ mova [r4+64*45], m25
+ mova [r4+64*46], m27
+ mova [r4+64*47], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
+ lea r6, [r4-64*4]
+ add r4, 64*28
+ call .pass2_end
+ mov r4, rsp
+ mova m0, [r4+64*23]
+ mova m1, [r4+64*22]
+ mova m2, [r4+64*21]
+ mova m3, [r4+64*20]
+ mova m4, [r4+64*19]
+ mova m5, [r4+64*18]
+ mova m6, [r4+64*17]
+ mova m7, [r4+64*16]
+ mova m22, [r4+64*15]
+ mova m23, [r4+64*14]
+ mova m24, [r4+64*13]
+ mova m25, [r4+64*12]
+ mova m26, [r4+64*11]
+ mova m27, [r4+64*10]
+ mova m28, [r4+64* 9]
+ mova m29, [r4+64* 8]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r4+64* 8], m1
+ mova [r4+64* 9], m3
+ mova [r4+64*10], m5
+ mova [r4+64*11], m7
+ mova [r4+64*16], m23
+ mova [r4+64*17], m25
+ mova [r4+64*18], m27
+ mova [r4+64*19], m29
+ pmulhrsw m23, m13, m0 ; b0
+ pmulhrsw m25, m13, m2 ; b2
+ pmulhrsw m27, m13, m4 ; b4
+ pmulhrsw m29, m13, m6 ; b6
+ mova m0, [r4+64*31]
+ mova m1, [r4+64*30]
+ mova m2, [r4+64*29]
+ mova m3, [r4+64*28]
+ mova m4, [r4+64*27]
+ mova m5, [r4+64*26]
+ mova m6, [r4+64*25]
+ mova m7, [r4+64*24]
+ mova m14, [r4+64* 7]
+ mova m15, [r4+64* 6]
+ mova m16, [r4+64* 5]
+ mova m17, [r4+64* 4]
+ mova m18, [r4+64* 3]
+ mova m19, [r4+64* 2]
+ mova m20, [r4+64* 1]
+ mova m21, [r4+64* 0]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
+ mov r6, cq
+ call .pass2_end
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m28, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ vpermb m8, m28, [cq+64* 4]
+ vpermb ym1, ym28, [cq+64*12]
+ vpermb m7, m28, [cq+64* 8]
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ vpermb m21, m28, [cq+64* 2]
+ vpermb ym15, ym28, [cq+64*14]
+ vpermb ym18, ym28, [cq+64*10]
+ vpermb m14, m28, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpermb m22, m28, [cq+64* 1]
+ vpermb ym29, ym28, [cq+64*15]
+ vpermb ym26, ym28, [cq+64* 9]
+ vpermb m25, m28, [cq+64* 7]
+ vpermb m24, m28, [cq+64* 5]
+ vpermb ym27, ym28, [cq+64*11]
+ vpermb m23, m28, [cq+64* 3]
+ vpermb ym28, ym28, [cq+64*13]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [cq+64*16], m4
+ mova [cq+64*17], m5
+ mova [cq+64*18], m6
+ mova [cq+64*19], m7
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [cq+64*20], m22
+ mova [cq+64*21], m23
+ mova [cq+64*22], m24
+ mova [cq+64*23], m25
+ mova [cq+64*24], m26
+ mova [cq+64*25], m27
+ mova [cq+64*26], m28
+ mova [cq+64*27], m29
+ lea r4, [rsp+64*64]
+ lea r3, [rsp+64*32]
+ call .pass2_fast
+ pmulhrsw m0, m13, [cq+64*16]
+ pmulhrsw m1, m13, [cq+64*17]
+ pmulhrsw m2, m13, [cq+64*18]
+ pmulhrsw m3, m13, [cq+64*19]
+ pmulhrsw m4, m13, [cq+64*20]
+ pmulhrsw m5, m13, [cq+64*21]
+ pmulhrsw m6, m13, [cq+64*22]
+ pmulhrsw m7, m13, [cq+64*23]
+ pmulhrsw m14, m13, [cq+64*24]
+ pmulhrsw m15, m13, [cq+64*25]
+ pmulhrsw m16, m13, [cq+64*26]
+ pmulhrsw m17, m13, [cq+64*27]
+ pmulhrsw m18, m13, [cq+64*28]
+ pmulhrsw m19, m13, [cq+64*29]
+ pmulhrsw m20, m13, [cq+64*30]
+ pmulhrsw m21, m13, [cq+64*31]
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ mov r4, rsp
+ mov r3, cq
+ call .pass2_fast
+.end:
+ vpbroadcastd m17, [o(pw_2048)]
+ lea r5, [strideq*8]
+ mov r3, dstq
+ pxor m16, m16
+ sub r4, 64*5 ; rsp+64*31
+ mov r6, rsp
+.end_loop:
+ mova m2, [r6+64*32] ; idct16 0+n lo
+ mova m7, [r6+64*48] ; idct32 31-n lo
+ mova m6, [cq+64* 0] ; idct16 0+n hi
+ mova m0, [cq+64*16] ; idct32 31-n hi
+ mova m4, [r4+64*64] ; idct64 63-n lo
+ mova m1, [r4+64* 0] ; idct64 63-n hi
+ mova m5, [r6+64*64] ; idct64 32+n lo
+ mova m8, [r6+64* 0] ; idct64 32+n hi
+ sub r3, strideq
+ paddsw m3, m2, m7 ; idct32 0+n lo
+ mova m12, [dstq+r5*0]
+ psubsw m2, m7 ; idct32 31-n lo
+ mova m15, [r3 +r5*8]
+ paddsw m7, m6, m0 ; idct32 0+n hi
+ mova m13, [r3 +r5*4]
+ psubsw m6, m0 ; idct32 31-n hi
+ mova m14, [dstq+r5*4]
+ paddsw m0, m3, m4 ; out 0+n lo
+ add r6, 64
+ psubsw m3, m4 ; out 63-n lo
+ sub r4, 64
+ paddsw m4, m7, m1 ; out 0+n hi
+ mova [cq+64* 0], m16
+ psubsw m7, m1 ; out 63-n hi
+ mova [cq+64*16], m16
+ paddsw m1, m2, m5 ; out 31-n lo
+ add cq, 64
+ psubsw m2, m5 ; out 32+n lo
+ paddsw m5, m6, m8 ; out 31-n hi
+ psubsw m6, m8 ; out 32+n hi
+ pmulhrsw m0, m17
+ punpcklbw m8, m12, m16
+ pmulhrsw m4, m17
+ punpckhbw m12, m16
+ pmulhrsw m3, m17
+ punpcklbw m11, m15, m16
+ pmulhrsw m7, m17
+ punpckhbw m15, m16
+ pmulhrsw m1, m17
+ punpcklbw m9, m13, m16
+ pmulhrsw m5, m17
+ punpckhbw m13, m16
+ pmulhrsw m2, m17
+ punpcklbw m10, m14, m16
+ pmulhrsw m6, m17
+ punpckhbw m14, m16
+ paddw m0, m8
+ paddw m4, m12
+ packuswb m0, m4
+ paddw m3, m11
+ paddw m7, m15
+ packuswb m3, m7
+ paddw m1, m9
+ paddw m5, m13
+ packuswb m1, m5
+ paddw m2, m10
+ paddw m6, m14
+ packuswb m2, m6
+ mova [dstq+r5*0], m0
+ mova [r3 +r5*8], m3
+ mova [r3 +r5*4], m1
+ mova [dstq+r5*4], m2
+ add dstq, strideq
+ cmp r6, r4
+ jb .end_loop
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ or r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+ALIGN function_align
+.pass2_end:
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
+ mova [r4+64*20], m1
+ mova [r4+64*21], m3
+ mova [r4+64*22], m5
+ mova [r4+64*23], m7
+ vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03
+ mova [r4+64*12], m15
+ mova [r4+64*13], m17
+ mova [r4+64*14], m19
+ mova [r4+64*15], m21
+ vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63
+ vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m0, m1, m5, q2020 ; 0
+ vshufi32x4 m1, m5, q3131 ; 8
+ vshufi32x4 m2, m3, m14, q2020 ; 16
+ vshufi32x4 m3, m14, q3131 ; 24
+ vshufi32x4 m14, m15, m18, q2020 ; 4
+ vshufi32x4 m15, m18, q3131 ; 12
+ vshufi32x4 m16, m17, m19, q2020 ; 20
+ vshufi32x4 m17, m19, q3131 ; 28
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ vshufi32x4 m24, m22, m25, q3131 ; 10
+ vshufi32x4 m22, m25, q2020 ; 2
+ vshufi32x4 m25, m23, m28, q3131 ; 14
+ vshufi32x4 m23, m28, q2020 ; 6
+ vshufi32x4 m28, m26, m27, q3131 ; 26
+ vshufi32x4 m26, m27, q2020 ; 18
+ vshufi32x4 m27, m29, m13, q2020 ; 22
+ vshufi32x4 m29, m13, q3131 ; 30
+ mova [r6+64* 0], m0
+ mova [r6+64* 1], m1
+ mova [r6+64* 2], m2
+ mova [r6+64* 3], m3
+ mova [r6+64* 4], m4
+ mova [r6+64* 5], m5
+ mova [r6+64* 6], m6
+ mova [r6+64* 7], m7
+ mova [r6+64* 8], m14
+ mova [r6+64* 9], m15
+ mova [r6+64*10], m16
+ mova [r6+64*11], m17
+ mova [r6+64*12], m18
+ mova [r6+64*13], m19
+ mova [r6+64*14], m20
+ mova [r6+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r6+64*16], m29
+ mova [r6+64*17], m28
+ mova [r6+64*18], m27
+ mova [r6+64*19], m26
+ mova [r6+64*20], m25
+ mova [r6+64*21], m24
+ mova [r6+64*22], m23
+ mova [r6+64*23], m22
+ mova [r6+64*24], m21
+ mova [r6+64*25], m20
+ mova [r6+64*26], m19
+ mova [r6+64*27], m18
+ mova [r6+64*28], m17
+ mova [r6+64*29], m16
+ mova [r6+64*30], m15
+ mova [r6+64*31], m14
+ pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25
+ pmulhrsw m16, m13, [r4+64*12]
+ pmulhrsw m17, m13, [r4+64*16]
+ pmulhrsw m18, m13, [r4+64*20]
+ pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31
+ pmulhrsw m20, m13, [r4+64*15]
+ pmulhrsw m21, m13, [r4+64*19]
+ pmulhrsw m22, m13, [r4+64*23]
+ vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9
+ vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25
+ vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9
+ vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25
+ pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29
+ pmulhrsw m24, m13, [r4+64*14]
+ pmulhrsw m25, m13, [r4+64*18]
+ pmulhrsw m26, m13, [r4+64*22]
+ vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15
+ vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31
+ vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15
+ vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31
+ pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27
+ pmulhrsw m28, m13, [r4+64*13]
+ pmulhrsw m29, m13, [r4+64*17]
+ pmulhrsw m13, [r4+64*21]
+ vshufi32x4 m0, m14, m16, q2020 ; 1
+ vshufi32x4 m1, m19, m21, q3131 ; 31
+ vshufi32x4 m2, m15, m17, q2020 ; 17
+ vshufi32x4 m3, m18, m20, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m18, m20, q2020 ; 7
+ vshufi32x4 m1, m15, m17, q3131 ; 25
+ vshufi32x4 m2, m19, m21, q2020 ; 23
+ vshufi32x4 m3, m14, m16, q3131 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13
+ vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29
+ vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13
+ vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29
+ vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11
+ vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27
+ vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11
+ vshufi32x4 m29, m13, q3232 ; e19 e27 g19 g27
+ vshufi32x4 m0, m22, m24, q2020 ; 5
+ vshufi32x4 m1, m27, m29, q3131 ; 27
+ vshufi32x4 m2, m23, m25, q2020 ; 21
+ vshufi32x4 m3, m26, m28, q3131 ; 11
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m26, m28, q2020 ; 3
+ vshufi32x4 m1, m23, m25, q3131 ; 29
+ vshufi32x4 m2, m27, m29, q2020 ; 19
+ vshufi32x4 m3, m22, m24, q3131 ; 13
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ALIGN function_align
+.pass2_fast:
+ vshufi32x4 m23, m1, m16, q3131 ; 6
+ vshufi32x4 m22, m1, m16, q2020 ; 2
+ vshufi32x4 m14, m0, m3, q3131 ; 4
+ vshufi32x4 m26, m0, m3, q2020 ; 0
+ vshufi32x4 m28, m9, m15, q3131 ; 5
+ vshufi32x4 m0, m9, m15, q2020 ; 1
+ vshufi32x4 m16, m11, m17, q3131 ; 7
+ vshufi32x4 m29, m11, m17, q2020 ; 3
+ vshufi32x4 m15, m8, m2, q3131 ; 12
+ vshufi32x4 m27, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m5, m19, q3131 ; 14
+ vshufi32x4 m24, m5, m19, q2020 ; 10
+ vshufi32x4 m3, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m17, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m16
+ mova m3, m18
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m28
+ mova m3, m19
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m29
+ mova m3, m17
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, m26
+ mova m1, m27
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [r3+64* 0], m0
+ mova [r3+64* 1], m1
+ mova [r3+64* 2], m2
+ mova [r3+64* 3], m3
+ mova [r3+64* 4], m4
+ mova [r3+64* 5], m5
+ mova [r3+64* 6], m6
+ mova [r3+64* 7], m7
+ mova [r3+64* 8], m14
+ mova [r3+64* 9], m15
+ mova [r3+64*10], m16
+ mova [r3+64*11], m17
+ mova [r3+64*12], m18
+ mova [r3+64*13], m19
+ mova [r3+64*14], m20
+ mova [r3+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+ mova [r3+64*16], m29
+ mova [r3+64*17], m28
+ mova [r3+64*18], m27
+ mova [r3+64*19], m26
+ mova [r3+64*20], m25
+ mova [r3+64*21], m24
+ mova [r3+64*22], m23
+ mova [r3+64*23], m22
+ mova [r3+64*24], m21
+ mova [r3+64*25], m20
+ mova [r3+64*26], m19
+ mova [r3+64*27], m18
+ mova [r3+64*28], m17
+ mova [r3+64*29], m16
+ mova [r3+64*30], m15
+ mova [r3+64*31], m14
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm
new file mode 100644
index 0000000000..ec7e3a52f4
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_sse.asm
@@ -0,0 +1,6533 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+
+SECTION_RODATA 16
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+
+%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1
+pw_%1_m%2: times 4 dw %1, -%2
+%if %3 != 2
+pw_%2_%1: times 4 dw %2, %1
+%endif
+%if %3
+pw_m%1_m%2: times 4 dw -%1, -%2
+%endif
+%endmacro
+
+;adst4
+pw_1321_3803: times 4 dw 1321, 3803
+pw_2482_m1321: times 4 dw 2482, -1321
+pw_3344_2482: times 4 dw 3344, 2482
+pw_3344_m3803: times 4 dw 3344, -3803
+pw_3344_m3344: times 4 dw 3344, -3344
+pw_0_3344: times 4 dw 0, 3344
+pw_m6688_m3803: times 4 dw -6688, -3803
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784
+COEF_PAIR 799, 4017
+COEF_PAIR 3406, 2276
+COEF_PAIR 401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 3784, 1567, 1
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4017, 799, 1
+COEF_PAIR 201, 4091
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 4052, 601
+COEF_PAIR 2276, 3406, 1
+COEF_PAIR 4076, 401, 2
+COEF_PAIR 2598, 3166, 2
+COEF_PAIR 3612, 1931, 2
+COEF_PAIR 1189, 3920, 2
+
+pd_2048: times 4 dd 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pw_4096: times 8 dw 4096
+pw_16384: times 8 dw 16384
+pw_m16384: times 8 dw -16384
+pw_1697x16: times 8 dw 1697*16
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_3344x8: times 8 dw 3344*8
+pw_8192: times 8 dw 8192
+pw_m8192: times 8 dw -8192
+pw_5: times 8 dw 5
+pw_201x8: times 8 dw 201*8
+pw_4091x8: times 8 dw 4091*8
+pw_m2751x8: times 8 dw -2751*8
+pw_3035x8: times 8 dw 3035*8
+pw_1751x8: times 8 dw 1751*8
+pw_3703x8: times 8 dw 3703*8
+pw_m1380x8: times 8 dw -1380*8
+pw_3857x8: times 8 dw 3857*8
+pw_995x8: times 8 dw 995*8
+pw_3973x8: times 8 dw 3973*8
+pw_m2106x8: times 8 dw -2106*8
+pw_3513x8: times 8 dw 3513*8
+pw_2440x8: times 8 dw 2440*8
+pw_3290x8: times 8 dw 3290*8
+pw_m601x8: times 8 dw -601*8
+pw_4052x8: times 8 dw 4052*8
+
+pw_4095x8: times 8 dw 4095*8
+pw_101x8: times 8 dw 101*8
+pw_2967x8: times 8 dw 2967*8
+pw_m2824x8: times 8 dw -2824*8
+pw_3745x8: times 8 dw 3745*8
+pw_1660x8: times 8 dw 1660*8
+pw_3822x8: times 8 dw 3822*8
+pw_m1474x8: times 8 dw -1474*8
+pw_3996x8: times 8 dw 3996*8
+pw_897x8: times 8 dw 897*8
+pw_3461x8: times 8 dw 3461*8
+pw_m2191x8: times 8 dw -2191*8
+pw_3349x8: times 8 dw 3349*8
+pw_2359x8: times 8 dw 2359*8
+pw_4036x8: times 8 dw 4036*8
+pw_m700x8: times 8 dw -700*8
+pw_4065x8: times 8 dw 4065*8
+pw_501x8: times 8 dw 501*8
+pw_3229x8: times 8 dw 3229*8
+pw_m2520x8: times 8 dw -2520*8
+pw_3564x8: times 8 dw 3564*8
+pw_2019x8: times 8 dw 2019*8
+pw_3948x8: times 8 dw 3948*8
+pw_m1092x8: times 8 dw -1092*8
+pw_3889x8: times 8 dw 3889*8
+pw_1285x8: times 8 dw 1285*8
+pw_3659x8: times 8 dw 3659*8
+pw_m1842x8: times 8 dw -1842*8
+pw_3102x8: times 8 dw 3102*8
+pw_2675x8: times 8 dw 2675*8
+pw_4085x8: times 8 dw 4085*8
+pw_m301x8: times 8 dw -301*8
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r5-$$+x ; PIC
+%endif
+
+%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4]
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rotate 5
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+
+ movd m%3, [%%row_adr1] ;dst0
+ movd m%5, [%%row_adr2] ;dst1
+ punpckldq m%3, m%5 ;high: dst1 :low: dst0
+ movd m%4, [%%row_adr3] ;dst2
+ movd m%5, [%%row_adr4] ;dst3
+ punpckldq m%4, m%5 ;high: dst3 :low: dst2
+
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+
+ paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0
+ paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2
+
+ packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+
+ movd [%%row_adr1], m%3 ;store dst0 + out0
+ pshuflw m%4, m%3, q1032
+ movd [%%row_adr2], m%4 ;store dst1 + out1
+ punpckhqdq m%3, m%3
+ movd [%%row_adr3], m%3 ;store dst2 + out2
+ psrlq m%3, 32
+ movd [%%row_adr4], m%3 ;store dst3 + out3
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ mova m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+
+ WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4
+ ret
+%endmacro
+
+; flags: 1 = swap, 2: coef_regs, 4: no_pack
+%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
+%if %6 & 2
+ pmaddwd m%2, m%4, m%1
+ pmaddwd m%1, m%5
+%elif %6 & 1
+ pmaddwd m%2, m%1, [o(pw_%5_%4)]
+ pmaddwd m%1, [o(pw_%4_m%5)]
+%else
+ pmaddwd m%2, m%1, [o(pw_%4_m%5)]
+ pmaddwd m%1, [o(pw_%5_%4)]
+%endif
+ paddd m%2, m%3
+ paddd m%1, m%3
+ psrad m%2, 12
+ psrad m%1, 12
+%if %6 & 4 == 0
+ packssdw m%1, m%2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
+ mova m3, [o(pd_2048)]
+ punpckhwd m2, m0, m1 ;unpacked in1 in3
+ punpcklwd m0, m1 ;unpacked in0 in2
+ ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
+ ITX_MUL2X_PACK 0, 1, 3, 2896, 2896
+ psubsw m1, m0, m2 ;high: out2 ;low: out3
+ paddsw m0, m2 ;high: out1 ;low: out0
+%endmacro
+
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
+ %define %%p1 m(i%1_%3_internal_8bpc)
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
+ call %%p1
+ RET
+%%end:
+%else
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd ;0
+ pmulhrsw m0, m1
+ mova m1, m0
+ TAIL_CALL m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0] ;high: in1 ;low: in0
+ mova m1, [coeffq+16*1] ;high: in3 ;low in2
+
+ IDCT4_1D_PACKED
+
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m1, q0220
+ pshufb m0, m2 ;high: in1 ;low: in0
+ pshufb m1, m3, m2 ;high: in3 ;low :in2
+ jmp tx2q
+
+.pass2:
+ IDCT4_1D_PACKED
+
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
+
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call .main
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low :in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end:
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2:
+ ITX4_END 0, 1, 2, 3
+
+ALIGN function_align
+cglobal_label .main
+ punpcklwd m2, m0, m1 ;unpacked in0 in2
+ punpckhwd m0, m1 ;unpacked in1 in3
+ mova m3, m0
+ pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
+ pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
+ paddd m1, m0 ;t2
+ pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
+ paddd m4, m0 ;t0 + t3
+ pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m0, [o(pd_2048)]
+ paddd m1, m0 ;t2 + 2048
+ paddd m2, m0
+ paddd m0, m4 ;t0 + t3 + 2048
+ paddd m5, m2 ;t1 + t3 + 2048
+ paddd m2, m4
+ paddd m2, m3 ;t0 + t1 - t3 + 2048
+ REPX {psrad x, 12}, m1, m0, m5, m2
+ packssdw m0, m5 ;high: out1 ;low: out0
+ packssdw m1, m2 ;high: out3 ;low: out2
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2 ;high: in3 ;low :in2
+ punpckhwd m1, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+
+.end:
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m0, m3
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low :in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro IWHT4_1D_PACKED 0
+ punpckhqdq m3, m0, m1 ;low: in1 high: in3
+ punpcklqdq m0, m1 ;low: in0 high: in2
+ psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3
+ paddw m0, m3 ;low: in0 + in1 high: in2 + in3
+ punpckhqdq m2, m2 ;t2 t2
+ punpcklqdq m0, m0 ;t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1 ;t4 t4
+ psubw m1, m3 ;low: t1/out2 high: t3/out1
+ psubw m0, m1 ;high: out0
+ paddw m2, m1 ;low: out3
+%endmacro
+
+INIT_XMM sse2
+cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+ psraw m0, 2
+ psraw m1, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1
+ punpcklwd m3, m1, m2
+ punpckhdq m1, m0, m3
+ punpckldq m0, m3
+ IWHT4_1D_PACKED
+ shufpd m0, m2, 0x01
+ ITX4_END 0, 3, 2, 1, 0
+
+%macro IDCT8_1D_PACKED 0
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m0, m3 ;unpacked in1 in7
+ punpcklwd m0, m2 ;unpacked in0 in4
+ punpckhwd m2, m1 ;unpacked in5 in3
+ punpcklwd m1, m3 ;unpacked in2 in6
+ ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a
+ ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a
+ ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2
+ psubsw m3, m4, m2 ;low: t6a high: t5a
+ paddsw m4, m2 ;low: t7 high: t4
+ pshufb m3, [o(deint_shuf1)]
+ ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1
+ ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5
+ psubsw m2, m0, m1 ;low: tmp3 high: tmp2
+ paddsw m0, m1 ;low: tmp0 high: tmp1
+ punpcklqdq m1, m4, m3 ;low: t7 high: t6
+ punpckhqdq m4, m3 ;low: t4 high: t5
+ psubsw m3, m0, m1 ;low: out7 high: out6
+ paddsw m0, m1 ;low: out0 high: out1
+ paddsw m1, m2, m4 ;low: out3 high: out2
+ psubsw m2, m4 ;low: out4 high: out5
+%endmacro
+
+;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
+ punpckhwd m%4, m%1, m%2
+ punpcklwd m%1, m%2
+%if %7 < 8
+ pmaddwd m%2, m%7, m%1
+ pmaddwd m%3, m%7, m%4
+%else
+ mova m%2, [o(pw_%7_%6)]
+%if %8
+ pmaddwd m%3, m%1, m%2
+ pmaddwd m%2, m%4
+%else
+ pmaddwd m%3, m%4, m%2
+ pmaddwd m%2, m%1
+%endif
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 12
+ psrad m%2, 12
+%if %8
+ packssdw m%3, m%2
+%else
+ packssdw m%2, m%3 ;dst2
+%endif
+%if %7 < 8
+ pmaddwd m%4, m%6
+ pmaddwd m%1, m%6
+%elif %8
+ mova m%2, [o(pw_%6_m%7)]
+ pmaddwd m%4, m%2
+ pmaddwd m%1, m%2
+%else
+ mova m%3, [o(pw_%6_m%7)]
+ pmaddwd m%4, m%3
+ pmaddwd m%1, m%3
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 12
+ psrad m%1, 12
+ packssdw m%1, m%4 ;dst1
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
+ ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
+ psubsw m%3, m%1, m%2 ;out2
+ paddsw m%2, m%1 ;out1
+ paddsw m%1, m%5, m%4 ;out0
+ psubsw m%4, m%5 ;out3
+%endmacro
+
+%macro WRITE_4X8 4 ;row[1-4]
+ WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4
+%endmacro
+
+%macro INV_4X8 0
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m2 ;low: in2 high: in3
+ punpckldq m0, m2 ;low: in0 high: in1
+ punpckldq m2, m3, m4 ;low: in4 high: in5
+ punpckhdq m3, m4 ;low: in6 high: in7
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ pmulhrsw m0, [o(pw_2048)]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+INIT_XMM ssse3
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+
+cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(idct_8x4_internal_8bpc).main
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+
+.pass2:
+ call .main
+ shufps m1, m1, q1032
+ shufps m3, m3, q1032
+ mova m4, [o(pw_2048)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+
+.pass1_end:
+ INV_4X8
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call .main
+ mova m4, [o(pw_2048)]
+ pxor m5, m5
+ psubw m5, m4
+
+.end:
+ punpcklqdq m4, m5
+
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pxor m5, m5
+ mova [coeffq+16*0], m5
+ mova [coeffq+16*1], m5
+ mova [coeffq+16*2], m5
+ mova [coeffq+16*3], m5
+
+.end3:
+ WRITE_4X8 0, 1, 2, 3
+ RET
+
+ALIGN function_align
+cglobal_label .main
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m3, m0 ;unpacked in7 in0
+ punpckhwd m5, m2, m1 ;unpacked in5 in2
+ punpcklwd m1, m2 ;unpacked in3 in4
+ punpcklwd m0, m3 ;unpacked in1 in6
+ ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
+ ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
+ ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
+ ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
+
+ psubsw m3, m4, m1 ;low: t4 high: t5
+ paddsw m4, m1 ;low: t0 high: t1
+ psubsw m2, m5, m0 ;low: t6 high: t7
+ paddsw m5, m0 ;low: t2 high: t3
+
+ shufps m1, m3, m2, q1032
+ punpckhwd m2, m1
+ punpcklwd m3, m1
+ ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
+ ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
+
+ psubsw m1, m4, m5 ;low: t2 high: t3
+ paddsw m4, m5 ;low: out0 high: -out7
+ psubsw m5, m3, m2 ;low: t7 high: t6
+ paddsw m3, m2 ;low: out6 high: -out1
+ shufps m0, m4, m3, q3210 ;low: out0 high: -out1
+ shufps m3, m4, q3210 ;low: out6 high: -out7
+
+ mova m2, [o(pw_2896_m2896)]
+ mova m7, [o(pw_2896_2896)]
+ shufps m4, m1, m5, q1032 ;low: t3 high: t7
+ shufps m1, m5, q3210 ;low: t2 high: t6
+ punpcklwd m5, m1, m4
+ punpckhwd m1, m4
+ pmaddwd m4, m2, m1 ;-out5
+ pmaddwd m2, m5 ; out4
+ pmaddwd m1, m7 ; out2
+ pmaddwd m5, m7 ;-out3
+ REPX {paddd x, m6}, m4, m2, m1, m5
+ REPX {psrad x, 12}, m4, m2, m1, m5
+ packssdw m1, m5 ;low: out2 high: -out3
+ packssdw m2, m4 ;low: out4 high: -out5
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+
+ punpcklwd m4, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m5, m1, m0
+ punpckhwd m1, m0
+ punpckldq m2, m3, m1 ;low: in4 high: in5
+ punpckhdq m3, m1 ;low: in6 high: in7
+ punpckldq m0, m4, m5 ;low: in0 high: in1
+ punpckhdq m1, m4, m5 ;low: in2 high: in3
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ pshufd m2, m5, q1032
+ pshufd m3, m4, q1032
+ mova m5, [o(pw_2048)]
+ pxor m4, m4
+ psubw m4, m5
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+
+.pass2:
+ mova m4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+
+%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3]
+ movq m%3, [dstq ]
+ movq m%4, [dstq+strideq]
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ movq [dstq ], m%3
+ punpckhqdq m%3, m%3
+ movq [dstq+strideq], m%3
+%endmacro
+
+%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3]
+ WRITE_8X2 %1, %2, %5, %6, %7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X2 %3, %4, %5, %6, %7
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklqdq m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1
+ mova m2, [o(pw_2048)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_8x4_internal_8bpc).end2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ call m(idct_4x8_internal_8bpc).main
+
+ mova m4, [o(deint_shuf1)]
+ mova m5, [o(deint_shuf2)]
+ pshufb m0, m4
+ pshufb m1, m5
+ pshufb m2, m4
+ pshufb m3, m5
+ punpckhdq m4, m0, m1
+ punpckldq m0, m1
+ punpckhdq m5, m2, m3
+ punpckldq m2, m3
+ punpckhqdq m1, m0, m2 ;in1
+ punpcklqdq m0, m2 ;in0
+ punpckhqdq m3, m4, m5 ;in3
+ punpcklqdq m2, m4, m5 ;in2
+ jmp tx2q
+
+.pass2:
+ call .main
+ jmp m(iadst_8x4_internal_8bpc).end
+
+ALIGN function_align
+cglobal_label .main
+ mova m6, [o(pd_2048)]
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ pxor m5, m5
+ psubsw m3, m5, m1
+ psubsw m5, m4
+ punpckhdq m4, m5, m3
+ punpckldq m5, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m5 ;in1
+ punpcklwd m0, m5 ;in0
+ punpcklwd m2, m3, m4 ;in2
+ punpckhwd m3, m4 ;in3
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end:
+ mova m4, [o(pw_2048)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+
+.end2:
+ pxor m6, m6
+ mova [coeffq+16*0], m6
+ mova [coeffq+16*1], m6
+ mova [coeffq+16*2], m6
+ mova [coeffq+16*3], m6
+.end3:
+ WRITE_8X4 0, 1, 2, 3, 4, 5, 6
+ RET
+
+ALIGN function_align
+cglobal_label .main
+ punpckhwd m6, m0, m2 ;unpacked in0 in2
+ punpcklwd m0, m2 ;unpacked in0 in2
+ punpckhwd m7, m1, m3 ;unpacked in1 in3
+ punpcklwd m1, m3 ;unpacked in1 in3
+
+ mova m2, [o(pw_3344_m3344)]
+ mova m4, [o(pw_0_3344)]
+ pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
+ pmaddwd m5, m4, m7 ;3344 * in3
+ pmaddwd m2, m0
+ pmaddwd m4, m1
+ paddd m3, m5
+ paddd m2, m4
+ mova m4, [o(pd_2048)]
+ paddd m3, m4 ;t2 + 2048
+ paddd m2, m4
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+
+ pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m3, m4 ;t0 + t3
+
+ pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m4, [o(pd_2048)]
+ paddd m0, m4
+ paddd m4, m3 ;t0 + t3 + 2048
+ paddd m5, m0 ;t1 + t3 + 2048
+ paddd m3, m0
+ paddd m3, m1 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m3, 12 ;out3
+ packssdw m0, m4, m5 ;low: out0 high: out1
+
+ pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m1, m4 ;t0 + t3
+ pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+
+ mova m4, [o(pd_2048)]
+ paddd m6, m4
+ paddd m4, m1 ;t0 + t3 + 2048
+ paddd m5, m6 ;t1 + t3 + 2048
+ paddd m1, m6
+ paddd m1, m7 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m1, 12 ;out3
+ packssdw m3, m1 ;out3
+ packssdw m4, m5 ;low: out0 high: out1
+
+ punpckhqdq m1, m0, m4 ;out1
+ punpcklqdq m0, m4 ;out0
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+
+ punpckhwd m5, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+
+ pxor m0, m0
+ psubsw m4, m0, m2
+ psubsw m0, m5
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ punpckhwd m1, m0, m3 ;in1
+ punpcklwd m0, m3 ;in0
+ punpckhwd m3, m2, m4 ;in3
+ punpcklwd m2, m4 ;in2
+ jmp tx2q
+
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m4, m0
+ mova m5, m1
+ mova m0, m3
+ mova m1, m2
+ mova m2, m5
+ mova m3, m4
+ jmp m(iadst_8x4_internal_8bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0]
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m2, m2
+ paddsw m3, m3
+
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m5, m4, m1
+ punpckldq m4, m1
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m4 ;in1
+ punpcklwd m0, m4 ;in0
+ punpcklwd m2, m3, m5 ;in2
+ punpckhwd m3, m5 ;in3
+ jmp tx2q
+
+.pass2:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m2
+ psrlw m2, 3
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+.end:
+ mov r3d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
+.loop:
+ WRITE_8X4 0, 0, 0, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .loop
+ jmp tx2q
+.end3:
+ RET
+%endif
+%endmacro
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [%1+%2*0]
+ pmulhrsw m1, m7, [%1+%2*1]
+ pmulhrsw m2, m7, [%1+%2*2]
+ pmulhrsw m3, m7, [%1+%2*3]
+ pmulhrsw m4, m7, [%1+%2*4]
+ pmulhrsw m5, m7, [%1+%2*5]
+ pmulhrsw m6, m7, [%1+%2*6]
+ pmulhrsw m7, [%1+%2*7]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
+ ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
+ ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
+ psubsw m%2, m%4, m%5 ;t6a
+ paddsw m%4, m%5 ;t7
+ psubsw m%5, m%1, m%3 ;t5a
+ paddsw m%1, m%3 ;t4
+ ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.pass1_end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+
+cglobal_label .pass1_end3
+ punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
+ punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
+ punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
+ punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
+ punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
+ punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
+ punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
+ punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
+ punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
+ punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
+ mova [rsp+gprsize+16*2], m6
+ mova m6, [rsp+gprsize+16*1]
+ punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
+ punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
+ punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
+ punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
+ punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
+ punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
+
+ punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
+ punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
+ mova [rsp+gprsize+16*0], m2
+ punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
+ punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
+ punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
+ punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
+ mova m7, [rsp+gprsize+16*2]
+ punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
+ punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
+ mova m7, [rsp+gprsize+16*0]
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call .main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.end2:
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*0], m7
+
+.end3:
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
+ jmp tx2q
+
+.end4:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m1
+ mova m7, [o(pd_2048)]
+ IDCT4_1D 0, 2, 4, 6, 1, 3, 7
+ mova m3, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m2
+ mova m2, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m4
+ mova m4, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m6
+ IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
+ mova m6, [rsp+gprsize*2+16*0]
+ psubsw m7, m0, m4 ;out7
+ paddsw m0, m4 ;out0
+ mova [rsp+gprsize*2+16*0], m7
+ mova m1, [rsp+gprsize*2+16*2]
+ psubsw m4, m6, m3 ;out4
+ paddsw m3, m6 ;out3
+ mova m7, [rsp+gprsize*2+16*1]
+ psubsw m6, m1, m5 ;out6
+ paddsw m1, m5 ;out1
+ psubsw m5, m7, m2 ;out5
+ paddsw m2, m7 ;out2
+ ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+ call .main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end2
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call .main
+ call .main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
+ jmp m(idct_8x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m4
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
+ ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
+ paddsw m3, m2, m6 ;t2
+ psubsw m2, m6 ;t6
+ paddsw m4, m5, m1 ;t3
+ psubsw m5, m1 ;t7
+ ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
+
+ mova m6, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m5
+ mova m1, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m2
+ mova m5, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m3
+ ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
+ ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
+ psubsw m2, m0, m6 ;t4
+ paddsw m0, m6 ;t0
+ paddsw m3, m5, m1 ;t1
+ psubsw m5, m1 ;t5
+ ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
+
+ mova m7, [rsp+gprsize*2+16*0]
+ paddsw m1, m3, m4 ;-out7
+ psubsw m3, m4 ;t3
+ mova [rsp+gprsize*2+16*0], m1
+ psubsw m4, m0, m7 ;t2
+ paddsw m0, m7 ;out0
+ mova m6, [rsp+gprsize*2+16*2]
+ mova m7, [rsp+gprsize*2+16*1]
+ paddsw m1, m5, m6 ;-out1
+ psubsw m5, m6 ;t6
+ paddsw m6, m2, m7 ;out6
+ psubsw m2, m7 ;t7
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova [rsp+gprsize*2+16*1], m1
+ mova [rsp+gprsize*2+16*2], m6
+ punpckhwd m1, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m7, m5, m2
+ punpcklwd m5, m2
+ mova m2, [o(pw_2896_2896)]
+ mova m6, [o(pd_2048)]
+ pmaddwd m3, m2, m7
+ pmaddwd m2, m5
+ paddd m3, m6
+ paddd m2, m6
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+ mova m3, [o(pw_2896_m2896)]
+ pmaddwd m7, m3
+ pmaddwd m5, m3
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m5, m7 ;-out5
+ mova m3, [o(pw_2896_2896)]
+ pmaddwd m7, m3, m1
+ pmaddwd m3, m4
+ paddd m7, m6
+ paddd m3, m6
+ psrad m7, 12
+ psrad m3, 12
+ packssdw m3, m7 ;-out3
+ mova m7, [o(pw_2896_m2896)]
+ pmaddwd m1, m7
+ pmaddwd m4, m7
+ paddd m1, m6
+ paddd m4, m6
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m4, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*1]
+ mova m6, [rsp+gprsize*2+16*2]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ paddsw m7, m4, m3 ;t2 + t3
+ psubsw m4, m3 ;t2 - t3
+ paddsw m3, m5, m2 ;t6 + t7
+ psubsw m5, m2 ;t6 - t7
+ mova m2, [o(pw_2896x8)]
+ pmulhrsw m4, m2 ;out4
+ pmulhrsw m5, m2 ;-out5
+ pmulhrsw m7, m2 ;-out3
+ pmulhrsw m2, m3 ;out2
+ mova m3, m7
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_m16384)]
+
+.pass1_end1:
+ pmulhrsw m1, m7
+ mova [rsp+gprsize+16*1], m1
+ mova m1, m6
+ mova m6, m2
+ pmulhrsw m2, m5, m7
+ mova m5, m6
+ mova m6, m4
+ pmulhrsw m4, m3, m7
+ mova m3, m6
+ mova m6, m0
+ mova m0, m7
+ pxor m7, m7
+ psubw m7, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*2], m2
+ mova m2, m0
+ pxor m0, m0
+ psubw m0, m7
+ mova m7, m2
+ pmulhrsw m1, m0
+ pmulhrsw m2, m5, m0
+ mova [rsp+gprsize+16*1], m1
+ mova m5, m4
+ mova m1, m6
+ pmulhrsw m4, m3, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ mova m3, m5
+ mova [rsp+gprsize+16*0], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.end:
+ pmulhrsw m7, [o(pw_4096)]
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_4096)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).end3
+
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ pmulhrsw m0, [o(pw_16384)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, [o(pw_2048)]
+.end:
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ RET
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(idct_4x8_internal_8bpc).pass1)]
+
+.pass1:
+ mova m0, [coeffq+16*1]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ push tx2q
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
+ jmp r3
+
+.pass1_2:
+ mova [coeffq+16*1], m0
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ pop tx2q
+
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*7], m7
+ jmp tx2q
+
+.pass2:
+ call m(idct_16x4_internal_8bpc).main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+
+.end1:
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq
+ WRITE_4X8 0, 1, 3, 2
+
+ mova m0, [r3+16*4]
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 3, 2
+
+.end2:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
+ punpckhqdq m4, m5 ;low: out8 high: out10
+ punpcklqdq m5, m7, m2 ;low: out4 high: out6
+ punpckhqdq m2, m7 ;low: -out9 high: -out11
+ mova [coeffq+16*4], m2
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6]
+ mova m6, [coeffq+16*7]
+ punpckhqdq m1, m6, m0 ;low: -out13 high: -out15
+ punpcklqdq m0, m6 ;low: out0 high: out2
+ punpckhqdq m6, m3, m2 ;low: out12 high: out14
+ punpcklqdq m2, m3 ;low: -out1 high: -out3
+
+ mova m7, [o(pw_2048)]
+
+.end1:
+ REPX {pmulhrsw x, m7}, m0, m5, m4, m6
+ pxor m3, m3
+ psubw m3, m7
+ mova m7, [coeffq+16*4]
+ REPX {pmulhrsw x, m3}, m2, m7, m1
+ pmulhrsw m3, [coeffq+16*5]
+ mova [coeffq+16*7], m5
+
+ punpckhqdq m5, m4, m7 ;low: out10 high: out11
+ punpcklqdq m4, m7 ;low: out8 high: out9
+ punpckhqdq m7, m6, m1 ;low: out14 high: out15
+ punpcklqdq m6, m1 ;low: out12 high: out13
+ punpckhqdq m1, m0, m2 ;low: out2 high: out3
+ punpcklqdq m0, m2 ;low: out0 high: out1
+ mova [coeffq+16*4], m4
+ mova m4, [coeffq+16*7]
+ punpcklqdq m2, m4, m3 ;low: out4 high: out5
+ punpckhqdq m4, m3 ;low: out6 high: out7
+ mova m3, m4
+
+.end2:
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq
+ WRITE_4X8 0, 1, 2, 3
+
+ mova m0, [r3+16*4]
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 2, 3
+
+.end3:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpckhqdq m6, m5, m4 ;low: out5 high: out7
+ punpcklqdq m4, m5 ;low: -out8 high: -out10
+ punpckhqdq m5, m7, m2 ;low: -out4 high: -out6
+ punpcklqdq m2, m7 ;low: out9 high: out11
+ mova [coeffq+16*4], m2
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6]
+ mova m6, [coeffq+16*7]
+ punpcklqdq m1, m6, m0 ;low: out13 high: out15
+ punpckhqdq m0, m6 ;low: -out0 high: -out2
+ punpcklqdq m6, m3, m2 ;low: -out12 high: -out14
+ punpckhqdq m2, m3 ;low: out1 high: out3
+
+ mova m7, [o(pw_m2048)]
+ jmp m(iadst_4x16_internal_8bpc).end1
+
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4
+%else
+ paddsw m%1, m%1
+%endif
+ paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*1]
+ mova m6, [o(pw_1697x8)]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ pcmpeqw m7, m7
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_2)]
+.pass1:
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+ pavgw m4, m0
+ pcmpeqw m0, m7
+ pavgw m5, m1
+ pcmpeqw m1, m7
+ pandn m0, m4
+ pmulhrsw m4, m6, m2
+ pandn m1, m5
+ pmulhrsw m5, m6, m3
+ pavgw m4, m2
+ pcmpeqw m2, m7
+ pavgw m5, m3
+ pcmpeqw m3, m7
+ pandn m2, m4
+ pandn m3, m5
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+.pass1_2:
+ mova [coeffq+16*1], m0
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(.pass1_end)]
+ jmp .pass1
+.pass1_end:
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ jmp r3
+.pass2:
+ mova m7, [o(pw_1697x16)]
+ mova [coeffq+16*6], m6
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [coeffq+16*7]
+ IDTX16 6, 7, 7
+ mova [coeffq+16*7], m6
+ mova m6, [coeffq+16*6]
+ pmulhrsw m7, m6, [o(pw_1697x16)]
+ paddsw m6, m6
+ paddsw m6, m7
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ mov r2d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
+.dconly:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m5, m5
+.dconly_loop:
+ mova m1, [dstq]
+ mova m3, [dstq+strideq]
+ punpckhbw m2, m1, m5
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ mova [dstq], m1
+ mova [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ dec r2d
+ jg .dconly_loop
+ jmp tx2q
+.end:
+ RET
+%endif
+%endmacro
+
+%macro LOAD_7ROWS 2 ;src, stride
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+%endmacro
+
+%macro SAVE_7ROWS 2 ;dst, stride
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+%endmacro
+
+%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
+ punpckhwd m%5, m%4, m%1 ;packed in13 in3
+ punpcklwd m%1, m%4 ;packed in1 in15
+ punpcklwd m%4, m%3, m%2 ;packed in9 in7
+ punpckhwd m%2, m%3 ;packed in5 in11
+ mova m%7, [o(pd_2048)]
+ ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a
+ ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a
+ ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
+ psubsw m%6, m%1, m%4 ;low: t9 high: t14
+ paddsw m%1, m%4 ;low: t8 high: t15
+ psubsw m%4, m%5, m%2 ;low: t10 high: t13
+ paddsw m%5, m%2 ;low: t11 high: t12
+ mova m%2, [o(deint_shuf2)]
+ pshufb m%6, m%2
+ pshufb m%4, m%2
+ ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a
+ psubsw m%3, m%1, m%5 ;low: t11a high: t12a
+ paddsw m%1, m%5 ;low: t8a high: t15a
+ psubsw m%5, m%6, m%4 ;low: t10 high: t13
+ paddsw m%6, m%4 ;low: t9 high: t14
+ pshufb m%3, m%2
+ pshufb m%5, m%2
+ ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11
+ ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a
+ packssdw m%2, m%4 ;low: t11 high: t10a
+ packssdw m%3, m%5 ;low: t12 high: t13a
+ punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
+ punpcklqdq m%1, m%6 ;low: t8a high: t9
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call .main
+
+.pass1_end:
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpcklwd m2, m1, m3 ;packed out3, out7
+ punpckhwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpcklwd m6, m5, m7 ;packed out11, out15
+ punpckhwd m5, m7 ;packed out10, out14
+
+.pass1_end2:
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*6]
+ mova [coeffq+16*6], m7
+
+.pass1_end3:
+ punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high
+ punpcklwd m3, m6 ;packed 9, 11, 13, 15 low
+ punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high
+ punpcklwd m4, m5 ;packed 8, 10, 12, 14 low
+ punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1)
+ punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0)
+ punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3)
+ punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2)
+ mova [coeffq+16*7], m3
+ mova m3, [coeffq+16*6]
+ punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high
+ punpcklwd m3, m2 ;packed 1, 3, 5, 7 low
+ punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high
+ punpcklwd m0, m1 ;packed 0, 2, 4, 6 low
+ punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1)
+ punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0)
+ punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3)
+ punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2)
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
+
+.pass2_end:
+ mova [coeffq+16*4], m4
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ lea r3, [dstq+8]
+ call tx2q
+
+ add coeffq, 16*4
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mov dstq, r3
+ jmp tx2q
+
+ALIGN function_align
+cglobal_label .main
+ punpckhqdq m7, m0, m1 ;low:in1 high:in3
+ punpcklqdq m0, m1
+ punpcklqdq m1, m2, m3
+ punpckhqdq m3, m2 ;low:in7 high:in5
+ mova [coeffq+16*4], m7
+ mova [coeffq+16*5], m3
+ mova m7, [coeffq+16*7]
+ punpcklqdq m2, m4, m5
+ punpckhqdq m4, m5 ;low:in9 high:in11
+ punpcklqdq m3, m6, m7
+ punpckhqdq m7, m6 ;low:in15 high:in13
+ mova [coeffq+16*6], m4
+ IDCT8_1D_PACKED
+ mova m6, [coeffq+16*4]
+ mova m4, [coeffq+16*5]
+ mova m5, [coeffq+16*6]
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*6], m3
+
+ IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
+
+ mova m1, [coeffq+16*4]
+ psubsw m3, m0, m7 ;low:out15 high:out14
+ paddsw m0, m7 ;low:out0 high:out1
+ psubsw m7, m1, m5 ;low:out12 high:out13
+ paddsw m1, m5 ;low:out3 high:out2
+ mova [coeffq+16*7], m3
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*6]
+ psubsw m5, m2, m4 ;low:out11 high:out10
+ paddsw m2, m4 ;low:out4 high:out5
+ psubsw m4, m3, m6 ;low:out8 high:out9
+ paddsw m3, m6 ;low:out7 high:out6
+ mova m6, m7
+ ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call .main
+ call .main_pass1_end
+
+ punpckhwd m6, m7, m0 ;packed -out11, -out15
+ punpcklwd m0, m7 ;packed out0, out4
+ punpcklwd m7, m3, m4 ;packed -out3, -out7
+ punpckhwd m4, m3 ;packed out8, out12
+ mova m1, [coeffq+16*6]
+ punpcklwd m3, m1, m5 ;packed -out1, -out5
+ punpckhwd m5, m1 ;packed out10, out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpckhwd m3, m2, m1 ;packed -out9, -out13
+ punpcklwd m1, m2 ;packed out2, out6
+
+ mova m7, [o(pw_16384)]
+
+.pass1_end:
+ REPX {pmulhrsw x, m7}, m0, m1, m4, m5
+ pxor m2, m2
+ psubw m2, m7
+ mova m7, [coeffq+16*6]
+ REPX {pmulhrsw x, m2}, m7, m3, m6
+ pmulhrsw m2, [coeffq+16*7]
+ mova [coeffq+16*6], m7
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+ALIGN function_align
+cglobal_label .main
+ mova [coeffq+16*6], m0
+ pshufd m0, m1, q1032
+ pshufd m2, m2, q1032
+ punpckhwd m1, m6, m0 ;packed in13, in2
+ punpcklwd m0, m6 ;packed in3, in12
+ punpckhwd m7, m5, m2 ;packed in11, in4
+ punpcklwd m2, m5 ;packed in5, in10
+ mova m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
+ ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
+ ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
+ ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
+ psubsw m5, m1, m2 ;low:t10a high:t11a
+ paddsw m1, m2 ;low:t2a high:t3a
+ psubsw m2, m7, m0 ;low:t12a high:t13a
+ paddsw m7, m0 ;low:t4a high:t5a
+ punpcklqdq m0, m5
+ punpckhwd m0, m5 ;packed t10a, t11a
+ punpcklqdq m5, m2
+ punpckhwd m2, m5 ;packed t13a, t12a
+ ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
+ ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m7
+ mova m1, [coeffq+16*6]
+ mova m7, [coeffq+16*7]
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ punpckhwd m5, m7, m1 ;packed in15, in0
+ punpcklwd m1, m7 ;packed in1, in14
+ punpckhwd m7, m4, m3 ;packed in9, in6
+ punpcklwd m3, m4 ;packed in7, in8
+ ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
+ ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
+ ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
+ ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
+ psubsw m4, m5, m3 ;low:t8a high:t9a
+ paddsw m5, m3 ;low:t0a high:t1a
+ psubsw m3, m7, m1 ;low:t14a high:t15a
+ paddsw m7, m1 ;low:t6a high:t7a
+ punpcklqdq m1, m4
+ punpckhwd m1, m4 ;packed t8a, t9a
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t15a, t14a
+ ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
+ ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
+ paddsw m4, m1, m2 ;low:t12a high:t13a
+ psubsw m1, m2 ;low:t8a high:t9a
+ psubsw m2, m0, m3 ;low:t14a high:t15a
+ paddsw m0, m3 ;low:t10a high:t11a
+ punpcklqdq m3, m1
+ punpckhwd m3, m1 ;packed t12a, t13a
+ punpcklqdq m1, m2
+ punpckhwd m2, m1 ;packed t15a, t14a
+ ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
+ ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
+ psubsw m1, m3, m2 ;low:t14a high:t15a
+ paddsw m3, m2 ;low:out2 high:-out13
+ psubsw m2, m4, m0 ;low:t10 high:t11
+ paddsw m0, m4 ;low:-out1 high:out14
+ mova [coeffq+16*6], m0
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ psubsw m4, m5, m3 ;low:t4 high:t5
+ paddsw m5, m3 ;low:t0 high:t1
+ psubsw m3, m0, m7 ;low:t6 high:t7
+ paddsw m0, m7 ;low:t2 high:t3
+ punpcklqdq m7, m4
+ punpckhwd m7, m4 ;packed t4, t5
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t7, t6
+ ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
+ ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
+ psubsw m4, m5, m0 ;low:t2a high:t3a
+ paddsw m0, m5 ;low:out0 high:-out15
+ psubsw m5, m7, m3 ;low:t6 high:t7
+ paddsw m3, m7 ;low:-out3 high:out12
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m7, [o(deint_shuf1)]
+ mova [coeffq+16*4], m0
+ mova [coeffq+16*5], m3
+ mova m0, [o(pw_2896_m2896)]
+ mova m3, [o(pw_2896_2896)]
+ pshufb m1, m7 ;t14a t15a
+ pshufb m2, m7 ;t10 t11
+ pshufb m4, m7 ;t2a t3a
+ pshufb m5, m7 ;t6 t7
+ pmaddwd m7, m0, m2
+ pmaddwd m2, m3
+ paddd m7, m6
+ paddd m2, m6
+ psrad m7, 12
+ psrad m2, 12
+ packssdw m2, m7 ;low:out6 high:-out9
+ pmaddwd m7, m0, m4
+ pmaddwd m4, m3
+ paddd m7, m6
+ paddd m4, m6
+ psrad m7, 12
+ psrad m4, 12
+ packssdw m4, m7 ;low:-out7 high:out8
+ pmaddwd m7, m3, m5
+ pmaddwd m5, m0
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m7, m5 ;low:out4 high:-out11
+ pmaddwd m5, m3, m1
+ pmaddwd m1, m0
+ paddd m5, m6
+ paddd m1, m6
+ psrad m5, 12
+ psrad m1, 12
+ packssdw m5, m1 ;low:-out5 high:out10
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ mova m7, [o(pw_2896x8)]
+ punpckhqdq m6, m2, m1 ;low:t11 high:t15a
+ punpcklqdq m2, m1 ;low:t10 high:t14a
+ psubsw m1, m2, m6
+ paddsw m2, m6
+ punpckhqdq m6, m4, m5 ;low:t3a high:t7
+ punpcklqdq m4, m5 ;low:t2a high:t6
+ psubsw m5, m4, m6
+ paddsw m4, m6
+ pmulhrsw m1, m7 ;low:-out9 high:out10
+ pmulhrsw m2, m7 ;low:out6 high:-out5
+ pmulhrsw m5, m7 ;low:out8 high:-out11
+ pmulhrsw m4, m7 ;low:-out7 high:out4
+ punpckhqdq m7, m4, m5 ;low:out4 high:-out11
+ punpcklqdq m4, m5 ;low:-out7 high:out8
+ punpckhqdq m5, m2, m1 ;low:-out5 high:out10
+ punpcklqdq m2, m1 ;low:out6 high:-out9
+ ret
+
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_7ROWS coeffq, 16
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass1_end
+
+ punpcklwd m6, m7, m0 ;packed out11, out15
+ punpckhwd m0, m7 ;packed -out0, -out4
+ punpckhwd m7, m3, m4 ;packed out3, out7
+ punpcklwd m4, m3 ;packed -out8, -out12
+ mova m1, [coeffq+16*6]
+ punpckhwd m3, m1, m5 ;packed out1, out5
+ punpcklwd m5, m1 ;packed -out10, -out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpcklwd m3, m2, m1 ;packed out9, out13
+ punpckhwd m1, m2 ;packed -out2, -out6
+
+ mova m7, [o(pw_m16384)]
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m1, [coeffq+16*6]
+ mova m0, [coeffq+16*5]
+ mova m2, [coeffq+16*7]
+ mova m6, [o(pw_1697x16)]
+ mova m7, [o(pw_16384)]
+ pmulhrsw m4, m6, m1
+ pmulhrsw m3, m6, m0
+ pmulhrsw m5, m6, m2
+ pmulhrsw m4, m7
+ pmulhrsw m3, m7
+ pmulhrsw m5, m7
+ paddsw m1, m4
+ paddsw m0, m3
+ paddsw m5, m2
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mova m4, [coeffq+16*4]
+ mova [coeffq+16*6], m1
+ mova [coeffq+16*5], m0
+ mova [coeffq+16*7], m5
+ pmulhrsw m0, m6, m2
+ pmulhrsw m1, m6, m3
+ pmulhrsw m5, m6, m4
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ pmulhrsw m5, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m5
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m6, m1
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ paddsw m0, m5
+ paddsw m1, m6
+ mova m6, [coeffq+16*6]
+ mova m5, [coeffq+16*5]
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpckhwd m2, m1, m3 ;packed out3, out7
+ punpcklwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpckhwd m6, m5, m7 ;packed out11, out15
+ punpcklwd m5, m7 ;packed out10, out14
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+%macro SAVE_8ROWS 2 ;dst, stride
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+ mova [%1+%2*7], m7
+%endmacro
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ psrlw m2, 3 ; pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mov r3d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(idct_8x8_internal_8bpc).pass1)]
+
+.pass1:
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ mov [rsp+gprsize+16*11], tx2q
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ mov tx2q, [rsp+gprsize+16*11]
+ jmp r3
+
+.pass2:
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [coeffq+16*2 ], m1
+ mova [coeffq+16*6 ], m3
+ mova [coeffq+16*10], m5
+ mova [coeffq+16*14], m7
+ mova m1, m2
+ mova m2, m4
+ mova m3, m6
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+
+.pass2_main:
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [coeffq+16*2 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*10]
+ mova m3, [coeffq+16*14]
+ mova m4, [coeffq+16*3 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*15]
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
+ lea r3, [dstq+strideq*8]
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ mov tx2q, r3
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(.end1)]
+
+.end:
+ mova [rsp+gprsize+16*0], m7
+ mova [rsp+gprsize+16*1], m6
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [rsp+gprsize+16*1]
+ mova [rsp+gprsize+16*2], m5
+ IDTX16 6, 5, 7
+ mova m5, [rsp+gprsize+16*0]
+ IDTX16 5, 7, 7
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize+16*2]
+ mova [rsp+gprsize+16*0], m5
+ mova [rsp+gprsize+16*1], m6
+ mova [rsp+gprsize+16*2], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8, 8, 16*16
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r2d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*0, 32, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*1, 32, 1
+ call .main
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+ mova [rsp+gprsize*2+32*5], m5
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a
+ ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a
+ psubsw m2, m0, m4 ;t9
+ paddsw m0, m4 ;t8
+ psubsw m4, m7, m3 ;t14
+ paddsw m7, m3 ;t15
+ ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a
+ mova m3, [rsp+gprsize*2+16*1]
+ mova m5, [rsp+gprsize*2+32*5]
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+32*5], m4
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m7
+ ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a
+ ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a
+ psubsw m4, m2, m3 ;t10
+ paddsw m2, m3 ;t11
+ psubsw m3, m1, m5 ;t13
+ paddsw m1, m5 ;t12
+ ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a
+ mova m7, [rsp+gprsize*2+32*5]
+ psubsw m6, m0, m2 ;t11a
+ paddsw m0, m2 ;t8a
+ paddsw m2, m7, m3 ;t9
+ psubsw m7, m3 ;t10
+ mova m5, [rsp+gprsize*2+16*0]
+ psubsw m3, m5, m0 ;out8
+ paddsw m0, m5 ;out7
+ mova [rsp+gprsize*2+32*5], m0
+ mova m5, [rsp+gprsize*2+16*9]
+ psubsw m0, m5, m2 ;out9
+ paddsw m2, m5 ;out6
+ mova [rsp+gprsize*2+16*0], m0
+ mova [rsp+gprsize*2+16*9], m2
+ mova m0, [rsp+gprsize*2+16*1]
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*1], m3
+ psubsw m5, m0, m4 ;t13
+ paddsw m0, m4 ;t14
+ mova m3, [o(pd_2048)]
+ psubsw m4, m2, m1 ;t12a
+ paddsw m1, m2 ;t15a
+ mova [rsp+gprsize*2+16*2], m1
+ ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a
+ ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12
+ mova m3, [rsp+gprsize*2+16*8]
+ psubsw m2, m3, m5 ;out10
+ paddsw m3, m5 ;out5
+ mova m5, [rsp+gprsize*2+16*7]
+ mova [rsp+gprsize*2+16*8], m3
+ psubsw m3, m5, m4 ;out11
+ paddsw m5, m4 ;out4
+ mova m4, [rsp+gprsize*2+16*6]
+ mova [rsp+gprsize*2+16*7], m5
+ paddsw m5, m4, m6 ;out3
+ psubsw m4, m6 ;out12
+ mova m6, [rsp+gprsize*2+16*5]
+ mova [rsp+gprsize*2+16*6], m5
+ psubsw m5, m6, m7 ;out13
+ paddsw m6, m7 ;out2
+ mova m7, [rsp+gprsize*2+16*4]
+ mova [rsp+gprsize*2+16*5], m6
+ psubsw m6, m7, m0 ;out14
+ paddsw m7, m0 ;out1
+ mova m1, [rsp+gprsize*2+16*2]
+ mova m0, [rsp+gprsize*2+16*3]
+ mova [rsp+gprsize*2+16*4], m7
+ psubsw m7, m0, m1 ;out15
+ paddsw m0, m1 ;out0
+ mova [rsp+gprsize*2+16*3], m0
+ mova m1, [rsp+gprsize*2+16*0]
+ mova m0, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*0], m7
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call .main
+ call .main_pass1_end
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+ALIGN function_align
+cglobal_label .main
+ mova [rsp+gprsize*2+16*0], m1
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2
+ ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10
+ psubsw m1, m0, m4 ;t10a
+ paddsw m0, m4 ;t2a
+ psubsw m4, m7, m3 ;t11a
+ paddsw m3, m7 ;t3a
+ ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10
+ mova m2, [rsp+gprsize*2+16*0] ;in3
+ mova m7, [rsp+gprsize*2+16*1] ;in4
+ mova [rsp+gprsize*2+16*0], m1 ;t11
+ mova [rsp+gprsize*2+16*1], m4 ;t10
+ mova m1, [rsp+gprsize*2+16*2] ;in12
+ mova [rsp+gprsize*2+16*2], m0 ;t2a
+ ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4
+ ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12
+ psubsw m0, m7, m1 ;t12a
+ paddsw m1, m7 ;t4a
+ psubsw m4, m5, m2 ;t13a
+ paddsw m5, m2 ;t5a
+ ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13
+ mova m2, [rsp+gprsize*2+16*8] ;in1
+ mova m7, [rsp+gprsize*2+16*9] ;in14
+ mova [rsp+gprsize*2+16*8], m4 ;t12
+ mova [rsp+gprsize*2+16*9], m0 ;t13
+ mova m4, [rsp+gprsize*2+16*4] ;in9
+ mova m0, [rsp+gprsize*2+16*5] ;in6
+ mova [rsp+gprsize*2+16*4], m1 ;t4a
+ mova [rsp+gprsize*2+16*5], m5 ;t5a
+ ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14
+ ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6
+ psubsw m1, m0, m7 ;t14a
+ paddsw m0, m7 ;t6a
+ psubsw m5, m4, m2 ;t15a
+ paddsw m4, m2 ;t7a
+ ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*2] ;t2a
+ mova [rsp+gprsize*2+16*2], m5 ;t14
+ psubsw m7, m2, m0 ;t6
+ paddsw m2, m0 ;t2
+ psubsw m0, m3, m4 ;t7
+ paddsw m3, m4 ;t3
+ ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a
+ mova m4, [rsp+gprsize*2+16*7] ;in0
+ mova m5, [rsp+gprsize*2+32*5] ;in15
+ mova [rsp+gprsize*2+16*7], m3 ;t3
+ mova [rsp+gprsize*2+32*5], m1 ;t15
+ mova m1, [rsp+gprsize*2+16*6] ;in7
+ mova m3, [rsp+gprsize*2+16*3] ;in8
+ mova [rsp+gprsize*2+16*6], m7 ;t7a
+ mova [rsp+gprsize*2+16*3], m0 ;t6a
+ ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0
+ ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8
+ psubsw m0, m4, m3 ;t8a
+ paddsw m4, m3 ;t0a
+ psubsw m3, m5, m1 ;t9a
+ paddsw m5, m1 ;t1a
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8
+ mova m1, [rsp+gprsize*2+16*4] ;t4a
+ mova m7, [rsp+gprsize*2+16*5] ;t5a
+ mova [rsp+gprsize*2+16*4], m3 ;t8
+ mova [rsp+gprsize*2+16*5], m0 ;t9
+ psubsw m0, m4, m1 ;t4
+ paddsw m4, m1 ;t0
+ psubsw m3, m5, m7 ;t5
+ paddsw m5, m7 ;t1
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a
+ mova m7, [rsp+gprsize*2+16*3] ;t6a
+ psubsw m1, m4, m2 ;t2a
+ paddsw m4, m2 ;out0
+ mova [rsp+gprsize*2+16*3], m4 ;out0
+ mova m4, [rsp+gprsize*2+16*6] ;t7a
+ psubsw m2, m3, m7 ;t6
+ paddsw m3, m7 ;-out3
+ mova [rsp+gprsize*2+16*6], m3 ;-out3
+ psubsw m3, m0, m4 ;t7
+ paddsw m0, m4 ;out12
+ mova [rsp+gprsize*2+16*12], m3
+ mova m3, [rsp+gprsize*2+16*7] ;t3
+ mova [rsp+gprsize*2+16* 7], m2 ;out4
+ psubsw m2, m5, m3 ;t3a
+ paddsw m5, m3 ;-out15
+ mova [rsp+gprsize*2+16*11], m2
+ mova m2, [rsp+gprsize*2+32*5] ;t15
+ mova [rsp+gprsize*2+16*10], m1 ;-out7
+ mova m1, [rsp+gprsize*2+16*0] ;t11
+ mova [rsp+gprsize*2+16*0 ], m5 ;-out15
+ mova m3, [rsp+gprsize*2+16*1] ;t10
+ mova [rsp+gprsize*2+16*1 ], m4 ;-out11
+ mova m4, [rsp+gprsize*2+16*2] ;t14
+ mova [rsp+gprsize*2+16*2 ], m0 ;out12
+ psubsw m0, m3, m4 ;t14a
+ paddsw m3, m4 ;t10a
+ psubsw m5, m1, m2 ;t15a
+ paddsw m1, m2 ;t11a
+ ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*4] ;t8
+ mova m4, [rsp+gprsize*2+16*5] ;t9
+ mova [rsp+gprsize*2+16*4], m3 ;t10a
+ mova [rsp+gprsize*2+16*5], m1 ;t11a
+ mova m3, [rsp+gprsize*2+16*8] ;t12
+ mova m1, [rsp+gprsize*2+16*9] ;t13
+ mova [rsp+gprsize*2+16*8], m5 ;t14
+ mova [rsp+gprsize*2+16*9], m0 ;t15
+ psubsw m5, m2, m3 ;t12a
+ paddsw m2, m3 ;t8a
+ psubsw m0, m4, m1 ;t13a
+ paddsw m4, m1 ;t9a
+ ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12
+ mova m6, [rsp+gprsize*2+16*4] ;t10a
+ mova m1, [rsp+gprsize*2+16*5] ;t11a
+ psubsw m3, m2, m6 ;t10
+ paddsw m2, m6 ;-out1
+ paddsw m6, m4, m1 ;out14
+ psubsw m4, m1 ;t11
+ mova [rsp+gprsize*2+16*14], m4
+ mova [rsp+gprsize*2+16* 4], m2 ;-out1
+ mova m4, [rsp+gprsize*2+16*8] ;t14
+ mova m2, [rsp+gprsize*2+16*9] ;t15
+ mova [rsp+gprsize*2+16* 9], m3 ;out6
+ psubsw m3, m0, m4 ;t14a
+ paddsw m0, m4 ;out2
+ psubsw m4, m5, m2 ;t15a
+ paddsw m5, m2 ;-out13
+ mova [rsp+gprsize*2+16* 5], m0 ;out2
+ ret
+ALIGN function_align
+.main_pass1_end:
+ mova m0, [rsp+gprsize*2+16*14]
+ mova [rsp+gprsize*2+16*14], m5
+ mova [rsp+gprsize*2+16*15], m6
+ mova m5, [o(pw_2896_2896)]
+ mova m6, [o(pw_2896_m2896)]
+ mova m7, [o(pd_2048)]
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m4, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m4, m2, m1, m3
+ REPX {psrad x, 12}, m4, m1, m2, m3
+ packssdw m4, m1 ;-out5
+ packssdw m2, m3 ;out10
+ mova [rsp+gprsize*2+16* 8], m4
+ mova m3, [rsp+gprsize*2+16* 9]
+ punpcklwd m1, m3, m0
+ punpckhwd m3, m0
+ pmaddwd m0, m5, m1
+ pmaddwd m1, m6
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m0, m1, m4, m3
+ REPX {psrad x, 12}, m0, m4, m1, m3
+ packssdw m0, m4 ;out6
+ packssdw m1, m3 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ mova m0, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ punpcklwd m3, m0, m4
+ punpckhwd m0, m4
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ pmaddwd m5, m0
+ pmaddwd m0, m6
+ REPX {paddd x, m7}, m4, m3, m5, m0
+ REPX {psrad x, 12}, m4, m5, m3, m0
+ packssdw m4, m5 ;out4
+ packssdw m3, m0 ;-out11
+ mova [rsp+gprsize*2+16* 7], m4
+ mova m4, [rsp+gprsize*2+16*10]
+ mova m5, [rsp+gprsize*2+16*11]
+ punpcklwd m0, m4, m5
+ punpckhwd m4, m5
+ pmaddwd m5, m0, [o(pw_2896_2896)]
+ pmaddwd m0, m6
+ pmaddwd m6, m4
+ pmaddwd m4, [o(pw_2896_2896)]
+ REPX {paddd x, m7}, m5, m0, m6, m4
+ REPX {psrad x, 12}, m0, m6, m5, m4
+ packssdw m0, m6 ;out8
+ packssdw m5, m4 ;-out7
+ mova [rsp+gprsize*2+16*10], m5
+ mova m4, [rsp+gprsize*2+16* 2] ;out12
+ mova m5, [rsp+gprsize*2+16*14] ;-out13
+ mova m6, [rsp+gprsize*2+16*15] ;out14
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+ mova m7, [o(pw_2896x8)]
+ mova m1, [rsp+gprsize*2+16* 9]
+ mova m2, [rsp+gprsize*2+16*14]
+ paddsw m0, m1, m2
+ psubsw m1, m2
+ pmulhrsw m0, m7 ;out6
+ pmulhrsw m1, m7 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ psubsw m2, m3, m4
+ paddsw m3, m4
+ pmulhrsw m2, m7 ;out10
+ pmulhrsw m3, m7 ;-out5
+ mova [rsp+gprsize*2+16* 8], m3
+ mova m3, [rsp+gprsize*2+16* 7]
+ mova m4, [rsp+gprsize*2+16*12]
+ paddsw m0, m3, m4
+ psubsw m3, m4
+ pmulhrsw m0, m7 ;out4
+ pmulhrsw m3, m7 ;-out11
+ mova [rsp+gprsize*2+16* 7], m0
+ mova m0, [rsp+gprsize*2+16*10]
+ paddsw m4, m0, [rsp+gprsize*2+16*11]
+ psubsw m0, [rsp+gprsize*2+16*11]
+ pmulhrsw m4, m7 ;-out7
+ pmulhrsw m0, m7 ;out8
+ mova [rsp+gprsize*2+16*10], m4
+ mova m4, [rsp+gprsize*2+16*2 ] ;out12
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16*16
+ mova m4, [coeffq-16*7]
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+
+.pass1:
+ mova m0, [o(pw_2896x8)]
+ mova m2, [o(pw_1697x16)]
+ mova m3, [o(pw_16384)]
+ sub coeffq, 8*16
+ REPX {pmulhrsw x, m0}, m4, m5, m6, m7
+ pmulhrsw m1, m2, m4
+ pmulhrsw m1, m3
+ paddsw m1, m4 ; 1
+ pmulhrsw m4, m2, m5
+ pmulhrsw m4, m3
+ paddsw m4, m5 ; 3
+ pmulhrsw m5, m2, m6
+ pmulhrsw m5, m3
+ paddsw m5, m6 ; 5
+ pmulhrsw m6, m2, m7
+ pmulhrsw m6, m3
+ paddsw m7, m6 ; 7
+ pmulhrsw m6, m0, [coeffq+16*6]
+ mova [rsp+gprsize+16*0], m4
+ pmulhrsw m4, m2, m6
+ pmulhrsw m4, m3
+ paddsw m6, m4 ; 6
+ pmulhrsw m4, m0, [coeffq+16*4]
+ mova [rsp+gprsize+16*1], m6
+ pmulhrsw m6, m2, m4
+ pmulhrsw m6, m3
+ paddsw m4, m6 ; 4
+ pmulhrsw m6, m0, [coeffq+16*2]
+ pmulhrsw m0, [coeffq+16*0]
+ pmulhrsw m2, m6
+ pmulhrsw m2, m3
+ paddsw m2, m6 ; 2
+ pmulhrsw m6, m0, [o(pw_1697x16)]
+ pmulhrsw m6, m3
+ mova m3, [rsp+gprsize+16*0]
+ paddsw m0, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ mova [coeffq+16*1], m4
+ mova [coeffq+16*3], m5
+ mova [coeffq+16*5], m6
+ mova [coeffq+16*7], m7
+ mova m4, [coeffq-16*7]
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mova [coeffq-16*7], m0
+ mova [coeffq-16*5], m1
+ mova [coeffq-16*3], m2
+ mova [coeffq-16*1], m3
+ mov tx2q, r3
+ jmp .pass1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r2d, 8
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*3, 64
+ call m(idct_16x8_internal_8bpc).main
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ jmp m(idct_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ lea r3, [dstq+8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ mov dstq, r3
+
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*4 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*12]
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
+ jmp m(idct_8x16_internal_8bpc).pass2_main
+
+
+%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
+ mova m0, [coeffq+16*1 ]
+ mova m1, [coeffq+16*3 ]
+ mova m2, [coeffq+16*29]
+ mova m3, [coeffq+16*31]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ mova m0, [coeffq+16*13]
+ mova m1, [coeffq+16*15]
+ mova m2, [coeffq+16*17]
+ mova m3, [coeffq+16*19]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*5 ]
+ mova m1, [coeffq+16*7 ]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*11]
+ mova m4, [coeffq+16*21]
+ mova m5, [coeffq+16*23]
+ mova m6, [coeffq+16*25]
+ mova m7, [coeffq+16*27]
+%endmacro
+
+%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*2 ]
+ mova m2, [coeffq+16*28]
+ mova m3, [coeffq+16*30]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ mova m0, [coeffq+16*12]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*16]
+ mova m3, [coeffq+16*18]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m4, [coeffq+16*20]
+ mova m5, [coeffq+16*22]
+ mova m6, [coeffq+16*24]
+ mova m7, [coeffq+16*26]
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*1, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ jmp m(iadst_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ lea r3, [dstq+8]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ mov dstq, r3
+
+ mova m4, [coeffq+16*0 ]
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
+ jmp m(iadst_8x16_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*17, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS coeffq+16* 0, 32
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8]
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+
+ mova m4, [coeffq+16*0 ]
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+
+ lea tx2q, [o(.end2)]
+ mov dstq, r3
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_main
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+ pmulhrsw m%2, m%3, m%1
+ psraw m%2, 1
+ pavgw m%1, m%2
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16*17
+ mov r3, tx2q
+ lea tx2q, [o(.pass1_end)]
+
+.pass1:
+ mova m6, [o(pw_1697x16)]
+ mova m7, [coeffq+32*6]
+ mova m0, [coeffq+32*0]
+ mova m1, [coeffq+32*1]
+ mova m2, [coeffq+32*2]
+ mova m3, [coeffq+32*3]
+ mova m4, [coeffq+32*4]
+ REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
+ mova m5, [coeffq+32*5]
+ mova [rsp+gprsize+16*1], m7
+ IDTX16B 5, 7, 6
+ mova m7, [coeffq+32*7]
+ IDTX16B 7, 6, 6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 16
+ lea tx2q, [o(.pass1_end1)]
+ jmp .pass1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 15*16
+ lea tx2q, [o(.pass1_end2)]
+ jmp .pass1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 16
+ mov tx2q, r3
+ jmp .pass1
+
+.pass2:
+ lea r3, [dstq+8]
+ lea tx2q, [o(.end1)]
+
+.end:
+ mova [rsp+gprsize+16*0], m7
+ mova [rsp+gprsize+16*1], m4
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
+ mova m4, [o(pw_2048)]
+ pmulhrsw m5, m4
+ pmulhrsw m6, m4
+ mova [rsp+gprsize+16*2], m5
+ mova m5, [rsp+gprsize+16*1]
+ mova [rsp+gprsize+16*1], m6
+ IDTX16 5, 6, 7
+ mova m6, [rsp+gprsize+16*0]
+ IDTX16 6, 7, 7
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6
+ pmulhrsw m4, m5
+ mova [rsp+gprsize+16*0], m6
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+.end2:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8
+ LOAD_8ROWS coeffq, 32
+ lea tx2q, [o(.end3)]
+ mov dstq, r3
+ jmp .end
+
+.end3:
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_8x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m2
+ psrlw m2, 2 ;pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ mov r3d, 8
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+
+.end:
+ RET
+
+
+
+cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ cmp eobd, 106
+ jle .fast
+
+ LOAD_8ROWS coeffq+16*3, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1:
+ mova [rsp+gprsize+16*9 ], m0 ;in24
+ mova [rsp+gprsize+16*10], m4 ;in28
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_1:
+ mova [rsp+gprsize+16*7 ], m0 ;in16
+ mova [rsp+gprsize+16*8 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+.fast:
+ LOAD_8ROWS coeffq+16*1, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ mova [rsp+gprsize+16*5 ], m0 ;in8
+ mova [rsp+gprsize+16*6 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+ mova m1, m4 ;in4
+ mova m2, [rsp+gprsize+16*5 ] ;in8
+ mova m3, [rsp+gprsize+16*6 ] ;in12
+
+ cmp eobd, 106
+ jg .full
+
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ mova m0, [rsp+gprsize+16*11]
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call .main_fast
+ jmp .pass2
+
+.full:
+ mova m4, [rsp+gprsize+16*7 ] ;in16
+ mova m5, [rsp+gprsize+16*8 ] ;in20
+ mova m6, [rsp+gprsize+16*9 ] ;in24
+ mova m7, [rsp+gprsize+16*10] ;in28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+ call .main
+
+.pass2:
+ lea r3, [o(.end6)]
+
+.end:
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.end2)]
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+
+ jmp tx2q
+
+.end2:
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3:
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end4:
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end5:
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end6:
+ ret
+
+ALIGN function_align
+cglobal_label .main_veryfast ;only in1, in3, in5 and in7 are non-zero
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31
+ pmulhrsw m0, [o(pw_201x8)] ;t16,t17
+ mova m7, [o(pd_2048)]
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*20], m3 ;t17a
+ mova [rsp+gprsize*2+16*33], m0 ;t30a
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29
+ pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19
+ mova [rsp+gprsize*2+16*22], m1 ;t19
+ mova [rsp+gprsize*2+16*31], m2 ;t28
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m2 ;t18a
+ mova [rsp+gprsize*2+16*32], m1 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27
+ pmulhrsw m0, [o(pw_995x8)] ;t20, t21
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m3 ;t21a
+ mova [rsp+gprsize*2+16*29], m0 ;t26a
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pxor m0, m0
+ mova m3, m0
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main_fast ;bottom half is zero
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a
+ pmulhrsw m0, [o(pw_201x8)] ;t16a
+ pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a
+ pmulhrsw m1, [o(pw_m2751x8)] ;t17a
+ mova m7, [o(pd_2048)]
+ psubsw m4, m0, m1 ;t17
+ paddsw m0, m1 ;t16
+ psubsw m5, m3, m2 ;t30
+ paddsw m3, m2 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m3, m0, [o(pw_3703x8)]
+ pmulhrsw m0, [o(pw_1751x8)]
+ pmulhrsw m2, m1, [o(pw_3857x8)]
+ pmulhrsw m1, [o(pw_m1380x8)]
+ psubsw m4, m1, m0 ;t18
+ paddsw m0, m1 ;t19
+ psubsw m5, m2, m3 ;t29
+ paddsw m3, m2 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ pmulhrsw m3, m0, [o(pw_3973x8)]
+ pmulhrsw m0, [o(pw_995x8)]
+ pmulhrsw m2, m1, [o(pw_3513x8)]
+ pmulhrsw m1, [o(pw_m2106x8)]
+ psubsw m4, m0, m1 ;t21
+ paddsw m0, m1 ;t20
+ psubsw m5, m3, m2 ;t26
+ paddsw m3, m2 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pmulhrsw m3, m0, [o(pw_3290x8)]
+ pmulhrsw m0, [o(pw_2440x8)]
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main
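+                                         ;odd half of the 32-point IDCT, full version: all odd inputs in1-in31 are used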
+ mova m7, [o(pd_2048)]
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ mova m2, [rsp+gprsize*2+16*33] ;in17
+ mova m3, [rsp+gprsize*2+16*34] ;in31
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a
+ psubsw m4, m0, m2 ;t17
+ paddsw m0, m2 ;t16
+ psubsw m5, m3, m1 ;t30
+ paddsw m3, m1 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ mova m2, [rsp+gprsize*2+16*31] ;in25
+ mova m3, [rsp+gprsize*2+16*32] ;in23
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a
+ psubsw m4, m2, m0 ;t18
+ paddsw m0, m2 ;t19
+ psubsw m5, m1, m3 ;t29
+ paddsw m3, m1 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ mova m2, [rsp+gprsize*2+16*29] ;in21
+ mova m3, [rsp+gprsize*2+16*30] ;in27
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a
+ psubsw m4, m0, m2 ;t21
+ paddsw m0, m2 ;t20
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m1, [rsp+gprsize*2+16*26] ;in3
+ mova m2, [rsp+gprsize*2+16*27] ;in29
+ mova m3, [rsp+gprsize*2+16*28] ;in19
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a
+
+.main2:
+ psubsw m4, m2, m0 ;t22
+ paddsw m0, m2 ;t23
+ psubsw m5, m1, m3 ;t25
+ paddsw m3, m1 ;t24
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a
+ mova m2, [rsp+gprsize*2+16*24] ;t21a
+ psubsw m1, m5, m2 ;t21
+ paddsw m5, m2 ;t22
+ mova [rsp+gprsize*2+16*25], m5 ;t22
+ mova m2, [rsp+gprsize*2+16*29] ;t26a
+ psubsw m5, m4, m2 ;t26
+ paddsw m4, m2 ;t25
+ mova [rsp+gprsize*2+16*28], m4 ;t25
+ ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m1 ;t26a
+
+ mova m1, [rsp+gprsize*2+16*23] ;t20
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ psubsw m2, m0, m1 ;t20a
+ paddsw m0, m1 ;t23a
+ psubsw m6, m3, m5 ;t27a
+ paddsw m3, m5 ;t24a
+ ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27
+ mova [rsp+gprsize*2+16*26], m0 ;t23a
+ mova [rsp+gprsize*2+16*27], m3 ;t24a
+ mova [rsp+gprsize*2+16*30], m2 ;t27
+
+ mova m0, [rsp+gprsize*2+16*20] ;t17a
+ mova m1, [rsp+gprsize*2+16*21] ;t18a
+ mova m2, [rsp+gprsize*2+16*32] ;t29a
+ mova m3, [rsp+gprsize*2+16*33] ;t30a
+ psubsw m4, m0, m1 ;t18
+ paddsw m0, m1 ;t17
+ psubsw m5, m3, m2 ;t29
+ paddsw m3, m2 ;t30
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a
+ mova [rsp+gprsize*2+16*20], m0 ;t17
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova [rsp+gprsize*2+16*33], m3 ;t30
+ mova m0, [rsp+gprsize*2+16*19] ;t16
+ mova m1, [rsp+gprsize*2+16*22] ;t19
+ mova m2, [rsp+gprsize*2+16*31] ;t28
+ mova m3, [rsp+gprsize*2+16*34] ;t31
+ psubsw m4, m0, m1 ;t19a
+ paddsw m0, m1 ;t16a
+ psubsw m5, m3, m2 ;t28a
+ paddsw m3, m2 ;t31a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28
+ mova m2, [rsp+gprsize*2+16*15] ;tmp12
+ psubsw m1, m5, m6 ;t20a
+ paddsw m5, m6 ;t19a
+ psubsw m6, m2, m5 ;out19
+ paddsw m2, m5 ;out12
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ mova [rsp+gprsize*2+16*22], m6 ;out19
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m6, m4, m5 ;t27a
+ paddsw m4, m5 ;t28a
+ ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp3
+ psubsw m5, m2, m4 ;out28
+ paddsw m2, m4 ;out3
+ mova m4, [rsp+gprsize*2+16*14] ;tmp11
+ mova [rsp+gprsize*2+16*31], m5 ;out28
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m5, m4, m6 ;out20
+ paddsw m4, m6 ;out11
+ mova m2, [rsp+gprsize*2+16*7 ] ;tmp4
+ mova [rsp+gprsize*2+16*23], m5 ;out20
+ mova [rsp+gprsize*2+16*14], m4 ;out11
+ psubsw m5, m2, m1 ;out27
+ paddsw m2, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*26] ;t23a
+ mova m4, [rsp+gprsize*2+16*27] ;t24a
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*7 ], m2 ;out4
+ psubsw m5, m0, m1 ;t23
+ paddsw m0, m1 ;t16
+ psubsw m2, m3, m4 ;t24
+ paddsw m3, m4 ;t31
+ ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a
+ mova m6, [rsp+gprsize*2+16*18] ;tmp15
+ psubsw m4, m6, m0 ;out16
+ paddsw m6, m0 ;out15
+ mova m0, [rsp+gprsize*2+16*3 ] ;tmp0
+ mova m1, [rsp+gprsize*2+16*11] ;tmp8
+ mova [rsp+gprsize*2+16*18], m6 ;out15
+ mova [rsp+gprsize*2+16*19], m4 ;out16
+ psubsw m6, m0, m3 ;out31
+ paddsw m0, m3 ;out0
+ psubsw m4, m1, m2 ;out23
+ paddsw m1, m2 ;out8
+ mova m3, [rsp+gprsize*2+16*10] ;tmp7
+ mova [rsp+gprsize*2+16*34], m6 ;out31
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+ mova [rsp+gprsize*2+16*26], m4 ;out23
+ paddsw m6, m3, m5 ;out7
+ psubsw m3, m5 ;out24
+ mova m1, [rsp+gprsize*2+16*20] ;t17
+ mova m5, [rsp+gprsize*2+16*25] ;t22
+ mova m2, [rsp+gprsize*2+16*17] ;tmp14
+ mova [rsp+gprsize*2+16*27], m3 ;out24
+ psubsw m4, m1, m5 ;t22a
+ paddsw m1, m5 ;t17a
+ psubsw m3, m2, m1 ;out17
+ paddsw m2, m1 ;out14
+ mova m5, [rsp+gprsize*2+16*28] ;t25
+ mova m1, [rsp+gprsize*2+16*33] ;t30
+ mova [rsp+gprsize*2+16*17], m2 ;out14
+ mova [rsp+gprsize*2+16*20], m3 ;out17
+ psubsw m2, m1, m5 ;t25a
+ paddsw m1, m5 ;t30a
+ ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25
+ mova m5, [rsp+gprsize*2+16*4 ] ;tmp1
+ psubsw m3, m5, m1 ;out30
+ paddsw m5, m1 ;out1
+ mova m1, [rsp+gprsize*2+16*12] ;tmp9
+ mova [rsp+gprsize*2+16*33], m3 ;out30
+ mova [rsp+gprsize*2+16*4 ], m5 ;out1
+ psubsw m3, m1, m2 ;out22
+ paddsw m1, m2 ;out9
+ mova m5, [rsp+gprsize*2+16*9 ] ;tmp6
+ mova [rsp+gprsize*2+16*25], m3 ;out22
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+ psubsw m3, m5, m4 ;out25
+ paddsw m5, m4 ;out6
+ mova m4, [rsp+gprsize*2+16*21] ;t18a
+ mova m1, [rsp+gprsize*2+16*24] ;t21a
+ mova m2, [rsp+gprsize*2+16*16] ;tmp13
+ mova [rsp+gprsize*2+16*28], m3 ;out25
+ mova [rsp+gprsize*2+16*9 ], m5 ;out6
+ paddsw m3, m4, m1 ;t18
+ psubsw m4, m1 ;t21
+ psubsw m5, m2, m3 ;out18
+ paddsw m2, m3 ;out13
+ mova m1, [rsp+gprsize*2+16*29] ;t26a
+ mova m3, [rsp+gprsize*2+16*32] ;t29a
+ mova [rsp+gprsize*2+16*21], m5 ;out18
+ mova [rsp+gprsize*2+16*16], m2 ;out13
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t29
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a
+ mova m2, [rsp+gprsize*2+16*5 ] ;tmp2
+ psubsw m1, m2, m3 ;out29
+ paddsw m2, m3 ;out2
+ mova m3, [rsp+gprsize*2+16*13] ;tmp10
+ mova [rsp+gprsize*2+16*32], m1 ;out29
+ psubsw m7, m3, m5 ;out21
+ paddsw m3, m5 ;out10
+ mova m5, [rsp+gprsize*2+16*8 ] ;tmp5
+ mova [rsp+gprsize*2+16*24], m7 ;out21
+ mova [rsp+gprsize*2+16*13], m3 ;out10
+ psubsw m1, m5, m4 ;out26
+ paddsw m5, m4 ;out5
+ mova m7, m6 ;out7
+ mova m3, [rsp+gprsize*2+16*6 ] ;out3
+ mova m4, [rsp+gprsize*2+16*7 ] ;out4
+ mova [rsp+gprsize*2+16*29], m1 ;out26
+ mova m6, [rsp+gprsize*2+16*9 ] ;out6
+ mova m1, [rsp+gprsize*2+16*4 ] ;out1
+ ret
+
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_32x8_internal_8bpc)
+ RET
+
+.dconly:
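+                                         ;DC-only: only coeff[0] is non-zero, so compute its reconstructed value once, broadcast it and add it to every pixel (r3d rows of 32)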
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 8
+ lea tx2q, [o(.end)]
+
+.body:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m5, m5
+
+.loop:
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ punpckhbw m2, m1, m5
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ jmp tx2q
+
+.end:
+ RET
+
+
+cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*0, 64
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*2, 64
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+16*1, 32
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
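+                                         ;eob <= 106: the odd inputs in17-in31 are assumed zero, so skip loading them and use main_fast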
+ cmp eobd, 106
+ jg .full
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp .pass2
+
+.full:
+ LOAD_8ROWS coeffq+16*17, 32
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.end)]
+ jmp m(idct_8x32_internal_8bpc).end1
+
+.end:
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end1:
+ lea r3, [dstq+8]
+ lea tx2q, [o(.end2)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end3:
+ mov dstq, r3
+ add r3, 8
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end4:
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end5:
+ mov dstq, r3
+ add r3, 8
+ lea tx2q, [o(.end6)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end6:
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end7:
+ mov dstq, r3
+ lea tx2q, [o(.end8)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end8:
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d
+ mov r3d, tx2d
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+.loop:
+ LOAD_8ROWS coeffq+16*0, 64
+ paddsw m6, [o(pw_5)]
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_5)]
+ REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ add coeffq, 16
+ dec r3d
+ jg .loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d
+ mov r3d, tx2d
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+.loop:
+ LOAD_8ROWS coeffq+16*0, 16
+ pmulhrsw m6, [o(pw_4096)]
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_4096)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+
+ mov [rsp+16*3], dstq
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+ call m(idct_8x8_internal_8bpc).end3
+
+ add coeffq, 16*8
+ mov dstq, [rsp+16*3]
+ lea dstq, [dstq+8]
+ dec r3d
+ jg .loop
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_16x32_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r2d, 16
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq+16*1, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*5, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*33, 64 ;in8~in15
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ mova [coeffq+16*1 ], m0 ;in8
+ mova [coeffq+16*5 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*32, 64 ;in0~in7
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+
+ cmp eobd, 150
+ jg .full
+
+ mova m1, m4 ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [rsp+gprsize+16*11] ;in2
+ mova m1, [rsp+gprsize+16*12] ;in6
+ mova m2, [rsp+gprsize+16*13] ;in10
+ mova m3, [rsp+gprsize+16*14] ;in14
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp .pass2
+
+.full:
+ mova [coeffq+16*0 ], m0 ;in0
+ mova [coeffq+16*4 ], m4 ;in4
+
+ LOAD_8ROWS coeffq+16*2, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*6, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*34, 64 ;in16~in23
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ mova [coeffq+16*2 ], m0 ;in16
+ mova [coeffq+16*6 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+ LOAD_8ROWS coeffq+16*3, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*7, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS coeffq+16*35, 64 ;in24~in31
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ mova m6, m0 ;in24
+ mova m7, m4 ;in28
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*4 ] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*6 ] ;in20
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mov [rsp+gprsize*1+16*35], eobd
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3
+ lea r3, [o(.end)]
+ jmp m(idct_8x32_internal_8bpc).end
+
+.end:
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov eobd, [rsp+gprsize*1+16*35]
+ add coeffq, 16*32
+
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*16] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*17] ;in12
+
+ cmp eobd, 150
+ jg .full1
+
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp m(idct_8x32_internal_8bpc).pass2
+
+.full1:
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*18] ;in20
+ mova m6, [coeffq+16*3 ] ;in24
+ mova m7, [coeffq+16*19] ;in28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ mova m4, [coeffq+16*10] ;in18
+ mova m5, [coeffq+16*26] ;in22
+ mova m6, [coeffq+16*11] ;in26
+ mova m7, [coeffq+16*27] ;in30
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp m(idct_8x32_internal_8bpc).pass2
+
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_32x16_internal_8bpc)
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*11, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*19, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*27, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r3d, 16
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ add coeffq, 16
+ lea r3, [o(.pass1_end1)]
+.pass1:
+ LOAD_8ROWS coeffq+16*0, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+16*2, 64, 1
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ LOAD_8ROWS coeffq+16*34, 64, 1
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main
+
+.pass1_end:
+ mova [rsp+gprsize+16*0 ], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+16*32, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*48, 32
+
+ sub coeffq, 16
+ lea r3, [o(.end)]
+ jmp .pass1
+
+.end:
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r4d, eobd
+ cmp eobd, 43 ;if (eob > 43)
+ sbb r3d, r3d ; iteration_count++
+ cmp r4d, 150 ;if (eob > 150)
+ sbb r3d, 0 ; iteration_count++
+ cmp r4d, 278 ;if (eob > 278)
+ sbb r3d, -4 ; iteration_count++
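+                                         ;the sbb chain above yields r3d = 1 if eob < 43, 2 if eob < 150, 3 if eob < 278, otherwise 4 loop iterations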
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+ mov [rsp+gprsize+16*3], r3d
+ mov [rsp+gprsize*2+16*3], coeffq
+
+.loop:
+ LOAD_8ROWS coeffq, 64, 1
+ mova [rsp+16*1], m6
+ pxor m6, m6
+ REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*0], m2
+ mova [rsp+16*1], m3
+ mova [rsp+16*2], m4
+ mova m3, [o(pw_1697x16)]
+ mova m4, [o(pw_16384)]
+ REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
+ mova m2, [o(pw_8192)]
+ REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1
+ mova m2, [rsp+16*0]
+ mova [rsp+16*0], m7
+ IDTX16 2, 7, 3, 4
+ mova m7, [rsp+16*2]
+ mova [rsp+16*2], m5
+ IDTX16 7, 5, 3, 4
+ mova m5, [rsp+16*1]
+ mova [rsp+16*1], m6
+ pmulhrsw m3, m5
+ pmulhrsw m3, m4
+ psrlw m4, 1 ; pw_8192
+ paddsw m3, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pmulhrsw m4, m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ add coeffq, 16
+ dec r3d
+ jg .loop
+ mov coeffq, [rsp+gprsize*2+16*3]
+ add coeffq, 64*8
+ mov r3d, [rsp+gprsize+16*3]
+ xor dstq, dstq
+ mov [rsp+gprsize+16*3], dstq
+ mov dstq, [rsp+16*3]
+ test r3d, r3d
+ jnz .loop
+ RET
+
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+ mov r4d, 12 ;1100b
+ mov r5d, 136 ;1000 1000b
+ cmp eobd, 44 ;if (eob > 43)
+ cmovns r4d, r5d ; iteration_count+2
+ cmp eobd, 151 ;if (eob > 150)
+ mov r3d, 34952 ;1000 1000 1000 1000b
+ cmovs r3d, r4d ; iteration_count += 4
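+                                         ;r3d holds a 2-bit-per-iteration schedule: 1100b (eob < 44), 1000 1000b (eob < 151) or 1000 1000 1000 1000b;
+                                         ;.loop_end shifts it right by 2 each iteration and steps to the next 8-column group when bit 1 comes up clear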
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+
+.loop:
+ LOAD_8ROWS coeffq, 32, 1
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*1], m5
+ mova [rsp+16*2], m6
+ mova m6, [o(pw_1697x16)]
+ REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
+ pmulhrsw m7, [o(pw_2048)]
+ mova m5, [rsp+16*1]
+ mova [rsp+16*0], m7
+ IDTX16 5, 7, 6
+ mova m7, [rsp+16*2]
+ IDTX16 7, 6, 6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+
+.loop_end:
+ add coeffq, 16
+ shr r3d, 2
+ jz .ret
+ test r3d, 2
+ jnz .loop
+ mov r4d, r3d
+ and r4d, 1
+ lea coeffq, [coeffq+r4*8+32*7]
+ mov dstq, [rsp+16*3]
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+ jmp .loop
+
+.ret:
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_32x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 32
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*35], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
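+                                         ;r3d = 2 pass-1 iterations if eob < 136, else 4; the saved (eob-136) sign selects the fast or full butterflies in both passes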
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*35], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*1, 64*2
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*35]
+ test tx2d, tx2d
+ jl .fast
+
+.full:
+ LOAD_8ROWS coeffq+64*0, 64*4
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*2, 64*4
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*17, 64*2
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp .pass1_end
+
+.fast:
+ mova m0, [coeffq+256*0]
+ mova m1, [coeffq+256*1]
+ mova m2, [coeffq+256*2]
+ mova m3, [coeffq+256*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [coeffq+128*1]
+ mova m1, [coeffq+128*3]
+ mova m2, [coeffq+128*5]
+ mova m3, [coeffq+128*7]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+
+.pass1_end:
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+
+.pass2:
+ mov coeffq, [rsp+gprsize*2+16*35]
+ mov r3d, 4
+ lea tx2q, [o(.pass2_end)]
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*35], r3d
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3
+
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*12]
+ mova m2, [coeffq+16*20]
+ mova m3, [coeffq+16*28]
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*13]
+ mova m6, [coeffq+16*21]
+ mova m7, [coeffq+16*29]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov eobd, [rsp+gprsize*1+16*35]
+ test eobd, eobd
+ jl .fast1
+
+.full1:
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ mova m4, [coeffq+16*2 ]
+ mova m5, [coeffq+16*18]
+ mova m6, [coeffq+16*3 ]
+ mova m7, [coeffq+16*19]
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*22]
+ mova m3, [coeffq+16*30]
+ mova m4, [coeffq+16*7 ]
+ mova m5, [coeffq+16*15]
+ mova m6, [coeffq+16*23]
+ mova m7, [coeffq+16*31]
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp tx2q
+
+.fast1:
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp tx2q
+
+.pass2_end:
+ lea r3, [o(.pass2_end1)]
+ jmp m(idct_8x32_internal_8bpc).end
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)]
+ add coeffq, 16*32
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov r3d, [rsp+gprsize*3+16*35]
+ dec r3d
+ jg .pass2_loop
+
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ cmp eobd, 136
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*0+16*3], r4
+ mov [rsp+gprsize*1+16*3], r3d
+ mov [rsp+gprsize*2+16*3], r3d
+ mov [rsp+gprsize*3+16*3], coeffq
+
+.loop:
+ LOAD_8ROWS coeffq, 64
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ pmulhrsw m7, [o(pw_8192)]
+ mova [rsp+16*0], m7
+ mova m7, [o(pw_8192)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+16*1], m6
+ mova [rsp+16*2], m5
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+
+ add coeffq, 16
+ dec r3d
+ jg .loop
+
+ mov r4d, [rsp+gprsize*2+16*3]
+ dec r4d
+ jle .ret
+
+ mov dstq, [rsp+gprsize*0+16*3]
+ mov coeffq, [rsp+gprsize*3+16*3]
+ mov [rsp+gprsize*2+16*3], r4
+ lea r3, [dstq+8]
+ add coeffq, 64*8
+ mov [rsp+gprsize*0+16*3], r3
+ mov r3d, [rsp+gprsize*1+16*3]
+ mov [rsp+gprsize*3+16*3], coeffq
+ jmp .loop
+
+.ret:
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_16x64_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r2d, 32
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 151
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
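+                                         ;r3d = 2 pass-1 iterations if eob < 151, else 4; the saved (eob-151) sign selects the fast path in pass 2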
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*0, 64*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*1, 64*2
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+ mov coeffq, [rsp+gprsize*2+16*67]
+ mov r3d, 2
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)]
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*67], r3d
+ mov eobd, [rsp+gprsize*1+16*67]
+
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ pxor m4, m4
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+
+ test eobd, eobd
+ jl .fast
+
+.full:
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+ mova m2, [coeffq+16*18]
+ mova m3, [coeffq+16*19]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call .main
+ jmp .end
+
+.fast:
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+
+ call m(idct_8x32_internal_8bpc).main_veryfast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ call .main_fast
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov r3, r4
+ jmp m(idct_8x32_internal_8bpc).end2
+
+.end1:
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ lea dstq, [dstq+strideq*2]
+ lea r3, [rsp+16*32+gprsize]
+ call .write
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, [rsp+gprsize*3+16*67]
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)]
+
+ dec r3d
+ jg .pass2_loop
+ ret
+.write:
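+                                         ;zero the next 16*32 bytes of coefficients (advancing coeffq past them), then write four 8-row groups from the buffer at r3 to dst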
+ mova [r3+16*0], m7
+ mov r4, -16*32
+ pxor m7, m7
+ sub coeffq, r4
+.zero_loop:
+ mova [coeffq+r4+16*0], m7
+ mova [coeffq+r4+16*1], m7
+ add r4, 16*2
+ jl .zero_loop
+ call .write_main2
+ LOAD_8ROWS r3+16*11, 16
+ call .write_main
+ LOAD_8ROWS r3+16*19, 16
+ call .write_main
+ LOAD_8ROWS r3+16*27, 16
+.write_main:
+ mova [r3+16*0], m7
+.write_main2:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [r3+16*0]
+ mova [r3+16*2], m5
+ mova [r3+16*1], m6
+ mova [r3+16*0], m7
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ ret
+
+
+ALIGN function_align
+cglobal_label .main_fast
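+                                         ;odd half of the 64-point IDCT, assuming only in1-in15 of the odd inputs are non-zero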
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63
+ pmulhrsw m0, [o(pw_101x8)] ;t32,t33
+ mova m7, [o(pd_2048)]
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*36], m3 ;t33a
+ mova [rsp+gprsize*2+16*65], m0 ;t62a
+
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61
+ pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35
+ mova [rsp+gprsize*2+16*38], m1 ;t35
+ mova [rsp+gprsize*2+16*63], m2 ;t60
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m2 ;t34a
+ mova [rsp+gprsize*2+16*64], m1 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59
+ pmulhrsw m0, [o(pw_897x8)] ;t36,t37
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*40], m3 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57
+ pmulhrsw m1, [o(pw_m700x8)] ;t38,t39
+ mova [rsp+gprsize*2+16*42], m1 ;t39
+ mova [rsp+gprsize*2+16*59], m2 ;t56
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m2 ;t38a
+ mova [rsp+gprsize*2+16*60], m1 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55
+ pmulhrsw m0, [o(pw_501x8)] ;t40,t41
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*44], m3 ;t41a
+ mova [rsp+gprsize*2+16*57], m0 ;t54a
+
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53
+ pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43
+ mova [rsp+gprsize*2+16*46], m1 ;t43
+ mova [rsp+gprsize*2+16*55], m2 ;t52
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m2 ;t42a
+ mova [rsp+gprsize*2+16*56], m1 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51
+ pmulhrsw m0, [o(pw_1285x8)] ;t44,t45
+ mova m6, m0
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m3 ;t45a
+ mova [rsp+gprsize*2+16*53], m0 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49
+ pmulhrsw m0, [o(pw_m301x8)] ;t46,t47
+ mova m4, m3
+ mova m5, m0
+
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main
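+                                         ;odd half of the 64-point IDCT, full version: all odd inputs in1-in31 are used (coefficients beyond the first 32 are never coded for 64-point transforms)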
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ mova m1, [rsp+gprsize*2+16*65] ;in31
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a
+ pmulhrsw m0, [o(pw_101x8)] ;t32a
+ pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a
+ pmulhrsw m1, [o(pw_m2824x8)] ;t33a
+ mova m7, [o(pd_2048)]
+ psubsw m4, m0, m1 ;t33
+ paddsw m0, m1 ;t32
+ psubsw m5, m3, m2 ;t62
+ paddsw m3, m2 ;t63
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*36], m5 ;t33a
+ mova [rsp+gprsize*2+16*65], m4 ;t62a
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+
+ mova m0, [rsp+gprsize*2+16*63] ;in17
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a
+ pmulhrsw m0, [o(pw_1660x8)] ;t34a
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a
+ pmulhrsw m1, [o(pw_m1474x8)] ;t35a
+ psubsw m4, m1, m0 ;t34
+ paddsw m0, m1 ;t35
+ psubsw m5, m2, m3 ;t61
+ paddsw m3, m2 ;t60
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m5 ;t34a
+ mova [rsp+gprsize*2+16*38], m0 ;t35
+ mova [rsp+gprsize*2+16*63], m3 ;t60
+ mova [rsp+gprsize*2+16*64], m4 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ mova m1, [rsp+gprsize*2+16*61] ;in23
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a
+ pmulhrsw m0, [o(pw_897x8)] ;t36a
+ pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a
+ pmulhrsw m1, [o(pw_m2191x8)] ;t37a
+ psubsw m4, m0, m1 ;t37
+ paddsw m0, m1 ;t36
+ psubsw m5, m3, m2 ;t58
+ paddsw m3, m2 ;t59
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+
+ mova m0, [rsp+gprsize*2+16*59] ;in25
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a
+ pmulhrsw m0, [o(pw_2359x8)] ;t38a
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a
+ pmulhrsw m1, [o(pw_m700x8)] ;t39a
+ psubsw m4, m1, m0 ;t38
+ paddsw m0, m1 ;t39
+ psubsw m5, m2, m3 ;t57
+ paddsw m3, m2 ;t56
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m5 ;t38a
+ mova [rsp+gprsize*2+16*42], m0 ;t39
+ mova [rsp+gprsize*2+16*59], m3 ;t56
+ mova [rsp+gprsize*2+16*60], m4 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ mova m1, [rsp+gprsize*2+16*57] ;in27
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a
+ pmulhrsw m0, [o(pw_501x8)] ;t40a
+ pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a
+ pmulhrsw m1, [o(pw_m2520x8)] ;t41a
+ psubsw m4, m0, m1 ;t41
+ paddsw m0, m1 ;t40
+ psubsw m5, m3, m2 ;t54
+ paddsw m3, m2 ;t55
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*44], m5 ;t41a
+ mova [rsp+gprsize*2+16*57], m4 ;t54a
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+
+ mova m0, [rsp+gprsize*2+16*55] ;in21
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a
+ pmulhrsw m0, [o(pw_2019x8)] ;t42a
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a
+ pmulhrsw m1, [o(pw_m1092x8)] ;t43a
+ psubsw m4, m1, m0 ;t42
+ paddsw m0, m1 ;t43
+ psubsw m5, m2, m3 ;t53
+ paddsw m3, m2 ;t52
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*46], m0 ;t43
+ mova [rsp+gprsize*2+16*55], m3 ;t52
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ mova m1, [rsp+gprsize*2+16*53] ;in19
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a
+ pmulhrsw m0, [o(pw_1285x8)] ;t44a
+ pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a
+ pmulhrsw m1, [o(pw_m1842x8)] ;t45a
+ psubsw m4, m0, m1 ;t45
+ paddsw m0, m1 ;t44
+ psubsw m5, m3, m2 ;t50
+ paddsw m3, m2 ;t51
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova m6, m0
+ mova [rsp+gprsize*2+16*48], m5 ;t45a
+ mova [rsp+gprsize*2+16*53], m4 ;t50a
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+
+ mova m0, [rsp+gprsize*2+16*51] ;in29
+ mova m1, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a
+ pmulhrsw m0, [o(pw_2675x8)] ;t46a
+ pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a
+ pmulhrsw m1, [o(pw_m301x8)] ;t47a
+ psubsw m5, m1, m0 ;t46
+ paddsw m0, m1 ;t47
+ psubsw m4, m2, m3 ;t49
+ paddsw m3, m2 ;t48
+
+ALIGN function_align
+.main2:
+ ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m2, m0, m6 ;t44a
+ paddsw m0, m6 ;t47a
+ psubsw m6, m3, m1 ;t51a
+ paddsw m3, m1 ;t48a
+ mova [rsp+gprsize*2+16*50], m0 ;t47a
+ mova [rsp+gprsize*2+16*51], m3 ;t48a
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51
+ mova [rsp+gprsize*2+16*47], m6 ;t44
+ mova [rsp+gprsize*2+16*54], m2 ;t51
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m3, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m2, m4, m0 ;t45
+ paddsw m4, m0 ;t46
+ psubsw m6, m5, m3 ;t50
+ paddsw m5, m3 ;t49
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m6 ;t45a
+ mova [rsp+gprsize*2+16*49], m4 ;t46
+ mova [rsp+gprsize*2+16*52], m5 ;t49
+ mova [rsp+gprsize*2+16*53], m2 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*43] ;t40
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*58] ;t55
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t40a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t55a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52
+ mova [rsp+gprsize*2+16*43], m0 ;t40a
+ mova [rsp+gprsize*2+16*46], m5 ;t43
+ mova [rsp+gprsize*2+16*55], m4 ;t52
+ mova [rsp+gprsize*2+16*58], m1 ;t55a
+
+ mova m0, [rsp+gprsize*2+16*44] ;t41a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*57] ;t54a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t41
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t54
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a
+ mova [rsp+gprsize*2+16*44], m0 ;t41
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*57], m1 ;t54
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38a
+ mova m2, [rsp+gprsize*2+16*40] ;t37a
+ mova m3, [rsp+gprsize*2+16*61] ;t58a
+ mova m1, [rsp+gprsize*2+16*60] ;t57a
+ psubsw m4, m0, m2 ;t37
+ paddsw m0, m2 ;t38
+ psubsw m5, m1, m3 ;t58
+ paddsw m1, m3 ;t57
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a
+ mova [rsp+gprsize*2+16*41], m0 ;t38
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*60], m1 ;t57
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*59] ;t56
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t39a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t56a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59
+ mova [rsp+gprsize*2+16*42], m0 ;t39a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*59], m1 ;t56a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m2, [rsp+gprsize*2+16*38] ;t35
+ mova m3, [rsp+gprsize*2+16*63] ;t60
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ psubsw m4, m0, m2 ;t35a
+ paddsw m0, m2 ;t32a
+ psubsw m5, m1, m3 ;t60a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60
+ mova [rsp+gprsize*2+16*35], m0 ;t32a
+ mova [rsp+gprsize*2+16*38], m5 ;t35
+ mova [rsp+gprsize*2+16*63], m4 ;t60
+ mova [rsp+gprsize*2+16*66], m1 ;t63a
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m2, [rsp+gprsize*2+16*37] ;t34a
+ mova m3, [rsp+gprsize*2+16*64] ;t61a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ psubsw m4, m0, m2 ;t34
+ paddsw m0, m2 ;t33
+ psubsw m5, m1, m3 ;t61
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a
+
+ mova m2, [rsp+gprsize*2+16*41] ;t38
+ mova m3, [rsp+gprsize*2+16*60] ;t57
+ psubsw m6, m0, m2 ;t38a
+ paddsw m0, m2 ;t33a
+ psubsw m2, m1, m3 ;t57a
+ paddsw m1, m3 ;t62a
+ mova [rsp+gprsize*2+16*36], m0 ;t33a
+ mova [rsp+gprsize*2+16*65], m1 ;t62a
+ ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57
+ mova [rsp+gprsize*2+16*41], m2 ;t38
+ mova [rsp+gprsize*2+16*60], m6 ;t57
+
+ mova m2, [rsp+gprsize*2+16*40] ;t37
+ mova m3, [rsp+gprsize*2+16*61] ;t58
+ psubsw m0, m5, m2 ;t37
+ paddsw m5, m2 ;t34
+ psubsw m1, m4, m3 ;t58
+ paddsw m4, m3 ;t61
+ ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a
+ mova [rsp+gprsize*2+16*37], m5 ;t34
+ mova [rsp+gprsize*2+16*64], m4 ;t61
+ mova [rsp+gprsize*2+16*40], m1 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m0, [rsp+gprsize*2+16*38] ;t35
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*63] ;t60
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t35a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t60a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59
+ mova [rsp+gprsize*2+16*38], m0 ;t35a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*63], m1 ;t60a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32a
+ mova m2, [rsp+gprsize*2+16*42] ;t39a
+ mova m3, [rsp+gprsize*2+16*59] ;t56a
+ mova m1, [rsp+gprsize*2+16*66] ;t63a
+ psubsw m4, m0, m2 ;t39
+ paddsw m0, m2 ;t32
+ psubsw m5, m1, m3 ;t56
+ paddsw m1, m3 ;t63
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*42], m5 ;t39a
+ mova [rsp+gprsize*2+16*59], m4 ;t56a
+ mova [rsp+gprsize*2+16*66], m1 ;t63
+
+ mova m0, [rsp+gprsize*2+16*50] ;t47a
+ mova m2, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*51] ;t48a
+ psubsw m4, m0, m2 ;t40
+ paddsw m0, m2 ;t47
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t48
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a
+ mova [rsp+gprsize*2+16*50], m0 ;t47
+ mova [rsp+gprsize*2+16*43], m5 ;t40a
+ mova [rsp+gprsize*2+16*58], m4 ;t55a
+ mova [rsp+gprsize*2+16*51], m1 ;t48
+
+ mova m0, [rsp+gprsize*2+16*49] ;t46
+ mova m2, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*52] ;t49
+ psubsw m4, m0, m2 ;t41a
+ paddsw m0, m2 ;t46a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t49a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54
+ mova [rsp+gprsize*2+16*49], m0 ;t46a
+ mova [rsp+gprsize*2+16*44], m5 ;t41
+ mova [rsp+gprsize*2+16*57], m4 ;t54
+ mova [rsp+gprsize*2+16*52], m1 ;t49a
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t45
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t50
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a
+ mova [rsp+gprsize*2+16*48], m0 ;t45
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*53], m1 ;t50
+
+ mova m0, [rsp+gprsize*2+16*47] ;t44
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t44a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t51a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52
+
+ mova m2, [rsp+gprsize*2+16*38] ;t35a
+ mova m3, [rsp+gprsize*2+16*31] ;tmp[28]
+ psubsw m6, m2, m0 ;t44
+ paddsw m2, m0 ;t35
+ psubsw m0, m3, m2 ;out35
+ paddsw m2, m3 ;out28
+ mova m3, [rsp+gprsize*2+16*63] ;t60a
+ mova [rsp+gprsize*2+16*38], m0 ;out35
+ mova [rsp+gprsize*2+16*31], m2 ;out28
+ psubsw m0, m3, m1 ;t51
+ paddsw m3, m1 ;t60
+ ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
+ psubsw m1, m2, m3 ;out60
+ paddsw m2, m3 ;out3
+ mova m3, [rsp+gprsize*2+16*22] ;tmp[19]
+ mova [rsp+gprsize*2+16*63], m1 ;out60
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m1, m3, m0 ;out44
+ paddsw m3, m0 ;out19
+ mova m2, [rsp+gprsize*2+16*15] ;tmp[12]
+
+ mova m0, [rsp+gprsize*2+16*39] ;t36
+ mova [rsp+gprsize*2+16*47], m1 ;out44
+ mova [rsp+gprsize*2+16*22], m3 ;out19
+ mova m1, [rsp+gprsize*2+16*62] ;t59
+ psubsw m3, m2, m6 ;out51
+ paddsw m2, m6 ;out12
+ mova [rsp+gprsize*2+16*54], m3 ;out51
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m2, m0, m5 ;t43a
+ paddsw m0, m5 ;t36a
+ mova m5, [rsp+gprsize*2+16*30] ;tmp[27]
+ psubsw m3, m1, m4 ;t52a
+ paddsw m1, m4 ;t59a
+ ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52
+ mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ]
+ psubsw m6, m5, m0 ;out36
+ paddsw m5, m0 ;out27
+ psubsw m0, m4, m1 ;out59
+ paddsw m4, m1 ;out4
+ mova [rsp+gprsize*2+16*39], m6 ;out36
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*62], m0 ;out59
+ mova [rsp+gprsize*2+16*7 ], m4 ;out4
+ mova m0, [rsp+gprsize*2+16*23] ;tmp[20]
+ mova m5, [rsp+gprsize*2+16*14] ;tmp[11]
+ psubsw m4, m0, m3 ;out43
+ paddsw m0, m3 ;out20
+ psubsw m6, m5, m2 ;out52
+ paddsw m5, m2 ;out11
+ mova [rsp+gprsize*2+16*46], m4 ;out43
+ mova [rsp+gprsize*2+16*23], m0 ;out20
+ mova [rsp+gprsize*2+16*55], m6 ;out52
+ mova [rsp+gprsize*2+16*14], m5 ;out11
+
+ mova m0, [rsp+gprsize*2+16*40] ;t37a
+ mova m5, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*61] ;t58a
+ mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
+ psubsw m4, m0, m5 ;t42
+ paddsw m0, m5 ;t37
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t58
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52
+ mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ]
+ psubsw m6, m2, m0 ;out37
+ paddsw m2, m0 ;out26
+ psubsw m0, m3, m1 ;out58
+ paddsw m3, m1 ;out5
+ mova [rsp+gprsize*2+16*40], m6 ;out37
+ mova [rsp+gprsize*2+16*29], m2 ;out26
+ mova [rsp+gprsize*2+16*61], m0 ;out58
+ mova [rsp+gprsize*2+16*8 ], m3 ;out5
+ mova m0, [rsp+gprsize*2+16*24] ;tmp[21]
+ mova m1, [rsp+gprsize*2+16*13] ;tmp[10]
+ psubsw m2, m0, m5 ;out42
+ paddsw m0, m5 ;out21
+ psubsw m3, m1, m4 ;out53
+ paddsw m1, m4 ;out10
+ mova [rsp+gprsize*2+16*45], m2 ;out42
+ mova [rsp+gprsize*2+16*24], m0 ;out21
+ mova [rsp+gprsize*2+16*56], m3 ;out53
+ mova [rsp+gprsize*2+16*13], m1 ;out10
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38
+ mova m5, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*60] ;t57
+ mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
+ psubsw m4, m0, m5 ;t41a
+ paddsw m0, m5 ;t38a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t57a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a
+ mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ]
+ psubsw m6, m2, m0 ;out38
+ paddsw m2, m0 ;out25
+ psubsw m0, m3, m1 ;out57
+ paddsw m3, m1 ;out6
+ mova [rsp+gprsize*2+16*41], m6 ;out38
+ mova [rsp+gprsize*2+16*28], m2 ;out25
+ mova [rsp+gprsize*2+16*60], m0 ;out57
+ mova [rsp+gprsize*2+16*9 ], m3 ;out6
+ mova m0, [rsp+gprsize*2+16*25] ;tmp[22]
+ mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ]
+ psubsw m2, m0, m5 ;out41
+ paddsw m0, m5 ;out22
+ psubsw m3, m1, m4 ;out54
+ paddsw m1, m4 ;out9
+ mova [rsp+gprsize*2+16*44], m2 ;out41
+ mova [rsp+gprsize*2+16*25], m0 ;out22
+ mova [rsp+gprsize*2+16*57], m3 ;out54
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39a
+ mova m5, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*59] ;t56a
+ mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
+ psubsw m4, m0, m5 ;t40
+ paddsw m0, m5 ;t39
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t56
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a
+ mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ]
+ psubsw m6, m2, m0 ;out39
+ paddsw m2, m0 ;out24
+ psubsw m0, m3, m1 ;out56
+ paddsw m3, m1 ;out7
+ mova [rsp+gprsize*2+16*42], m6 ;out39
+ mova [rsp+gprsize*2+16*27], m2 ;out24
+ mova [rsp+gprsize*2+16*59], m0 ;out56
+ mova [rsp+gprsize*2+16*10], m3 ;out7
+ mova m0, [rsp+gprsize*2+16*26] ;tmp[23]
+ mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ]
+ psubsw m2, m0, m5 ;out40
+ paddsw m0, m5 ;out23
+ psubsw m3, m1, m4 ;out55
+ paddsw m1, m4 ;out8
+ mova [rsp+gprsize*2+16*43], m2 ;out40
+ mova [rsp+gprsize*2+16*26], m0 ;out23
+ mova [rsp+gprsize*2+16*58], m3 ;out55
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+
+ mova m0, [rsp+gprsize*2+16*37] ;t34
+ mova m5, [rsp+gprsize*2+16*48] ;t45
+ mova m3, [rsp+gprsize*2+16*53] ;t50
+ mova m1, [rsp+gprsize*2+16*64] ;t61
+ mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
+ psubsw m4, m0, m5 ;t45a
+ paddsw m0, m5 ;t34a
+ psubsw m5, m1, m3 ;t50a
+ paddsw m1, m3 ;t61a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50
+ mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ]
+ psubsw m6, m2, m0 ;out34
+ paddsw m2, m0 ;out29
+ psubsw m0, m3, m1 ;out61
+ paddsw m3, m1 ;out2
+ mova [rsp+gprsize*2+16*37], m6 ;out34
+ mova [rsp+gprsize*2+16*32], m2 ;out29
+ mova [rsp+gprsize*2+16*64], m0 ;out61
+ mova [rsp+gprsize*2+16*5 ], m3 ;out2
+ mova m0, [rsp+gprsize*2+16*21] ;tmp[18]
+ mova m1, [rsp+gprsize*2+16*16] ;tmp[13]
+ psubsw m2, m0, m5 ;out45
+ paddsw m0, m5 ;out18
+ psubsw m3, m1, m4 ;out50
+ paddsw m1, m4 ;out13
+ mova [rsp+gprsize*2+16*48], m2 ;out45
+ mova [rsp+gprsize*2+16*21], m0 ;out18
+ mova [rsp+gprsize*2+16*53], m3 ;out50
+ mova [rsp+gprsize*2+16*16], m1 ;out13
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m5, [rsp+gprsize*2+16*49] ;t46a
+ mova m3, [rsp+gprsize*2+16*52] ;t49a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
+ psubsw m4, m0, m5 ;t46
+ paddsw m0, m5 ;t33
+ psubsw m5, m1, m3 ;t49
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t46a, t49a
+ mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ]
+ psubsw m6, m2, m0 ;out33
+ paddsw m2, m0 ;out30
+ psubsw m0, m3, m1 ;out62
+ paddsw m3, m1 ;out1
+ mova [rsp+gprsize*2+16*36], m6 ;out33
+ mova [rsp+gprsize*2+16*33], m2 ;out30
+ mova [rsp+gprsize*2+16*65], m0 ;out62
+ mova [rsp+gprsize*2+16*4 ], m3 ;out1
+ mova m0, [rsp+gprsize*2+16*20] ;tmp[17]
+ mova m1, [rsp+gprsize*2+16*17] ;tmp[14]
+ psubsw m2, m0, m5 ;out46
+ paddsw m0, m5 ;out17
+ psubsw m3, m1, m4 ;out49
+ paddsw m1, m4 ;out14
+ mova [rsp+gprsize*2+16*49], m2 ;out46
+ mova [rsp+gprsize*2+16*20], m0 ;out17
+ mova [rsp+gprsize*2+16*52], m3 ;out49
+ mova [rsp+gprsize*2+16*17], m1 ;out14
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m5, [rsp+gprsize*2+16*50] ;t47
+ mova m3, [rsp+gprsize*2+16*51] ;t48
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
+ psubsw m4, m0, m5 ;t47a
+ paddsw m0, m5 ;t32a
+ psubsw m5, m1, m3 ;t48a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48
+ mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ]
+ psubsw m6, m2, m0 ;out32
+ paddsw m2, m0 ;out31
+ psubsw m0, m3, m1 ;out63
+ paddsw m3, m1 ;out0
+ mova [rsp+gprsize*2+16*35], m6 ;out32
+ mova [rsp+gprsize*2+16*34], m2 ;out31
+ mova [rsp+gprsize*2+16*66], m0 ;out63
+ mova [rsp+gprsize*2+16*3 ], m3 ;out0
+ mova m0, [rsp+gprsize*2+16*19] ;tmp[16]
+ mova m1, [rsp+gprsize*2+16*18] ;tmp[15]
+ psubsw m2, m0, m5 ;out47
+ paddsw m0, m5 ;out16
+ psubsw m3, m1, m4 ;out48
+ paddsw m1, m4 ;out15
+ mova [rsp+gprsize*2+16*50], m2 ;out47
+ mova [rsp+gprsize*2+16*19], m0 ;out16
+ mova [rsp+gprsize*2+16*51], m3 ;out48
+ mova [rsp+gprsize*2+16*18], m1 ;out15
+ ret
+
+
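Almost every block in the routine above repeats the same two-step pattern: a saturating butterfly (paddsw/psubsw) followed by ITX_MULSUB_2W with both coefficients equal to 2896, i.e. a rotation by 2896/4096 ≈ 1/√2. A scalar C sketch of that step, shown for the t44/t51 pair; the function names and the 12-bit rounding assumed here are illustrative, not dav1d's reference code:

#include <stdint.h>

/* clamp an intermediate to the int16_t range, as paddsw/psubsw do per lane */
static int16_t sat16(int v)
{
    return (int16_t)(v > 32767 ? 32767 : v < -32768 ? -32768 : v);
}

/* the paddsw/psubsw butterfly on one pair of lanes */
static void butterfly(int16_t *a, int16_t *b)
{
    const int x = *a, y = *b;
    *a = sat16(x + y);
    *b = sat16(x - y);
}

/* the ITX_MULSUB_2W step with both coefficients 2896:
 * (t44, t51) -> (t44a, t51a) = ((t51 - t44)/sqrt(2), (t51 + t44)/sqrt(2)),
 * assuming the usual +2048, >>12 rounding of the 12-bit coefficient path */
static void rotate_half_sqrt2(int16_t *t44, int16_t *t51)
{
    const int d = *t51 - *t44, s = *t51 + *t44;
    *t44 = sat16((d * 2896 + 2048) >> 12);  /* t44a */
    *t51 = sat16((s * 2896 + 2048) >> 12);  /* t51a */
}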
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_64x16_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 16
+ lea tx2q, [o(.end)]
+
+.body:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000
+ punpcklwd m0, m0
+ pxor m7, m7
+
+.loop:
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ mova m5, [dstq+16*2]
+ mova m6, [dstq+16*3]
+ punpckhbw m2, m1, m7
+ punpcklbw m1, m7
+ punpckhbw m4, m3, m7
+ punpcklbw m3, m7
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ punpckhbw m2, m5, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m6, m7
+ punpcklbw m6, m7
+ paddw m2, m0
+ paddw m5, m0
+ paddw m4, m0
+ paddw m6, m0
+ packuswb m5, m2
+ packuswb m6, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ mova [dstq+16*2], m5
+ mova [dstq+16*3], m6
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ jmp tx2q
+
+.end:
+ RET
+
+
+%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
+
+%if %3
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [%1+%2*0]
+ pmulhrsw m1, m3, [%1+%2*1]
+ pmulhrsw m2, m3, [%1+%2*2]
+ pmulhrsw m3, [%1+%2*3]
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+%endif
+%endmacro
+
+%macro LOAD_4ROWS_H 2 ;src, stride
+ mova m4, [%1+%2*0]
+ mova m5, [%1+%2*1]
+ mova m6, [%1+%2*2]
+ mova m7, [%1+%2*3]
+%endmacro
+
+cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r3d, 2
+ mov [rsp+gprsize*2+16*67], dstq
+ lea dstq, [rsp+gprsize+16*68]
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+32*0, 32*8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+32*4, 32*8
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+32*2, 32*4
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+32*1, 32*2
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+32*17, 32*2
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+32*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+32*24, 32
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+32*0, 32
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+32*24, 32
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov dstq, [rsp+gprsize*2+16*67]
+ sub coeffq, 32
+ mov r3d, 4
+
+.pass2_loop:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea tx2q, [o(.end)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop
+
+ mov r3d, 4
+ lea coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end3)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3:
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop2
+ ret
+
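For orientation, the .pass1_loop/.pass2 labels above follow the usual separable layout: pass 1 runs the 1-D transform down the columns into a scratch buffer on the stack (dstq is temporarily repointed at rsp+gprsize+16*68), and pass 2 runs it across the rows of that buffer, adding the clipped result to the real destination. A rough generic C sketch of such a driver; the sizes, names and the final rounding shift are placeholders, not dav1d's code:

#include <stddef.h>
#include <stdint.h>

typedef void (*itx_1d_fn)(int32_t *buf);  /* in-place 1-D inverse transform */

/* generic column-then-row driver, limited to 64x64 for this sketch */
static void inv_txfm_add_2d(uint8_t *dst, ptrdiff_t stride, const int32_t *coeff,
                            int w, int h, itx_1d_fn col_txfm, itx_1d_fn row_txfm)
{
    int32_t scratch[64 * 64], col[64];

    for (int x = 0; x < w; x++) {            /* pass 1: columns -> scratch */
        for (int y = 0; y < h; y++) col[y] = coeff[y * w + x];
        col_txfm(col);
        for (int y = 0; y < h; y++) scratch[y * w + x] = col[y];
    }
    for (int y = 0; y < h; y++) {            /* pass 2: rows, then add + clip */
        row_txfm(&scratch[y * w]);
        for (int x = 0; x < w; x++) {
            const int px = dst[x] + ((scratch[y * w + x] + 8) >> 4);
            dst[x] = (uint8_t)(px < 0 ? 0 : px > 255 ? 255 : px);
        }
        dst += stride;
    }
}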
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_32x64_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r3d, 64
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*1, 64*2, 1
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*67]
+ test tx2d, tx2d
+ jl .fast
+
+.full:
+ LOAD_8ROWS coeffq+64*0, 64*4, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*2, 64*4, 1
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp .pass1_end
+
+.fast:
+ LOAD_4ROWS coeffq, 256, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+128*1, 256, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+
+.pass1_end:
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64
+
+ add coeffq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov coeffq, [rsp+gprsize*2+16*67]
+ mov r3d, 4
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+ call m(idct_64x32_internal_8bpc)
+.end:
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd
+ mov r3d, 32
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+
+cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 136
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, 4
+ cmovs r3d, r4d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq
+ mov [rsp+gprsize*3+16*67], dstq
+ lea dstq, [rsp+gprsize+16*69]
+ mov [rsp+gprsize*4+16*67], dstq
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+64*0, 64*8, 1
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8, 1
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*2, 64*4, 1
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+64*1, 64*2, 1
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov coeffq, [rsp+gprsize*4+16*67]
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov eobd, [rsp+gprsize*1+16*67]
+ lea dstq, [dstq+32]
+ mov [rsp+gprsize*1+16*35], eobd
+ lea tx2q, [o(.pass2_end)]
+ mov r3d, 4
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end:
+ mova [rsp+gprsize+16*0], m7
+ lea r3, [o(.pass2_end1)]
+ jmp m(idct_8x32_internal_8bpc).end2
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)]
+ add coeffq, 16*32
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov r3d, [rsp+gprsize*3+16*35]
+ dec r3d
+ jg m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end2:
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov coeffq, [rsp+gprsize*2+16*67]
+ lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+ mov r3d, 4
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ call m(idct_64x64_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd
+ mov r3d, 64
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r5d, 4
+ mov r4d, 2
+ sub eobd, 136
+ cmovns r4d, r5d
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*1+16*67], eobd
+ mov r3d, r4d
+ mov [rsp+gprsize*4+16*67], coeffq
+ mov [rsp+gprsize*3+16*67], dstq
+ lea dstq, [rsp+gprsize+16*69]
+ mov [rsp+gprsize*2+16*67], dstq
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+64*0, 64*8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*2, 64*4
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+64*1, 64*2
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64
+
+ add coeffq, 16
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+
+.pass2:
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov coeffq, [rsp+gprsize*2+16*67]
+ lea dstq, [dstq+32]
+ mov r3d, 4
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.pass2_end)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end:
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ lea dstq, [dstq+strideq*2]
+ lea r3, [rsp+16*32+gprsize]
+ mova [rsp+gprsize+16*0], m7
+ call m(idct_16x64_internal_8bpc).write
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, [rsp+gprsize*3+16*67]
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.pass2_end)]
+
+ dec r3d
+ jg m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end2:
+ mov coeffq, [rsp+gprsize*4+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ mov r3d, 4
+ sub dstq, 72
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
diff --git a/third_party/dav1d/src/x86/loopfilter.h b/third_party/dav1d/src/x86/loopfilter.h
new file mode 100644
index 0000000000..33c842a9ce
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+#define decl_loopfilter_sb_fns(ext) \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext))
+
+decl_loopfilter_sb_fns(ssse3);
+decl_loopfilter_sb_fns(avx2);
+decl_loopfilter_sb_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
+#endif
+}
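Each "if (!(flags & ...)) return;" check sits before the next tier's assignments, so the table ends up pointing at the highest ISA level the CPU actually reports, with the AVX2 and AVX-512 entries additionally gated behind ARCH_X86_64. A hedged sketch of how such an initializer is typically invoked from a bitdepth-templated DSP init; the caller name and the HAVE_ASM/ARCH_X86 guards are assumptions for illustration, not code from this patch:

/* Assumed wiring, not taken from dav1d source. */
#include "src/x86/loopfilter.h"

static void loop_filter_dsp_init_example(Dav1dLoopFilterDSPContext *const c) {
    /* 1. point every loop_filter_sb entry at the portable C implementation */
    /* 2. let the arch-specific initializer override what the CPU supports: */
#if HAVE_ASM && ARCH_X86
    loop_filter_dsp_init_x86(c);
#endif
}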
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
new file mode 100644
index 0000000000..ed83000ac2
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm
@@ -0,0 +1,1161 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+ times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 16 dw 1
+pw_2: times 16 dw 2
+pw_3: times 16 dw 3
+pw_4096: times 2 dw 4096
+
+; 10bpc/12bpc:
+pw_4: times 2 dw 4
+ times 2 dw 16
+clip_max: times 2 dw 511
+ times 2 dw 2047
+clip_min: times 2 dw -512
+ times 2 dw -2048
+
+SECTION .text
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
+
+; in: out:
+; xmm%1 a b c d e f g h a i q y 6 E M U
+; xmm%2 i j k l m n o p b j r z 7 F N V
+; xmm%3 q r s t u v w x c k s 0 8 G O W
+; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; xmm%6 E F G H I J K L f n v 3 B J R Z
+; xmm%7 M N O P Q R S T g o w 4 C K S +
+; xmm%8 U V W X Y Z + = h p x 5 D L T =
+%macro TRANSPOSE8X8W 9
+ ; xmm%1 a b c d e f g h a i q y b j r z
+ ; xmm%2 i j k l m n o p c k s 0 d l t 1
+ ; xmm%3 q r s t u v w x -> e m u 2 f n v 3
+ ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; xmm%6 E F G H I J K L 8 G O W 9 H P X
+ ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; xmm%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; xmm%1 a i q y b j r z a i q y 6 E M U
+ ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; xmm%7 A I Q Y B J R Z g o w 4 C K S +
+ ; xmm%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+
+; transpose and write m3-6, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x16 0
+ ; transpose 8x4
+ punpcklwd m0, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpckldq m6, m0, m4
+ punpckhdq m0, m4
+ punpckldq m4, m3, m5
+ punpckhdq m3, m5
+
+ ; write out
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm6, m6, 1
+ vextracti128 xm0, m0, 1
+ vextracti128 xm4, m4, 1
+ vextracti128 xm3, m3, 1
+
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm0
+ movhps [dstq+stride3q -4], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm4
+ movhps [dstq+strideq*1-4], xm4
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+ ; we load p3 later
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+%endif
+%else
+ ; load lines
+%if %1 == 4
+ movq xm3, [dstq+strideq*0-4]
+ movq xm4, [dstq+strideq*1-4]
+ movq xm5, [dstq+strideq*2-4]
+ movq xm6, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq xm11, [tmpq+strideq*0-4]
+ movq xm13, [tmpq+strideq*1-4]
+ movq xm14, [tmpq+strideq*2-4]
+ movq xm15, [tmpq+stride3q -4]
+ lea tmpq, [tmpq+strideq*4]
+ ; this overreads by 8 bytes but the buffers are padded
+ ; so that should be ok
+ vinserti128 m3, [tmpq+strideq*0-4], 1
+ vinserti128 m4, [tmpq+strideq*1-4], 1
+ vinserti128 m5, [tmpq+strideq*2-4], 1
+ vinserti128 m6, [tmpq+stride3q -4], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-4], 1
+ vinserti128 m13, [tmpq+strideq*1-4], 1
+ vinserti128 m14, [tmpq+strideq*2-4], 1
+ vinserti128 m15, [tmpq+stride3q -4], 1
+
+ ; transpose 4x8
+ ; xm3: A-D0,A-D4
+ ; xm4: A-D1,A-D5
+ ; xm5: A-D2,A-D6
+ ; xm6: A-D3,A-D7
+ punpcklwd m7, m3, m4
+ punpcklwd m3, m11, m13
+ punpcklwd m4, m5, m6
+ punpcklwd m5, m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: A4-5,B4-5,C4-5,D4-5
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: A6-7,B6-7,C6-7,D6-7
+ punpckldq m6, m7, m4
+ punpckhdq m7, m4
+ punpckldq m8, m3, m5
+ punpckhdq m5, m3, m5
+ ; xm6: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm8: A4-7,B4-7
+ ; xm5: C4-7,D4-7
+ punpcklqdq m3, m6, m8
+ punpckhqdq m4, m6, m8
+ punpckhqdq m6, m7, m5
+ punpcklqdq m5, m7, m5
+ ; xm3: A0-7
+ ; xm4: B0-7
+ ; xm5: C0-7
+ ; xm6: D0-7
+%elif %1 == 6 || %1 == 8
+ movu xm3, [dstq+strideq*0-8]
+ movu xm4, [dstq+strideq*1-8]
+ movu xm5, [dstq+strideq*2-8]
+ movu xm6, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu xm11, [tmpq+strideq*0-8]
+ movu xm13, [tmpq+strideq*1-8]
+ movu xm14, [tmpq+strideq*2-8]
+ movu xm15, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m3, [tmpq+strideq*0-8], 1
+ vinserti128 m4, [tmpq+strideq*1-8], 1
+ vinserti128 m5, [tmpq+strideq*2-8], 1
+ vinserti128 m6, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m11, [tmpq+strideq*0-8], 1
+ vinserti128 m13, [tmpq+strideq*1-8], 1
+ vinserti128 m14, [tmpq+strideq*2-8], 1
+ vinserti128 m15, [tmpq+stride3q -8], 1
+
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm11: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklwd m7, m3, m4
+ punpckhwd m3, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m11, m13
+ punpckhwd m11, m13
+ punpcklwd m13, m14, m15
+ punpckhwd m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1
+ ; xm3: E0-1,F0-1,G0-1,H0-1
+ ; xm4: A2-3,B2-3,C2-3,D2-3
+ ; xm5: E2-3,F2-3,G2-3,H2-3
+ ; xm6: A4-5,B4-5,C4-5,D4-5
+ ; xm11: E4-5,F4-5,G4-5,H4-5
+ ; xm13: A6-7,B6-7,C6-7,D6-7
+ ; xm14: E6-7,F6-7,G6-7,H6-7
+ punpckldq m15, m7, m4
+ punpckhdq m7, m4
+ punpckldq m9, m3, m5
+ punpckhdq m8, m3, m5
+ punpckldq m3, m6, m13
+ punpckhdq m6, m13
+ punpckldq m10, m11, m14
+ punpckhdq m11, m14
+ ; xm15: A0-3,B0-3
+ ; xm7: C0-3,D0-3
+ ; xm9: E0-3,F0-3
+ ; xm8: G0-3,H0-3
+ ; xm3: A4-7,B4-7
+ ; xm6: C4-7,D4-7
+ ; xm10: E4-7,F4-7
+ ; xm11: G4-7,H4-7
+%if %1 != 6
+ punpcklqdq m0, m15, m3
+%endif
+ punpckhqdq m13, m15, m3
+ punpcklqdq m3, m7, m6
+ punpckhqdq m4, m7, m6
+ punpcklqdq m5, m9, m10
+ punpckhqdq m6, m9, m10
+ punpcklqdq m14, m8, m11
+%if %1 != 6
+ punpckhqdq m15, m8, m11
+ mova [rsp+5*32], m0
+%endif
+%else
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova xm0, [dstq+strideq*0-16]
+ mova xm1, [dstq+strideq*1-16]
+ mova xm2, [dstq+strideq*2-16]
+ mova xm3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova xm4, [tmpq+strideq*0-16]
+ mova xm5, [tmpq+strideq*1-16]
+ mova xm6, [tmpq+strideq*2-16]
+ mova xm7, [tmpq+stride3q -16]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0-16], 1
+ vinserti128 m1, m1, [tmpq+strideq*1-16], 1
+ vinserti128 m2, m2, [tmpq+strideq*2-16], 1
+ vinserti128 m3, m3, [tmpq+stride3q -16], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m4, m4, [tmpq+strideq*0-16], 1
+ vinserti128 m5, m5, [tmpq+strideq*1-16], 1
+ vinserti128 m6, m6, [tmpq+strideq*2-16], 1
+ vinserti128 m7, m7, [tmpq+stride3q -16], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+
+ mova [rsp+6*32], m0
+ mova [rsp+7*32], m1
+ mova [rsp+8*32], m2
+ mova [rsp+9*32], m3
+ mova [rsp+5*32], m4
+
+ mova xm0, [dstq+strideq*0]
+ mova xm1, [dstq+strideq*1]
+ mova xm2, [dstq+strideq*2]
+ mova xm3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova xm8, [tmpq+strideq*0]
+ mova xm9, [tmpq+strideq*1]
+ mova xm10, [tmpq+strideq*2]
+ mova xm11, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, m0, [tmpq+strideq*0], 1
+ vinserti128 m1, m1, [tmpq+strideq*1], 1
+ vinserti128 m2, m2, [tmpq+strideq*2], 1
+ vinserti128 m3, m3, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m8, m8, [tmpq+strideq*0], 1
+ vinserti128 m9, m9, [tmpq+strideq*1], 1
+ vinserti128 m10, m10, [tmpq+strideq*2], 1
+ vinserti128 m11, m11, [tmpq+stride3q ], 1
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4
+
+ mova [rsp+10*32], m8
+ mova [rsp+11*32], m9
+ mova [rsp+12*32], m10
+ mova [rsp+13*32], m11
+
+ ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15
+ SWAP 13, 5, 0
+ SWAP 3, 6, 1, 15
+ SWAP 4, 7
+ SWAP 2, 14
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else
+ vpbroadcastq m0, [lq] ; l0, l1
+ vpbroadcastq m1, [lq+l_strideq] ; l2, l3
+ vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5
+ vpbroadcastq m10, [lq+l_stride3q] ; l6, l7
+ punpckldq m0, m1 ; l0, l2, l1, l3 [2x]
+ punpckldq m2, m10 ; l4, l6, l5, l7 [2x]
+ vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2, l4, l6
+ punpckhbw m0, m2 ; l1, l3, l5, l7
+%endif
+ pcmpeqw m10, m2, m0
+ pand m1, m10
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
+ pcmpeqw m10, m2, m0 ; !L
+ psrlw m10, 1
+ psrlw m2, m0, [lutq+128]
+ vpbroadcastw m1, [lutq+136]
+ pminuw m2, m1
+ pmaxuw m2, [pw_1] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [pw_2]
+ vpbroadcastd m8, [r11]
+ paddw m0, m0
+ paddw m0, m2 ; E
+ REPX {pmullw x, m8}, m0, m1, m2
+
+ psubw m8, m3, m4 ; p1-p0
+ psubw m9, m5, m6 ; q1-q0
+ REPX {pabsw x, x}, m8, m9
+ pmaxuw m8, m10
+ pmaxuw m8, m9
+ pcmpgtw m7, m8, m1 ; hev
+%if %1 != 4
+ psubw m9, m13, m4 ; p2-p0
+ pabsw m9, m9
+ pmaxuw m9, m8
+%if %1 != 6
+%ifidn %2, v
+ mova m11, [tmpq+strideq*0] ; p3
+%else
+ mova m11, [rsp+5*32] ; p3
+%endif
+ psubw m10, m11, m4 ; p3-p0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ psubw m10, m5, m14 ; q2-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%if %1 != 6
+ psubw m10, m5, m15 ; q3-q0
+ pabsw m10, m10
+ pmaxuw m9, m10
+%endif
+ vpbroadcastd m10, [r11]
+ pcmpgtw m9, m10 ; !flat8in
+
+ psubw m10, m13, m3 ; p2-p1
+ pabsw m10, m10
+%if %1 != 6
+ psubw m11, m13 ; p3-p2
+ pabsw m11, m11
+ pmaxuw m10, m11
+ psubw m11, m14, m15 ; q3-q2
+ pabsw m11, m11
+ pmaxuw m10, m11
+%endif
+ psubw m11, m14, m6 ; q2-q1
+ pabsw m11, m11
+ pmaxuw m10, m11
+
+%if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ vpbroadcastd m1, [maskq+4]
+ por m11, m1
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11
+%else
+ vpbroadcastd m11, [maskq+4]
+ pand m11, m12
+ pcmpeqd m11, m12
+ pand m10, m11 ; only apply fm-wide to wd>4 blocks
+%endif
+ pmaxuw m8, m10
+%endif
+ pcmpgtw m8, m2
+
+ psubw m10, m3, m6 ; p1-q1
+ psubw m11, m4, m5 ; p0-q0
+ REPX {pabsw x, x}, m10, m11
+ paddw m11, m11
+ psrlw m10, 1
+ paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m8, m10
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else
+ mova m0, [rsp+7*32]
+ mova m1, [rsp+8*32]
+ mova m2, [rsp+9*32]
+%endif
+ REPX {psubw x, m4}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxuw m1, m0
+ pmaxuw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m10, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+10*32]
+ mova m2, [rsp+11*32]
+ mova m10, [rsp+12*32]
+%endif
+ REPX {psubw x, m5}, m0, m2, m10
+ REPX {pabsw x, x}, m0, m2, m10
+ pmaxuw m0, m2
+ pmaxuw m1, m10
+ pmaxuw m1, m0
+ vpbroadcastd m0, [r11]
+ pcmpgtw m1, m0 ; !flat8out
+ por m1, m9 ; !flat8in | !flat8out
+ vpbroadcastd m2, [maskq+8]
+ pand m10, m2, m12
+ pcmpeqd m10, m12
+ pandn m1, m10 ; flat16
+ pandn m1, m8, m1 ; flat16 & fm
+
+ vpbroadcastd m10, [maskq+4]
+ por m10, m2
+ pand m2, m10, m12
+ pcmpeqd m2, m12
+ pandn m9, m2 ; flat8in
+ pandn m9, m8, m9
+ vpbroadcastd m2, [maskq+0]
+ por m2, m10
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m8, m2
+ pandn m8, m9, m8 ; fm & !flat8 & !flat16
+ pandn m9, m1, m9 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m9, m2
+ pandn m9, m8, m9 ; flat8 & fm
+ vpbroadcastd m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0
+ pandn m8, m9, m8 ; fm & !flat8
+%else
+ vpbroadcastd m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m8, m0 ; fm
+%endif
+
+ ; short filter
+ vpbroadcastd m0, [r11+8*1] ; 511 or 2047
+ vpbroadcastd m2, [r11+8*2] ; -512 or -2048
+ psubw m10, m5, m4
+ paddw m11, m10, m10
+ paddw m11, m10
+ psubw m10, m3, m6 ; iclip_diff(p1-q1)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m10, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m10, m0
+ pmaxsw m10, m2
+ pand m8, m10 ; f&=fm
+ vpbroadcastd m10, [pw_4]
+ paddw m10, m8
+ paddw m8, [pw_3]
+ REPX {pminsw x, m0}, m10, m8
+ psraw m10, 3 ; f2
+ psraw m8, 3 ; f1
+ psubw m5, m10
+ paddw m4, m8
+
+ paddw m10, [pw_1]
+ psraw m10, 1 ; f=(f1+1)>>1
+ pandn m8, m7, m10 ; f&=!hev
+ paddw m3, m8
+ psubw m6, m8
+ pxor m8, m8
+ psubw m0, m2 ; 1023 or 4095
+ REPX {pminsw x, m0}, m3, m4, m5, m6
+ REPX {pmaxsw x, m8}, m3, m4, m5, m6
+
+%if %1 == 16
+
+; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2,7-8,10-11 = free
+
+ ; flat16 filter
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m11, [tmpq+strideq*4] ; p3
+%else
+ mova m0, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ mova m11, [rsp+5*32]
+%endif
+
+ mova [rsp+ 0*32], m9
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ paddw m8, m0, [pw_1]
+ psllw m8, 3 ; p6*8+8
+ paddw m10, m2, m7 ; p5+p4
+ psubw m8, m0
+ paddw m10, m10 ; (p5+p4)*2
+ paddw m8, m11 ; p6*7+p3
+ paddw m10, m13 ; (p5+p4)*2+p2
+ paddw m8, m3 ; p6*7+p3+p1
+ paddw m10, m4 ; (p5+p4)*2+p2+p0
+ paddw m8, m5 ; p6*7+p3+p1+q0
+ paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m10, m8, 4
+ vpblendvb m10, m2, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m10 ; p5
+%else
+ mova [rsp+8*32], m10
+%endif
+
+ ; sub p6*2, add p3/q1
+ paddw m8, m11
+ paddw m10, m0, m0
+ paddw m8, m6
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m7, m10, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m10 ; p4
+%else
+ mova [rsp+9*32], m10
+%endif
+
+ ; sub p6/p5, add p2/q2
+ psubw m8, m0
+ paddw m10, m13, m14
+ psubw m8, m2
+ paddw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m11, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m10 ; p3
+ lea tmpq, [dstq+strideq*4]
+%else
+ mova [rsp+5*32], m10
+%endif
+
+ ; sub p6/p4, add p1/q3
+ paddw m8, m3
+ paddw m10, m0, m7
+ paddw m8, m15
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m13, m10, m1
+ mova [rsp+1*32], m10 ; don't clobber p2/m13
+
+ ; sub p6/p3, add p0/q4
+ paddw m8, m4
+ paddw m10, m0, m11
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m3, m10, m1
+ mova [rsp+2*32], m10 ; don't clobber p1/m3
+
+ ; sub p6/p2, add q0/q5
+ paddw m8, m5
+ paddw m10, m0, m13
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m4, m10, m1
+ mova [rsp+3*32], m10 ; don't clobber p0/m4
+
+ ; sub p6/p1, add q1/q6
+ paddw m8, m6
+ paddw m10, m0, m3
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else
+ mova m0, [rsp+12*32] ; q6
+%endif
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m5, m10, m1
+ mova [rsp+4*32], m10 ; don't clobber q0/m5
+
+ ; sub p5/p0, add q2/q6
+ paddw m8, m14
+ paddw m10, m2, m4
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m8, m15
+ paddw m10, m7, m5
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*0]
+%else
+ paddw m8, [rsp+10*32]
+%endif
+ paddw m10, m11, m6
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+ vpblendvb m10, m15, m10, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m10 ; q3
+%else
+ mova [rsp+14*32], m10
+%endif
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m8, [tmpq+strideq*1]
+%else
+ paddw m8, [rsp+11*32]
+%endif
+ paddw m10, m13, m14
+ paddw m8, m0
+ psubw m8, m10
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0]
+%else
+ mova m9, [rsp+10*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m10 ; q4
+%else
+ mova [rsp+10*32], m10
+%endif
+
+ ; sub p1/q3, add q6*2
+ psubw m8, m3
+ paddw m0, m0
+ psubw m8, m15
+ paddw m8, m0
+ psrlw m10, m8, 4
+%ifidn %2, v
+ mova m9, [tmpq+strideq*1]
+%else
+ mova m9, [rsp+11*32]
+%endif
+ vpblendvb m10, m9, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+11*32], m10
+%endif
+
+ mova m9, [rsp+0*32]
+ mova m13, [rsp+1*32]
+ mova m3, [rsp+2*32]
+ mova m4, [rsp+3*32]
+ mova m5, [rsp+4*32]
+ SWAP 2, 6
+ SWAP 7, 14
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%else
+ mova m15, [rsp+14*32]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
+ vpbroadcastd m7, [pw_4096]
+%ifidn %2, v
+ mova m0, [tmpq+strideq*0] ; p3
+%else
+ mova m0, [rsp+5*32] ; p3
+%endif
+ paddw m1, m0, m13 ; p3+p2
+ paddw m2, m3, m4 ; p1+p0
+ paddw m8, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m8, m5 ; 2*(p3+p2)+q0
+ paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m10, m2, m7
+
+ paddw m8, m3, m6
+ psubw m2, m1
+ paddw m2, m8
+ pmulhrsw m8, m2, m7
+
+ paddw m11, m0, m3
+ paddw m1, m4, m14
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m1, m2, m7
+
+ paddw m11, m0, m4
+ pblendvb m4, m1, m9
+ paddw m1, m5, m15
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m11, m2, m7
+
+ paddw m2, m6
+ paddw m2, m15
+ paddw m1, m13, m5
+ pblendvb m5, m11, m9
+ pblendvb m13, m10, m9
+ psubw m2, m1
+ pmulhrsw m1, m2, m7
+
+ psubw m2, m3
+ pblendvb m3, m8, m9
+ psubw m2, m6
+ pblendvb m6, m1, m9
+ paddw m1, m15, m14
+ paddw m2, m1
+ pmulhrsw m2, m7
+
+ pblendvb m14, m2, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m13 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+ mova [dstq+strideq*2], m14 ; q2
+%elif %1 == 8
+ TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1
+
+ ; write 8x16
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm3
+ movu [dstq+stride3q -8], xm4
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm5
+ movu [dstq+strideq*1-8], xm6
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m0, 1
+ vextracti128 [dstq+strideq*1-8], m13, 1
+ vextracti128 [dstq+strideq*2-8], m3, 1
+ vextracti128 [dstq+stride3q -8], m4, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m5, 1
+ vextracti128 [dstq+strideq*1-8], m6, 1
+ vextracti128 [dstq+strideq*2-8], m14, 1
+ vextracti128 [dstq+stride3q -8], m15, 1
+ lea dstq, [dstq+strideq*4]
+%else
+ mova m8, [rsp+6*32]
+ mova m1, [rsp+7*32]
+ mova m2, [rsp+8*32]
+ mova m7, [rsp+9*32]
+ TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9
+
+ mova [dstq+strideq*0-16], xm8
+ mova [dstq+strideq*1-16], xm1
+ mova [dstq+strideq*2-16], xm2
+ mova [dstq+stride3q -16], xm7
+ lea tmpq, [dstq+strideq*4]
+ mova [tmpq+strideq*0-16], xm0
+ mova [tmpq+strideq*1-16], xm13
+ mova [tmpq+strideq*2-16], xm3
+ mova [tmpq+stride3q -16], xm4
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m8, 1
+ vextracti128 [tmpq+strideq*1-16], m1, 1
+ vextracti128 [tmpq+strideq*2-16], m2, 1
+ vextracti128 [tmpq+stride3q -16], m7, 1
+ lea tmpq, [tmpq+strideq*4]
+ vextracti128 [tmpq+strideq*0-16], m0, 1
+ vextracti128 [tmpq+strideq*1-16], m13, 1
+ vextracti128 [tmpq+strideq*2-16], m3, 1
+ vextracti128 [tmpq+stride3q -16], m4, 1
+
+ mova m0, [rsp+10*32]
+ mova m1, [rsp+11*32]
+ mova m2, [rsp+12*32]
+ mova m3, [rsp+13*32]
+ TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4
+ mova [dstq+strideq*0], xm5
+ mova [dstq+strideq*1], xm6
+ mova [dstq+strideq*2], xm14
+ mova [dstq+stride3q ], xm15
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m5, 1
+ vextracti128 [dstq+strideq*1], m6, 1
+ vextracti128 [dstq+strideq*2], m14, 1
+ vextracti128 [dstq+stride3q ], m15, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0], m0, 1
+ vextracti128 [dstq+strideq*1], m1, 1
+ vextracti128 [dstq+strideq*2], m2, 1
+ vextracti128 [dstq+stride3q ], m3, 1
+ lea dstq, [dstq+strideq*4]
+%endif
+%elif %1 == 6
+ ; flat6 filter
+ vpbroadcastd m7, [pw_4096]
+ paddw m8, m3, m4
+ paddw m8, m13 ; p2+p1+p0
+ paddw m11, m13, m5
+ paddw m8, m8
+ paddw m8, m11 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m8, m7
+
+ paddw m8, m5
+ paddw m11, m13, m13
+ paddw m8, m6
+ psubw m8, m11
+ pmulhrsw m10, m8, m7
+
+ paddw m8, m6
+ paddw m11, m13, m3
+ paddw m8, m14
+ psubw m8, m11
+ pmulhrsw m11, m8, m7
+
+ psubw m8, m3
+ paddw m14, m14
+ psubw m8, m4
+ paddw m8, m14
+ pmulhrsw m8, m7
+
+ pblendvb m3, m2, m9
+ pblendvb m4, m10, m9
+ pblendvb m5, m11, m9
+ pblendvb m6, m8, m9
+
+%ifidn %2, v
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m4 ; p0
+ mova [dstq+strideq*0], m5 ; q0
+ mova [dstq+strideq*1], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%else
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_8x4_AND_WRITE_4x16
+%endif
+%endif
+%endmacro
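The L/E/I/H block near the top of FILTER derives the per-edge thresholds from the filter level L and the sharpness lut before any pixels are touched. A scalar C model of that derivation, per lane; the function and parameter names are invented for illustration and only approximate the vector code above:

/* sharp[0]/sharp[1] correspond to lut[128]/lut[136] above; the final scaling
 * matches the pmullw by the value loaded from r11 (4 or 16). */
static int lf_limits(int L, const int sharp[2], int bitdepth,
                     int *E, int *I, int *H)
{
    if (!L) return 0;                       /* level 0: leave the edge alone */
    int i = L >> sharp[0];
    if (i > sharp[1]) i = sharp[1];
    if (i < 1) i = 1;
    const int e = 2 * (L + 2) + i;
    const int h = L >> 4;
    const int scale = 1 << (bitdepth - 8);  /* 4 for 10 bpc, 16 for 12 bpc */
    *E = e * scale;
    *I = i * scale;
    *H = h * scale;
    return 1;
}

/* The per-lane decisions that follow are then:
 *   fm  = max(|p1-p0|, |q1-q0|) <= I  &&  2*|p0-q0| + (|p1-q1| >> 1) <= E
 *   hev = max(|p1-p0|, |q1-q0|) >  H
 */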
+
+INIT_YMM avx2
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call .v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
+
+INIT_YMM avx2
+cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call .h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4
+
+.end:
+ pslld m12, 4
+ add lq, 16
+ add dstq, 32
+ shl mask_bitsd, 4
+ sub wd, 4
+ jg .loop
+ RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ mov r6d, r7m
+ lea r11, [pw_4]
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
+ mov hd, hm
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+ mov mask_bitsd, 0xf
+ mova m12, [pb_mask]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ pslld m12, 4
+ lea lq, [lq+l_strideq*4]
+ shl mask_bitsd, 4
+ sub hd, 4
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter16_avx512.asm b/third_party/dav1d/src/x86/loopfilter16_avx512.asm
new file mode 100644
index 0000000000..b7bc3aa106
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_avx512.asm
@@ -0,0 +1,912 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+l_shuf_v: times 2 db 0, 32
+pw_1: times 2 dw 1
+ times 2 db 4, 36
+pw_3: times 2 dw 3
+ times 2 db 8, 40
+pw_4: times 2 dw 4
+ times 2 db 12, 44
+pw_16: times 2 dw 16
+ times 2 db 16, 48
+pw_4096: times 2 dw 4096
+ times 2 db 20, 52
+pw_16384: times 2 dw 16384
+ times 2 db 24, 56
+pw_32767: times 2 dw 32767
+ times 2 db 28, 60
+ times 2 dw 0
+filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128
+stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25
+l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1
+clip_max: dw 511, 511, 2047, 2047
+clip_min: dw -512, -512, -2048, -2048
+
+SECTION .text
+
+%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp
+ punpckhwd m%9, m%5, m%6
+ punpcklwd m%5, m%6
+ punpckhwd m%6, m%1, m%2
+ punpcklwd m%1, m%2
+ punpckhwd m%2, m%7, m%8
+ punpcklwd m%7, m%8
+ punpckhwd m%8, m%3, m%4
+ punpcklwd m%3, m%4
+ punpckhdq m%4, m%1, m%3
+ punpckldq m%1, m%3
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%5, m%7
+ punpckhdq m%7, m%6, m%8
+ punpckldq m%6, m%8
+ punpckldq m%8, m%9, m%2
+ punpckhdq m%9, m%2
+ punpckhqdq m%2, m%1, m%3
+ punpcklqdq m%1, m%3
+ punpcklqdq m%3, m%4, m%5
+ punpckhqdq m%4, m%5
+ punpcklqdq m%5, m%6, m%8
+ punpckhqdq m%6, m%8
+ punpckhqdq m%8, m%7, m%9
+ punpcklqdq m%7, m%9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%ifidn %2, v
+%if %1 == 16
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1 ] ; p6
+ mova m1, [tmpq+strideq*2 ] ; p5
+ mova m2, [tmpq+stride3q ] ; p4
+ mova m3, [tmpq+strideq*4 ] ; p3
+ mova m4, [tmpq+stride5q ] ; p2
+%elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 == 8
+ mova m3, [tmpq+strideq*0 ]
+%endif
+ mova m4, [tmpq+strideq*1 ]
+%endif
+ mova m5, [dstq+mstrideq*2] ; p1
+ mova m6, [dstq+mstrideq*1] ; p0
+ mova m7, [dstq+strideq*0 ] ; q0
+ mova m8, [dstq+strideq*1 ] ; q1
+%if %1 != 4
+ mova m9, [dstq+strideq*2 ] ; q2
+%endif
+%if %1 == 8 || %1 == 16
+ mova m10, [dstq+stride3q ] ; q3
+%endif
+%if %1 == 16
+ mova m11, [dstq+strideq*4 ] ; q4
+ mova m22, [dstq+stride5q ] ; q5
+ mova m23, [dstq+stride3q*2] ; q6
+%endif
+%else ; h
+%if %1 == 16
+ movu ym16, [dstq+strideq*0 -16]
+ movu ym17, [dstq+strideq*1 -16]
+ movu ym18, [dstq+strideq*2 -16]
+ movu ym19, [dstq+stride3q -16]
+ movu ym20, [dstq+strideq*4 -16]
+ movu ym22, [dstq+stride5q -16]
+ movu ym23, [dstq+stride3q*2-16]
+ movu ym28, [dstq+stride7q -16]
+ lea tmpq, [dstq+strideq*8 -16]
+ vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m10, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1
+ vinserti32x8 m22, m22, [tmpq+stride5q ], 1
+ vinserti32x8 m23, m23, [tmpq+stride3q*2], 1
+ vinserti32x8 m28, m28, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8]
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27
+ movu ym16, [tmpq+strideq*0 ]
+ movu ym17, [tmpq+strideq*1 ]
+ movu ym18, [tmpq+strideq*2 ]
+ movu ym19, [tmpq+stride3q ]
+ movu ym24, [tmpq+strideq*4 ]
+ movu ym25, [tmpq+stride5q ]
+ movu ym26, [tmpq+stride3q*2]
+ movu ym20, [tmpq+stride7q ]
+ lea tmpq, [tmpq+strideq*8]
+ vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m3, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1
+ vinserti32x8 m5, m25, [tmpq+stride5q ], 1
+ vinserti32x8 m6, m26, [tmpq+stride3q*2], 1
+ vinserti32x8 m20, m20, [tmpq+stride7q ], 1
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27
+ vshufi32x4 m27, m7, m0, q2020
+ vshufi32x4 m7, m0, q3131
+ vshufi32x4 m0, m8, m1, q2020
+ vshufi32x4 m8, m1, q3131
+ vshufi32x4 m1, m9, m2, q2020
+ vshufi32x4 m9, m2, q3131
+ vshufi32x4 m2, m10, m3, q2020
+ vshufi32x4 m10, m3, q3131
+ vshufi32x4 m3, m11, m4, q2020
+ vshufi32x4 m11, m4, q3131
+ vshufi32x4 m4, m22, m5, q2020
+ vshufi32x4 m22, m5, q3131
+ vshufi32x4 m5, m23, m6, q2020
+ vshufi32x4 m23, m6, q3131
+ vshufi32x4 m6, m28, m20, q2020
+ vshufi32x4 m28, m20, q3131
+%elif %1 == 6 || %1 == 8
+%if %1 == 8
+ sub dstq, 8
+ movu xm16, [dstq+strideq*0 ]
+ movu xm17, [dstq+strideq*1 ]
+ movu xm18, [dstq+strideq*2 ]
+ movu xm19, [dstq+stride3q ]
+ movu xm24, [dstq+strideq*4 ]
+ movu xm25, [dstq+stride5q ]
+ movu xm26, [dstq+stride3q*2]
+ movu xm27, [dstq+stride7q ]
+ lea tmpq, [dstq+strideq*8 ]
+ vinserti128 ym16, [tmpq+strideq*0 ], 1
+ vinserti128 ym17, [tmpq+strideq*1 ], 1
+ vinserti128 ym18, [tmpq+strideq*2 ], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ vinserti128 ym24, [tmpq+strideq*4 ], 1
+ vinserti128 ym25, [tmpq+stride5q ], 1
+ vinserti128 ym26, [tmpq+stride3q*2], 1
+ vinserti128 ym27, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2
+ vinserti32x4 m9, m25, [tmpq+stride5q ], 2
+ vinserti32x4 m3, m26, [tmpq+stride3q*2], 2
+ vinserti32x4 m4, m27, [tmpq+stride7q ], 2
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, [tmpq+strideq*0 ], 3
+ vinserti32x4 m8, [tmpq+strideq*1 ], 3
+ vinserti32x4 m5, [tmpq+strideq*2 ], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ vinserti32x4 m2, [tmpq+strideq*4 ], 3
+ vinserti32x4 m9, [tmpq+stride5q ], 3
+ vinserti32x4 m3, [tmpq+stride3q*2], 3
+ vinserti32x4 m4, [tmpq+stride7q ], 3
+%else ; %1 == 6
+ movu xm16, [dstq+strideq*0-8]
+ movu xm17, [dstq+strideq*1-8]
+ movu xm18, [dstq+strideq*2-8]
+ movu xm19, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4-8]
+ movu xm2, [tmpq+strideq*0]
+ movu xm9, [tmpq+strideq*1]
+ movu xm3, [tmpq+strideq*2]
+ movu xm4, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym16, [tmpq+strideq*0], 1
+ vinserti128 ym17, [tmpq+strideq*1], 1
+ vinserti128 ym18, [tmpq+strideq*2], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym2, [tmpq+strideq*0], 1
+ vinserti128 ym9, [tmpq+strideq*1], 1
+ vinserti128 ym3, [tmpq+strideq*2], 1
+ vinserti128 ym4, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, m16, [tmpq+strideq*0], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 2
+ vinserti32x4 m9, [tmpq+strideq*1], 2
+ vinserti32x4 m3, [tmpq+strideq*2], 2
+ vinserti32x4 m4, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, [tmpq+strideq*0], 3
+ vinserti32x4 m8, [tmpq+strideq*1], 3
+ vinserti32x4 m5, [tmpq+strideq*2], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 3
+ vinserti32x4 m9, [tmpq+strideq*1], 3
+ vinserti32x4 m3, [tmpq+strideq*2], 3
+ vinserti32x4 m4, [tmpq+stride3q ], 3
+%endif
+ punpcklwd m6, m10, m8
+ punpckhwd m10, m8
+ punpcklwd m8, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m2, m9
+ punpckhwd m2, m9
+ punpcklwd m9, m3, m4
+ punpckhwd m3, m4
+ punpckldq m4, m6, m8
+ punpckhdq m6, m8
+ punpckldq m8, m10, m5
+ punpckhdq m10, m5
+ punpckldq m5, m7, m9
+ punpckhdq m7, m9
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+%if %1 == 8
+ punpcklqdq m3, m4, m5
+%endif
+ punpckhqdq m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m9
+ punpckhqdq m8, m9
+ punpcklqdq m9, m10, m2
+%if %1 == 8
+ punpckhqdq m10, m2
+%endif
+%else ; %1 == 4
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdq m7{k1}, [dstq+ym12-4]
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpgatherdq m4{k2}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpgatherdq m5{k1}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ vpgatherdq m6{k2}, [tmpq+ym12]
+ punpcklwd m8, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m8, m7
+ punpckhwd m8, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m4
+ punpckhqdq m8, m4
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu ym16, [lq+l_strideq*1]
+ movsldup m17, [l_shuf_v]
+ vptestnmb k1, ym16, ym16
+ vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][]
+ vpermb m16, m17, m16 ; l[x][1]
+%else
+ movq xm16, [lq+l_strideq*0]
+ movq xm17, [lq+l_strideq*1]
+ vinserti128 ym16, [lq+l_strideq*2], 1
+ vinserti128 ym17, [lq+l_stride3q ], 1
+ lea tmpq, [lq+l_strideq*4]
+ vinserti32x4 m16, [tmpq+l_strideq*0], 2
+ vinserti32x4 m17, [tmpq+l_strideq*1], 2
+ vinserti32x4 m16, [tmpq+l_strideq*2], 3
+ vinserti32x4 m17, [tmpq+l_stride3q ], 3
+ punpcklqdq m16, m17
+ vbroadcasti32x4 m17, [l_shuf_h]
+ vptestnmb k1, m16, m16
+ vpalignr m16{k1}, m16, 12
+ pshufb m16, m17 ; l[x][1]
+%endif
+ vpbroadcastd m20, [pw_32767]
+ psubw m17, m5, m6 ; p1-p0
+ psubw m18, m7, m8 ; q1-q0
+ vptestmw k1, m16, m16 ; L
+ pabsw m17, m17
+ pabsw m18, m18
+ vpmaxuw m20{k1}, m17, m18
+ vpbroadcastw m17, [lutq+136]
+ psrlw m18, m16, [lutq+128]
+ vpbroadcastd m19, [pw_1]
+ pminuw m18, m17
+ psrlw m17, m16, 4 ; H
+ paddw m16, m16
+ pmaxuw m18, m19 ; I
+ vpaddd m16, [pw_4] {1to16}
+ paddw m16, m18 ; E
+ REPX {pmullw x, m13}, m17, m18, m16
+ vpcmpw k4, m20, m17, 6 ; hev
+%if %1 != 4
+ psubw m19, m4, m5 ; p2-p1
+ pabsw m19, m19
+%if %1 == 8 || %1 == 16
+ psubw m17, m3, m4 ; p3-p2
+ pabsw m17, m17
+ pmaxuw m19, m17
+ psubw m17, m9, m10 ; q3-q2
+ pabsw m17, m17
+ pmaxuw m19, m17
+%endif
+ psubw m17, m9, m8 ; q2-q1
+ pabsw m17, m17
+ pmaxuw m19, m17
+%if %1 == 16
+ vpbroadcastd ym17, [maskq+4]
+ vpord ym17, [maskq+8] {1to8}
+ vptestmd k1, ym17, ym21
+%else
+ vptestmd k1, ym21, [maskq+4] {1to8}
+%endif
+ pmaxuw m19, m20
+ psubw m17, m4, m6 ; p2-p0
+ pabsw m17, m17
+ pmaxuw m17, m20
+ vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks
+%if %1 == 8 || %1 == 16
+ psubw m19, m3, m6 ; p3-p0
+ pabsw m19, m19
+ pmaxuw m17, m19
+ psubw m19, m7, m10 ; q3-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ psubw m19, m7, m9 ; q2-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ vpcmpw k1, m20, m18, 2
+ psubw m18, m5, m8 ; p1-q1
+ psubw m19, m6, m7 ; p0-q0
+ pabsw m18, m18
+ pabsw m19, m19
+ psrlw m18, 1
+ paddw m19, m19
+ paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+%if %1 != 4
+ vpcmpw k2{k1}, m17, m13, 2 ; flat8in
+%endif
+%if %1 == 16
+ psubw m20, m0, m6
+ psubw m16, m1, m6
+ pabsw m20, m20
+ psubw m17, m2, m6
+ pabsw m16, m16
+ psubw m18, m11, m7
+ pabsw m17, m17
+ psubw m19, m22, m7
+ pabsw m18, m18
+ pmaxuw m20, m16
+ psubw m16, m23, m7
+ pabsw m19, m19
+ pmaxuw m17, m18
+ pabsw m16, m16
+ vpandd ym18, ym21, [maskq+8] {1to8}
+ pmaxuw m20, m17
+ pmaxuw m19, m16
+ pcmpeqd ym16, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8
+ pmaxuw m20, m19
+ pcmpeqd ym17, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8
+ vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out
+ pcmpeqd ym18, ym21
+ vptestmb k3{k3}, ym16, ym16 ; flat8 & fm
+ vptestmb k2{k2}, ym17, ym17 ; flat8in
+ vptestmb k1{k1}, ym18, ym18
+ kandnd k1, k2, k1 ; fm & !flat8 & !flat16
+ kandnd k2, k3, k2 ; flat8 & !flat16
+%elif %1 == 6 || %1 == 8
+ vpandd ym17, ym21, [maskq+4] {1to8}
+ pcmpeqd ym16, ym21, ym17
+ vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8
+ pcmpeqd ym17, ym21
+ vptestmb k2{k2}, ym16, ym16 ; flat8 & fm
+ vptestmb k1{k1}, ym17, ym17
+ kandnd k1, k2, k1 ; fm & !flat8
+%else ; %1 == 4
+ vpandd ym16, ym21, [maskq+0] {1to8}
+ pcmpeqd ym16, ym21
+ vptestmb k1{k1}, ym16, ym16
+%endif
+
+ ; short filter
+ psubw m16, m7, m6
+ vpbroadcastd m17, [pw_3]
+ paddw m18, m16, m16
+ paddw m18, m16
+ psubw m16, m5, m8 ; iclip_diff(p1-q1)
+ pminsw m16, m14
+ vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev
+ knotd k4, k4 ; !hev
+ paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f)
+ vpbroadcastd m18, [pw_4]
+ pminsw m16, m14
+ vpmaxsw m16{k1}{z}, m15 ; f&=fm
+ paddw m17, m16
+ paddw m16, m18
+ vpbroadcastd m18, [pw_16384]
+ pminsw m17, m14
+ pminsw m16, m14
+ psraw m17, 3 ; f2
+ psraw m16, 3 ; f1
+ paddw m6, m17
+ psubw m7, m16
+ vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev
+ psubw m17, m14, m15 ; 1023 or 4095
+ pxor m18, m18
+ paddw m5, m16
+ psubw m8, m16
+ REPX {pminsw x, m17}, m6, m7, m5, m8
+ REPX {pmaxsw x, m18}, m6, m7, m5, m8
+
+%if %1 == 16 ; flat16 filter
+ vpaddd m19, m0, [pw_1] {1to16}
+ paddw m16, m1, m2 ; p5+p4
+ paddw m26, m1, m6 ; p5+p0
+ paddw m24, m2, m7 ; p4+q0
+ paddw m16, m4 ; p5+p4+p2
+ paddw m17, m3, m5 ; p3+p1
+ psllw m19, 3
+ paddw m16, m26 ; p5*2+p4+p2+p0
+ paddw m17, m24 ; p4+p3+p1+q0
+ psubw m19, m0 ; p6*7+8
+ paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+p0+q0
+ paddw m18, m3, m8
+ paddw m19, m16 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ paddw m25, m1, m0
+ paddw m16, m0, m0
+ psrlw m1{k3}, m19, 4
+ paddw m19, m18
+ psubw m19, m16 ; +p3+q1-p6*2
+ paddw m16, m2, m0
+ psrlw m2{k3}, m19, 4
+ psubw m19, m25
+ paddw m25, m4, m9
+ paddw m20, m10, m5
+ paddw m19, m25 ; +p2+q2-p6-p5
+ paddw m17, m0, m3
+ psubw m16, m20, m16
+ psrlw m3{k3}, m19, 4
+ paddw m19, m16 ; +p1+q3-p6-p4
+ paddw m16, m11, m6
+ psubw m16, m17
+ paddw m17, m0, m4
+ psrlw m4{k3}, m19, 4
+ paddw m19, m16 ; +p0+q4-p6-p3
+ paddw m16, m22, m7
+ psubw m16, m17
+ paddw m17, m0, m5
+ psrlw m5{k3}, m19, 4
+ paddw m19, m16 ; +q0+q5-p6-p2
+ paddw m16, m23, m8
+ psrlw m6{k3}, m19, 4
+ psubw m16, m17
+ paddw m19, m16 ; +q1+q6-p6-p1
+ paddw m16, m23, m9
+ psrlw m7{k3}, m19, 4
+ psubw m16, m26
+ paddw m19, m16 ; +q2+q6-p5-p0
+ paddw m16, m23, m10
+ psrlw m8{k3}, m19, 4
+ psubw m16, m24
+ paddw m19, m16 ; +q3+q6-p4-q0
+ paddw m16, m23, m11
+ psrlw m9{k3}, m19, 4
+ psubw m16, m18
+ paddw m19, m16 ; +q4+q6-p3-q1
+ paddw m16, m23, m22
+ psrlw m10{k3}, m19, 4
+ psubw m16, m25
+ paddw m19, m16 ; +q5+q6-p2-q2
+ paddw m16, m23, m23
+ psrlw m11{k3}, m19, 4
+ psubw m16, m20
+ paddw m19, m16 ; +q6*2-p1-q3
+ psrlw m22{k3}, m19, 4
+%endif
+%if %1 == 8 || %1 == 16 ; flat8 filter
+ vpbroadcastd m20, [pw_4096]
+ paddw m16, m3, m4 ; p3+p2
+ paddw m19, m5, m6 ; p1+p0
+ paddw m17, m16, m16 ; 2*(p3+p2)
+ paddw m19, m3 ; p1+p0+p3
+ paddw m17, m7 ; 2*(p3+p2)+q0
+ paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0
+ paddw m18, m4, m7
+ pmulhrsw m4{k2}, m19, m20
+ psubw m19, m16
+ paddw m17, m5, m8
+ paddw m16, m3, m5
+ paddw m19, m17
+ pmulhrsw m5{k2}, m19, m20
+ psubw m19, m16
+ paddw m16, m6, m9
+ paddw m19, m16
+ paddw m16, m3, m6
+ pmulhrsw m6{k2}, m19, m20
+ paddw m19, m10
+ psubw m16, m7, m16
+ paddw m19, m16
+ psubw m16, m10, m18
+ pmulhrsw m7{k2}, m19, m20
+ paddw m16, m8
+ paddw m19, m16
+ psubw m16, m10, m17
+ pmulhrsw m8{k2}, m19, m20
+ paddw m16, m9
+ paddw m19, m16
+ pmulhrsw m9{k2}, m19, m20
+%elif %1 == 6 ; flat6 filter
+ vpbroadcastd m10, [pw_4096]
+ paddw m2, m5, m6
+ paddw m0, m4, m7
+ paddw m1, m2, m4 ; p2+p1+p0
+ paddw m3, m4, m4
+ paddw m1, m1
+ paddw m4, m5
+ paddw m1, m0 ; p2+2*(p2+p1+p0)+q0
+ psubw m3, m7, m3
+ pmulhrsw m5{k2}, m1, m10
+ paddw m3, m8
+ psubw m4, m8, m4
+ paddw m1, m3
+ pmulhrsw m6{k2}, m1, m10
+ paddw m4, m9
+ paddw m9, m9
+ paddw m1, m4
+ pmulhrsw m7{k2}, m1, m10
+ psubw m9, m2
+ paddw m1, m9
+ pmulhrsw m8{k2}, m1, m10
+%endif
+
+%ifidn %2, v
+%if %1 == 16
+ mova [tmpq+strideq*2 ], m1 ; p5
+ mova [tmpq+stride3q ], m2 ; p4
+ mova [tmpq+strideq*4 ], m3 ; p3
+ mova [tmpq+stride5q ], m4 ; p2
+%elif %1 == 8
+ mova [tmpq+strideq*1 ], m4 ; p2
+%endif
+ mova [dstq+mstrideq*2], m5 ; p1
+ mova [dstq+mstrideq ], m6 ; p0
+ mova [dstq+strideq*0 ], m7 ; q0
+ mova [dstq+strideq*1 ], m8 ; q1
+%if %1 == 8 || %1 == 16
+ mova [dstq+strideq*2 ], m9 ; q2
+%endif
+%if %1 == 16
+ mova [dstq+stride3q ], m10 ; q3
+ mova [dstq+strideq*4 ], m11 ; q4
+ mova [dstq+stride5q ], m22 ; q5
+%endif
+%else
+%if %1 == 16
+ TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20
+ mova [dstq+strideq*0 -16], xm27
+ mova [dstq+strideq*0 ], xm7
+ mova [dstq+strideq*1 -16], xm0
+ mova [dstq+strideq*1 ], xm8
+ mova [dstq+strideq*2 -16], xm1
+ mova [dstq+strideq*2 ], xm9
+ mova [dstq+stride3q -16], xm2
+ mova [dstq+stride3q ], xm10
+ mova [dstq+strideq*4 -16], xm3
+ mova [dstq+strideq*4 ], xm11
+ mova [dstq+stride5q -16], xm4
+ mova [dstq+stride5q ], xm22
+ mova [dstq+stride3q*2-16], xm5
+ mova [dstq+stride3q*2 ], xm23
+ mova [dstq+stride7q -16], xm6
+ mova [dstq+stride7q ], xm28
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 -16], ym27, 1
+ vextracti128 [dstq+strideq*0 ], ym7, 1
+ vextracti128 [dstq+strideq*1 -16], ym0, 1
+ vextracti128 [dstq+strideq*1 ], ym8, 1
+ vextracti128 [dstq+strideq*2 -16], ym1, 1
+ vextracti128 [dstq+strideq*2 ], ym9, 1
+ vextracti128 [dstq+stride3q -16], ym2, 1
+ vextracti128 [dstq+stride3q ], ym10, 1
+ vextracti128 [dstq+strideq*4 -16], ym3, 1
+ vextracti128 [dstq+strideq*4 ], ym11, 1
+ vextracti128 [dstq+stride5q -16], ym4, 1
+ vextracti128 [dstq+stride5q ], ym22, 1
+ vextracti128 [dstq+stride3q*2-16], ym5, 1
+ vextracti128 [dstq+stride3q*2 ], ym23, 1
+ vextracti128 [dstq+stride7q -16], ym6, 1
+ vextracti128 [dstq+stride7q ], ym28, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 2
+ vextracti32x4 [dstq+strideq*0 ], m7, 2
+ vextracti32x4 [dstq+strideq*1 -16], m0, 2
+ vextracti32x4 [dstq+strideq*1 ], m8, 2
+ vextracti32x4 [dstq+strideq*2 -16], m1, 2
+ vextracti32x4 [dstq+strideq*2 ], m9, 2
+ vextracti32x4 [dstq+stride3q -16], m2, 2
+ vextracti32x4 [dstq+stride3q ], m10, 2
+ vextracti32x4 [dstq+strideq*4 -16], m3, 2
+ vextracti32x4 [dstq+strideq*4 ], m11, 2
+ vextracti32x4 [dstq+stride5q -16], m4, 2
+ vextracti32x4 [dstq+stride5q ], m22, 2
+ vextracti32x4 [dstq+stride3q*2-16], m5, 2
+ vextracti32x4 [dstq+stride3q*2 ], m23, 2
+ vextracti32x4 [dstq+stride7q -16], m6, 2
+ vextracti32x4 [dstq+stride7q ], m28, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 3
+ vextracti32x4 [dstq+strideq*0 ], m7, 3
+ vextracti32x4 [dstq+strideq*1 -16], m0, 3
+ vextracti32x4 [dstq+strideq*1 ], m8, 3
+ vextracti32x4 [dstq+strideq*2 -16], m1, 3
+ vextracti32x4 [dstq+strideq*2 ], m9, 3
+ vextracti32x4 [dstq+stride3q -16], m2, 3
+ vextracti32x4 [dstq+stride3q ], m10, 3
+ vextracti32x4 [dstq+strideq*4 -16], m3, 3
+ vextracti32x4 [dstq+strideq*4 ], m11, 3
+ vextracti32x4 [dstq+stride5q -16], m4, 3
+ vextracti32x4 [dstq+stride5q ], m22, 3
+ vextracti32x4 [dstq+stride3q*2-16], m5, 3
+ vextracti32x4 [dstq+stride3q*2 ], m23, 3
+ vextracti32x4 [dstq+stride7q -16], m6, 3
+ vextracti32x4 [dstq+stride7q ], m28, 3
+%elif %1 == 8
+ TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2
+ movu [dstq+strideq*0 ], xm3
+ movu [dstq+strideq*1 ], xm4
+ movu [dstq+strideq*2 ], xm5
+ movu [dstq+stride3q ], xm6
+ movu [dstq+strideq*4 ], xm7
+ movu [dstq+stride5q ], xm8
+ movu [dstq+stride3q*2], xm9
+ movu [dstq+stride7q ], xm10
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 ], ym3, 1
+ vextracti128 [dstq+strideq*1 ], ym4, 1
+ vextracti128 [dstq+strideq*2 ], ym5, 1
+ vextracti128 [dstq+stride3q ], ym6, 1
+ vextracti128 [dstq+strideq*4 ], ym7, 1
+ vextracti128 [dstq+stride5q ], ym8, 1
+ vextracti128 [dstq+stride3q*2], ym9, 1
+ vextracti128 [dstq+stride7q ], ym10, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 2
+ vextracti32x4 [dstq+strideq*1 ], m4, 2
+ vextracti32x4 [dstq+strideq*2 ], m5, 2
+ vextracti32x4 [dstq+stride3q ], m6, 2
+ vextracti32x4 [dstq+strideq*4 ], m7, 2
+ vextracti32x4 [dstq+stride5q ], m8, 2
+ vextracti32x4 [dstq+stride3q*2], m9, 2
+ vextracti32x4 [dstq+stride7q ], m10, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 3
+ vextracti32x4 [dstq+strideq*1 ], m4, 3
+ vextracti32x4 [dstq+strideq*2 ], m5, 3
+ vextracti32x4 [dstq+stride3q ], m6, 3
+ vextracti32x4 [dstq+strideq*4 ], m7, 3
+ vextracti32x4 [dstq+stride5q ], m8, 3
+ vextracti32x4 [dstq+stride3q*2], m9, 3
+ vextracti32x4 [dstq+stride7q ], m10, 3
+ lea dstq, [dstq+strideq*8+8]
+%else ; %1 == 4 || %1 == 6
+ punpcklwd m9, m5, m6
+ punpckhwd m5, m6
+ kxnorb k1, k1, k1
+ punpcklwd m6, m7, m8
+ punpckhwd m7, m8
+ kmovb k2, k1
+ punpckldq m8, m9, m6
+ vpscatterdq [dstq+ym12-4]{k1}, m8
+ punpckhdq m9, m6
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpscatterdq [tmpq+ym12]{k2}, m9
+ punpckldq m6, m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpscatterdq [tmpq+ym12]{k1}, m6
+ punpckhdq m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ vpscatterdq [tmpq+ym12]{k2}, m5
+%endif
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride, tmp, \
+ mask_bits, stride5
+%define base tmpq-filter_mask
+ SWAP 12, 26 ; avoids clobbering xmm10 on WIN64
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, v
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call .v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4: ; called by both luma and chroma
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, l_stride3, tmp, \
+ mask_bits, stride5, stride7
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride7q, [strideq+stride3q*2]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, h
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, h
+ jmp .end2
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+ call .h4
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+.end:
+ lea dstq, [dstq+strideq*8]
+.end2:
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4: ; called by both luma and chroma
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ shl l_strideq, 2
+ lea stride3q, [strideq*3]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride3q, [strideq*3]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, h
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4
+.end:
+ lea tmpq, [strideq+stride3q]
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea dstq, [dstq+tmpq*8]
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm
new file mode 100644
index 0000000000..c486b57a21
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm
@@ -0,0 +1,1793 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%if ARCH_X86_64
+%define PIC_sym(a) a
+%else
+%define PIC_base $$
+%define PIC_sym(a) pic_regq+a-PIC_base
+%endif
+
+pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
+ times 4 db 8, 9
+
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_3: times 8 dw 3
+; 4 and 16 need to be next to each other since they are used as alternates
+; depending on whether bitdepth is 10 or 12
+pw_4: times 8 dw 4
+pw_16: times 8 dw 16
+pw_8: times 8 dw 8
+pw_4096: times 8 dw 4096
+
+pb_mask: dd 1, 1, 2, 2
+
+SECTION .text
+
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+%define extra_stack 2
+%else
+%define extra_stack 0
+%endif
+%endif
+
+%macro RELOC_ARGS 2 ; h/v, off
+ASSERT ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ mov r5d, [rstk + stack_offset + 4*4 + 4]
+%define lstridem [esp+%2+0*gprsize]
+ mov lstridem, r5d
+ mov r5d, [rstk + stack_offset + 4*5 + 4]
+%define lutm [esp+%2+1*gprsize]
+ mov lutm, r5d
+ mov r5d, [rstk + stack_offset + 4*6 + 4]
+%ifidn %1, v
+%define wm [esp+%2+2*gprsize]
+ mov wm, r5d
+ mov r5d, [rstk + stack_offset + 4*3 + 4]
+%define lm [esp+%2+3*gprsize]
+ mov lm, r5d
+%else ; %1 == h
+%define hm [esp+%2+2*gprsize]
+ mov hm, r5d
+%endif ; %1==v
+ mov r5d, r7m
+%define bdmulm [esp+%2+4*gprsize]
+ mov bdmulm, r5d
+%else
+%define lstridem r4m
+%define lutm r5m
+%ifidn %1, v
+%define wm r6m
+%define lm r3m
+%else
+%define hm r6m
+%endif
+%define bdmulm r7m
+%endif ; STACK_ALIGNMENT
+%endmacro
+
+%macro UNRELOC_ARGS 0
+%if ARCH_X86_32
+%undef lm
+%undef lstridem
+%undef wm
+%undef hm
+%undef lutm
+%endif
+%endmacro
+
+%macro SPLATD 2
+ movd %1, %2
+ pshufd %1, %1, q0000
+%endmacro
+
+%macro SPLATW 2
+ movd %1, %2
+ pshuflw %1, %1, q0000
+ punpcklqdq %1, %1
+%endmacro
+
+; in: out:
+; mm%1 a b c d a e i m
+; mm%2 e f g h b f j n
+; mm%3 i j k l -> c g k o
+; mm%4 m n o p d h l p
+%macro TRANSPOSE4X4W 5
+ punpcklwd m%5, m%1, m%2
+ punpckhwd m%1, m%2
+ punpcklwd m%2, m%3, m%4
+ punpckhwd m%3, m%4
+ punpckldq m%4, m%5, m%2
+ punpckhdq m%5, m%2
+ punpckldq m%2, m%1, m%3
+ punpckhdq m%1, m%3
+
+ SWAP %1, %4
+ SWAP %2, %5, %3
+%endmacro
+
+; in: out:
+; m%1 a b c d e f g h a i q y 6 E M U
+; m%2 i j k l m n o p b j r z 7 F N V
+; m%3 q r s t u v w x c k s 0 8 G O W
+; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X
+; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y
+; m%6 E F G H I J K L f n v 3 B J R Z
+; m%7 M N O P Q R S T g o w 4 C K S +
+; m%8 U V W X Y Z + = h p x 5 D L T =
+%if ARCH_X86_64
+%macro TRANSPOSE8X8W 9
+ ; m%1 a b c d e f g h a i q y b j r z
+ ; m%2 i j k l m n o p c k s 0 d l t 1
+ ; m%3 q r s t u v w x -> e m u 2 f n v 3
+ ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5
+ TRANSPOSE4X4W %1, %2, %3, %4, %9
+
+ ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V
+ ; m%6 E F G H I J K L 8 G O W 9 H P X
+ ; m%7 M N O P Q R S T -> A I Q Y B J R Z
+ ; m%8 U V W X Y Z + = C K S + D L T =
+ TRANSPOSE4X4W %5, %6, %7, %8, %9
+
+ ; m%1 a i q y b j r z a i q y 6 E M U
+ ; m%2 c k s 0 d l t 1 b j r z 7 F N V
+ ; m%3 e m u 2 f n v 3 c k s 0 8 G O W
+ ; m%4 g o w 4 h p x 5 d l t 1 9 H P X
+ ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y
+ ; m%6 8 G O W 9 H P X f n v 3 B J R Z
+ ; m%7 A I Q Y B J R Z g o w 4 C K S +
+ ; m%8 C K S + D L T = h p x 5 D L T =
+ punpckhqdq m%9, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ punpckhqdq m%7, m%4, m%8
+ punpcklqdq m%4, m%8
+
+ SWAP %8, %7, %4, %5, %3, %2, %9
+%endmacro
+%else ; x86-32
+; input: 1-7 in registers, 8 in first memory [read-only]
+; second memory is scratch, and may overlap with first or third memory
+; output: 1-5,7-8 in registers, 6 in third memory [write-only]
+%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], in/out alignment (a/u) [2x]
+ TRANSPOSE4X4W %1, %2, %3, %4, %8
+%ifnidn %9, ""
+ mov%12 m%8, %9
+%else
+ mova m%8, %10
+%endif
+ mova %10, m%4
+ TRANSPOSE4X4W %5, %6, %7, %8, %4
+ punpckhqdq m%4, m%1, m%5
+ punpcklqdq m%1, m%5
+ punpckhqdq m%5, m%2, m%6
+ punpcklqdq m%2, m%6
+ punpckhqdq m%6, m%3, m%7
+ punpcklqdq m%3, m%7
+ mova m%7, %10
+%ifnidn %11, ""
+ mov%13 %11, m%6
+%else
+ mova %10, m%6
+%endif
+ punpckhqdq m%6, m%7, m%8
+ punpcklqdq m%7, m%8
+
+ ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
+ SWAP %2, %4, %5, %3
+ SWAP %6, %8
+%endmacro
+%endif ; x86-32/64
+
+; transpose and write m8-11, everything else is scratch
+%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
+ ; transpose 8x4
+ punpcklwd %5, %1, %2
+ punpckhwd %1, %2
+ punpcklwd %2, %3, %4
+ punpckhwd %3, %4
+ punpckldq %4, %5, %2
+ punpckhdq %5, %2
+ punpckldq %2, %1, %3
+ punpckhdq %1, %3
+
+ ; write out
+ movq [dstq+strideq*0-4], %4
+ movhps [dstq+strideq*1-4], %4
+ movq [dstq+strideq*2-4], %5
+ movhps [dstq+stride3q -4], %5
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], %2
+ movhps [dstq+strideq*1-4], %2
+ movq [dstq+strideq*2-4], %1
+ movhps [dstq+stride3q -4], %1
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+%if ARCH_X86_64
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+ mova P1, [dstq+mstrideq*2] ; p1
+ mova P0, [dstq+mstrideq*1] ; p0
+ mova Q0, [dstq+strideq*0] ; q0
+ mova Q1, [dstq+strideq*1] ; q1
+%else ; x86-32
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%endif ; x86-32/64
+%else ; %1 != 4
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+%if ARCH_X86_64
+ ; we load p3 later
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq+strideq*0]
+ mova Q1, [dstq+strideq*1]
+ mova Q2, [dstq+strideq*2]
+%if %1 != 6
+%define P3 [tmpq+strideq*0]
+%define Q3 m15
+ mova Q3, [dstq+stride3q]
+%endif ; %1 != 6
+%else ; x86-32
+%define P2 [tmpq+strideq*1]
+%define P1 [dstq+mstrideq*2]
+%define P0 [dstq+mstrideq*1]
+%define Q0 [dstq+strideq*0]
+%define Q1 [dstq+strideq*1]
+%define Q2 [dstq+strideq*2]
+%if %1 != 6
+%define P3 [dstq+mstrideq*4]
+%define Q3 [dstq+stride3q]
+%endif ; %1 != 6
+%endif ; x86-32/64
+%endif ; %1 ==/!= 4
+%else ; %2 != v
+ ; load lines
+%if %1 == 4
+ movq m0, [dstq+strideq*0-4]
+ movq m2, [dstq+strideq*1-4]
+ movq m4, [dstq+strideq*2-4]
+ movq m5, [dstq+stride3q -4]
+ lea tmpq, [dstq+strideq*4]
+ movq m3, [tmpq+strideq*0-4]
+ movq m6, [tmpq+strideq*1-4]
+ movq m1, [tmpq+strideq*2-4]
+ movq m7, [tmpq+stride3q -4]
+
+ ; transpose 4x8
+ ; m0: A-D0
+ ; m2: A-D1
+ ; m4: A-D2
+ ; m5: A-D3
+ ; m3: A-D4
+ ; m6: A-D5
+ ; m1: A-D6
+ ; m7: A-D7
+ punpcklwd m0, m2
+ punpcklwd m4, m5
+ punpcklwd m3, m6
+ punpcklwd m1, m7
+ ; m0: A0-1,B0-1,C0-1,D0-1
+ ; m4: A2-3,B2-3,C2-3,D2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m1: A6-7,B6-7,C6-7,D6-7
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ ; m0: A0-3,B0-3
+ ; m2: C0-3,D0-3
+ ; m3: A4-7,B4-7
+ ; m4: C4-7,D4-7
+ punpckhqdq m1, m0, m3
+ punpcklqdq m0, m3
+ punpckhqdq m3, m2, m4
+ punpcklqdq m2, m4
+ ; m0: A0-7
+ ; m1: B0-7
+ ; m2: C0-7
+ ; m3: D0-7
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 3, 11
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%else
+%define P1 [esp+3*mmsize]
+%define P0 [esp+4*mmsize]
+%define Q0 [esp+5*mmsize]
+%define Q1 [esp+6*mmsize]
+ mova P1, m0
+ mova P0, m1
+ mova Q0, m2
+ mova Q1, m3
+%endif
+%elif %1 == 6 || %1 == 8
+ movu m0, [dstq+strideq*0-8]
+ movu m1, [dstq+strideq*1-8]
+ movu m2, [dstq+strideq*2-8]
+ movu m3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu m4, [tmpq+strideq*0-8]
+ movu m5, [tmpq+strideq*1-8]
+ movu m6, [tmpq+strideq*2-8]
+%if ARCH_X86_64
+ movu m7, [tmpq+stride3q -8]
+%endif
+
+ ; transpose 8x16
+ ; m0: A-H0,A-H8
+ ; m1: A-H1,A-H9
+ ; m2: A-H2,A-H10
+ ; m3: A-H3,A-H11
+ ; m4: A-H4,A-H12
+ ; m5: A-H5,A-H13
+ ; m6: A-H6,A-H14
+ ; m7: A-H7,A-H15
+%if ARCH_X86_64
+ punpcklwd m8, m0, m1
+%else
+ punpcklwd m7, m0, m1
+%endif
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m5
+ punpckhwd m4, m5
+%if ARCH_X86_64
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+%else
+ mova [rsp+3*16], m4
+ movu m4, [tmpq+stride3q -8]
+ punpcklwd m5, m6, m4
+ punpckhwd m6, m4
+%endif
+ ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32]
+ ; m0: E0-1,F0-1,G0-1,H0-1
+ ; m1: A2-3,B2-3,C2-3,D2-3
+ ; m2: E2-3,F2-3,G2-3,H2-3
+ ; m3: A4-5,B4-5,C4-5,D4-5
+ ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32]
+ ; m5: A6-7,B6-7,C6-7,D6-7
+ ; m6: E6-7,F6-7,G6-7,H6-7
+%if ARCH_X86_64
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+%else
+ punpckldq m4, m7, m1
+ punpckhdq m7, m1
+%endif
+ punpckldq m1, m0, m2
+ punpckhdq m0, m2
+ punpckldq m2, m3, m5
+ punpckhdq m3, m5
+%if ARCH_X86_64
+ punpckldq m5, m4, m6
+ punpckhdq m4, m6
+%else
+ mova [rsp+4*16], m3
+ mova m3, [rsp+3*16]
+ punpckldq m5, m3, m6
+ punpckhdq m3, m6
+%endif
+ ; m7: A0-3,B0-3 [m4 on x86-32]
+ ; m8: C0-3,D0-3 [m7 on x86-32]
+ ; m1: E0-3,F0-3
+ ; m0: G0-3,H0-3
+ ; m2: A4-7,B4-7
+ ; m3: C4-7,D4-7 [r4 on x86-32]
+ ; m5: E4-7,F4-7
+ ; m4: G4-7,H4-7 [m3 on x86-32]
+%if ARCH_X86_64
+%if %1 != 6
+ punpcklqdq m6, m7, m2
+%endif
+ punpckhqdq m7, m2
+ punpcklqdq m2, m8, m3
+ punpckhqdq m8, m3
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+%if %1 != 6
+ punpckhqdq m5, m0, m4
+%endif
+ punpcklqdq m0, m4
+%if %1 == 8
+ mova [rsp+1*16], m6
+%define P3 [rsp+1*16]
+%endif
+ ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15
+ SWAP 7, 13
+ SWAP 8, 2, 9
+ SWAP 3, 10
+ SWAP 1, 11
+ SWAP 0, 14
+ SWAP 5, 15
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%if %1 == 8
+%define Q3 m15
+%endif
+%else ; x86-32
+%if %1 == 8
+%define P3 [rsp+ 6*16]
+ punpcklqdq m6, m4, m2
+ mova P3, m6
+%endif
+ mova m6, [rsp+4*16]
+ punpckhqdq m4, m2
+ punpcklqdq m2, m7, m6
+ punpckhqdq m7, m6
+ punpcklqdq m6, m1, m5
+ punpckhqdq m1, m5
+%if %1 == 8
+%define Q3 [rsp+24*16]
+ punpckhqdq m5, m0, m3
+ mova Q3, m5
+%endif
+ punpcklqdq m0, m3
+%if %1 == 8
+%define P2 [rsp+18*16]
+%define P1 [rsp+19*16]
+%define P0 [rsp+20*16]
+%define Q0 [rsp+21*16]
+%define Q1 [rsp+22*16]
+%define Q2 [rsp+23*16]
+%else
+%define P2 [rsp+3*16]
+%define P1 [rsp+4*16]
+%define P0 [rsp+5*16]
+%define Q0 [rsp+6*16]
+%define Q1 [rsp+7*16]
+%define Q2 [rsp+8*16]
+%endif
+ mova P2, m4
+ mova P1, m2
+ mova P0, m7
+ mova Q0, m6
+ mova Q1, m1
+ mova Q2, m0
+%endif ; x86-32/64
+%else ; %1 == 16
+ ; We only use 14 pixels but we'll need the remainder at the end for
+ ; the second transpose
+ mova m0, [dstq+strideq*0-16]
+ mova m1, [dstq+strideq*1-16]
+ mova m2, [dstq+strideq*2-16]
+ mova m3, [dstq+stride3q -16]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0-16]
+ mova m5, [tmpq+strideq*1-16]
+ mova m6, [tmpq+strideq*2-16]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q -16]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8
+ SWAP 5, 13
+ SWAP 6, 8
+ SWAP 7, 9
+%define P2 m13
+%define P1 m8
+%define P0 m9
+%else ; x86-32
+%define P2 [esp+18*16]
+%define P1 [esp+19*16]
+%define P0 [esp+20*16]
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q -16], P2, "", a, a
+ mova P1, m6
+ mova P0, m7
+%endif ; x86-32/64
+ mova [rsp+ 7*16], m0
+ mova [rsp+ 8*16], m1
+ mova [rsp+ 9*16], m2
+ mova [rsp+10*16], m3
+%define P3 [rsp+6*16]
+ mova P3, m4
+
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ mova m2, [dstq+strideq*2]
+ mova m3, [dstq+stride3q ]
+ lea tmpq, [dstq+strideq*4]
+ mova m4, [tmpq+strideq*0]
+ mova m5, [tmpq+strideq*1]
+ mova m6, [tmpq+strideq*2]
+%if ARCH_X86_64
+ mova m7, [tmpq+stride3q ]
+
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10
+ SWAP 0, 10
+ SWAP 1, 11
+ SWAP 2, 14
+ SWAP 3, 15
+%define Q0 m10
+%define Q1 m11
+%define Q2 m14
+%define Q3 m15
+%else ; x86-32
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \
+ [tmpq+stride3q ], [rsp+12*16], "", a, a
+%define Q0 [esp+21*16]
+%define Q1 [esp+22*16]
+%define Q2 [esp+23*16]
+%define Q3 [esp+24*16]
+ mova Q0, m0
+ mova Q1, m1
+ mova Q2, m2
+ mova Q3, m3
+%endif ; x86-32/64
+
+ mova [rsp+11*16], m4
+%if ARCH_X86_64
+ mova [rsp+12*16], m5
+%endif
+ mova [rsp+13*16], m6
+ mova [rsp+14*16], m7
+%endif ; %1 == 4/6/8/16
+%endif ; %2 ==/!= v
+
+ ; load L/E/I/H
+%if ARCH_X86_32
+%define l_strideq r5
+ mov l_strideq, dword lstridem
+%ifidn %2, v
+%define lq r3
+ mov lq, dword lm
+%endif
+%endif
+%ifidn %2, v
+%if cpuflag(sse4)
+ pmovzxbw m1, [lq]
+ pmovzxbw m0, [lq+l_strideq]
+ pxor m2, m2
+%else ; ssse3
+ movq m1, [lq]
+ movq m0, [lq+l_strideq]
+ pxor m2, m2
+ REPX {punpcklbw x, m2}, m1, m0
+%endif ; ssse3/sse4
+%else ; %2 != v
+ movq m0, [lq] ; l0, l1
+ movq m1, [lq+l_strideq] ; l2, l3
+ punpckldq m0, m1 ; l0, l2, l1, l3
+ pxor m2, m2
+ punpcklbw m1, m0, m2 ; l0, l2
+ punpckhbw m0, m2 ; l1, l3
+%endif ; %2==/!=v
+%if ARCH_X86_32
+%ifidn %2, v
+%undef lq
+ mov mstrideq, mstridem
+%endif
+%endif
+ pcmpeqw m5, m2, m0
+ pand m1, m5
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1]
+ pcmpeqw m5, m2, m0 ; !L
+ psrlw m5, 1
+%if ARCH_X86_64
+ psrlw m2, m0, [lutq+128]
+ SPLATW m1, [lutq+136]
+%else ; x86-32
+ mov r5, lutm
+ psrlw m2, m0, [r5+128]
+ SPLATW m1, [r5+136]
+%endif ; x86-32/64
+ pminsw m2, m1
+ pmaxsw m2, [PIC_sym(pw_1)] ; I
+ psrlw m1, m0, 4 ; H
+ paddw m0, [PIC_sym(pw_2)]
+ paddw m0, m0
+ paddw m0, m2 ; E
+ REPX {pmullw x, [bdmulq]}, m0, m1, m2
+%if ARCH_X86_32
+%undef l_strideq
+ lea stride3q, [strideq*3]
+%endif
+
+ psubw m3, P1, P0 ; p1-p0
+ psubw m4, Q0, Q1 ; q0-q1
+ REPX {pabsw x, x}, m3, m4
+ pmaxsw m3, m5
+ pmaxsw m3, m4
+ pcmpgtw m7, m3, m1 ; hev
+%if %1 != 4
+ psubw m4, P2, P0 ; p2-p0
+ pabsw m4, m4
+ pmaxsw m4, m3
+%if %1 != 6
+ mova m6, P3 ; p3
+ psubw m5, m6, P0 ; p3-p0
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ psubw m5, Q0, Q2 ; q0-q2
+ pabsw m5, m5
+ pmaxsw m4, m5
+%if %1 != 6
+ psubw m5, Q0, Q3 ; q0-q3
+ pabsw m5, m5
+ pmaxsw m4, m5
+%endif ; %1 != 6
+ pcmpgtw m4, [bdmulq] ; !flat8in
+
+ psubw m5, P2, P1 ; p2-p1
+ pabsw m5, m5
+%if %1 != 6
+ psubw m6, P2 ; p3-p2
+ pabsw m6, m6
+ pmaxsw m5, m6
+ psubw m6, Q2, Q3 ; q2-q3
+ pabsw m6, m6
+ pmaxsw m5, m6
+%endif ; %1 != 6
+ psubw m6, Q2, Q1 ; q2-q1
+ pabsw m6, m6
+ pmaxsw m5, m6
+
+%if %1 == 16
+ SPLATD m6, [maskq+8]
+ SPLATD m1, [maskq+4]
+ por m6, m1
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6
+%else ; %1 != 16
+ SPLATD m6, [maskq+4]
+ pand m6, m12
+ pcmpeqd m6, m12
+ pand m5, m6 ; only apply fm-wide to wd>4 blocks
+%endif ; %1==/!=16
+ pmaxsw m3, m5
+%endif ; %1 != 4
+ pcmpgtw m3, m2
+
+ psubw m5, P1, Q1 ; p1-q1
+ psubw m6, P0, Q0 ; p0-q0
+ REPX {pabsw x, x}, m5, m6
+ paddw m6, m6
+ psrlw m5, 1
+ paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m3, m5
+
+%if %1 == 16
+
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+ mova m1, [tmpq+strideq*2]
+ mova m2, [tmpq+stride3q]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m1, [rsp+ 9*16]
+ mova m2, [rsp+10*16]
+%endif ; %2==/!=v
+ REPX {psubw x, P0}, m0, m1, m2
+ REPX {pabsw x, x}, m0, m1, m2
+ pmaxsw m1, m0
+ pmaxsw m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+ mova m2, [tmpq+strideq*1]
+ mova m5, [tmpq+strideq*2]
+%else ; %2 != v
+ mova m0, [rsp+11*16]
+ mova m2, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%endif ; %2==/!=v
+ REPX {psubw x, Q0}, m0, m2, m5
+ REPX {pabsw x, x}, m0, m2, m5
+ pmaxsw m0, m2
+ pmaxsw m1, m5
+ pmaxsw m1, m0
+ pcmpgtw m1, [bdmulq] ; !flat8out
+ por m1, m4 ; !flat8in | !flat8out
+ SPLATD m2, [maskq+8]
+ pand m5, m2, m12
+ pcmpeqd m5, m12
+ pandn m1, m5 ; flat16
+ pandn m5, m3, m1 ; flat16 & fm
+ SWAP 1, 5
+
+ SPLATD m5, [maskq+4]
+ por m5, m2
+ pand m2, m5, m12
+ pcmpeqd m2, m12
+ pandn m4, m2 ; flat8in
+ pandn m2, m3, m4
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m2, m5
+ pand m2, m12
+ pcmpeqd m2, m12
+ pandn m3, m2
+ pandn m0, m4, m3 ; fm & !flat8 & !flat16
+ SWAP 0, 3
+ pandn m0, m1, m4 ; flat8 & !flat16
+ SWAP 0, 4
+%elif %1 != 4
+ SPLATD m0, [maskq+4]
+ pand m2, m0, m12
+ pcmpeqd m2, m12
+ pandn m4, m2
+ pandn m2, m3, m4 ; flat8 & fm
+ SWAP 2, 4
+ SPLATD m2, [maskq+0]
+ por m0, m2
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0
+ pandn m0, m4, m3 ; fm & !flat8
+ SWAP 0, 3
+%else ; %1 == 4
+ SPLATD m0, [maskq+0]
+ pand m0, m12
+ pcmpeqd m0, m12
+ pandn m3, m0 ; fm
+%endif ; %1==/!=4
+
+ ; short filter
+%if ARCH_X86_64
+ SPLATW m0, r7m
+%else
+ SPLATW m0, bdmulm
+%endif
+ pcmpeqw m2, m2
+ psrlw m0, 1 ; 511 or 2047
+ pxor m2, m0 ; -512 or -2048
+
+ psubw m5, Q0, P0 ; q0-p0
+ paddw m6, m5, m5
+ paddw m6, m5 ; 3*(q0-p0)
+ psubw m5, P1, Q1 ; iclip_diff(p1-q1)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m5, m7 ; f=iclip_diff(p1-q1)&hev
+ paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f)
+ pminsw m5, m0
+ pmaxsw m5, m2
+ pand m3, m5 ; f&=fm
+ paddw m5, m3, [PIC_sym(pw_3)]
+ paddw m3, [PIC_sym(pw_4)]
+ REPX {pminsw x, m0}, m5, m3
+ psraw m5, 3 ; f2
+ psraw m3, 3 ; f1
+ psubw m0, m2 ; 1023 or 4095
+ pxor m2, m2
+%if ARCH_X86_64
+ paddw P0, m5
+ psubw Q0, m3
+%else
+ paddw m5, P0
+ psubw m6, Q0, m3
+ REPX {pminsw x, m0}, m5, m6
+ REPX {pmaxsw x, m2}, m5, m6
+%endif
+
+ paddw m3, [PIC_sym(pw_1)]
+ psraw m3, 1 ; f=(f1+1)>>1
+ pandn m7, m3 ; f&=!hev
+ SWAP 7, 3
+%if ARCH_X86_64
+ paddw P1, m3
+ psubw Q1, m3
+ REPX {pminsw x, m0}, P1, P0, Q0, Q1
+ REPX {pmaxsw x, m2}, P1, P0, Q0, Q1
+%else
+ psubw m7, Q1, m3
+ paddw m3, P1
+ REPX {pminsw x, m0}, m7, m3
+ REPX {pmaxsw x, m2}, m7, m3
+%if %1 > 4
+ mova P1, m3
+ mova P0, m5
+ mova Q0, m6
+ mova Q1, m7
+%endif
+%endif
+
+%if %1 == 16
+
+; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16
+; m12=filter bits mask
+; m13-15=p2/q2/q3
+; m0,2-3,5-7 = free
+
+ ; flat16 filter
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+ mova m6, [tmpq+strideq*4] ; p3
+ lea tmpq, [dstq+mstrideq*4]
+%else ; %2 != v
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m7, [rsp+10*16]
+ mova m6, [rsp+ 6*16]
+%endif ; %2==/!=v
+
+ mova [rsp+ 0*16], m4
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psllw m3, m0, 3 ; p6*8
+ paddw m3, [PIC_sym(pw_8)]
+ paddw m5, m2, m7 ; p5+p4
+ psubw m3, m0
+ paddw m5, m5 ; (p5+p4)*2
+ paddw m3, m6 ; p6*7+p3
+ paddw m5, P2 ; (p5+p4)*2+p2
+ paddw m3, P1 ; p6*7+p3+p1
+ paddw m5, P0 ; (p5+p4)*2+p2+p0
+ paddw m3, Q0 ; p6*7+p3+p1+q0
+ paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m2
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*2], m5 ; p5
+%else ; %2 != v
+ mova [rsp+9*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6*2, add p3/q1
+ paddw m3, m6
+ paddw m5, m0, m0
+ paddw m3, Q1
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m7
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+mstrideq*1], m5 ; p4
+%else ; %2 != v
+ mova [rsp+10*16], m5
+%endif ; %2==/!=v
+
+ ; sub p6/p5, add p2/q2
+ psubw m3, m0
+ paddw m5, P2, Q2
+ psubw m3, m2
+ paddw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, m6
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; p3
+%else ; %2 != v
+ mova [rsp+6*16], m5
+%endif ; %2==/!=v
+
+%define WRITE_IN_PLACE 0
+%ifidn %2, v
+%if ARCH_X86_64
+%define WRITE_IN_PLACE 1
+%endif
+%endif
+
+ ; sub p6/p4, add p1/q3
+ paddw m3, P1
+ paddw m5, m0, m7
+ paddw m3, Q3
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P2
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+strideq*1], m5
+%else
+ mova [rsp+1*16], m5 ; don't clobber p2/m13
+%endif
+
+ ; sub p6/p3, add p0/q4
+ paddw m3, P0
+ paddw m5, m0, m6
+%ifidn %2, v
+ paddw m3, [dstq+strideq*4]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P1
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*2], m5
+%else
+ mova [rsp+2*16], m5 ; don't clobber p1/m3
+%endif
+
+ ; sub p6/p2, add q0/q5
+ paddw m3, Q0
+ paddw m5, m0, P2
+%ifidn %2, v
+%if ARCH_X86_32
+ lea r4, P2
+%endif
+ lea tmpq, [dstq+strideq*4]
+ paddw m3, [tmpq+strideq*1]
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+%endif ; %2==/!=v
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, P0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*1], m5
+%else
+ mova [rsp+3*16], m5 ; don't clobber p0/m4
+%endif
+
+ ; sub p6/p1, add q1/q6
+ paddw m3, Q1
+ paddw m5, m0, P1
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else ; %2 != v
+ mova m0, [rsp+13*16] ; q6
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq], m5
+%else
+ mova [rsp+4*16], m5 ; don't clobber q0/m5
+%endif
+
+ ; sub p5/p0, add q2/q6
+ paddw m3, Q2
+ paddw m5, m2, P0
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q1
+ por m2, m5, m4 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m3, Q3
+ paddw m7, Q0
+ paddw m3, m0
+ psubw m3, m7
+ psrlw m7, m3, 4
+ pand m7, m1
+ pandn m4, m1, Q2
+ por m7, m4 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*0]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ paddw m6, Q1
+ paddw m3, m0
+ psubw m3, m6
+ psrlw m6, m3, 4
+ pand m6, m1
+ pandn m4, m1, Q3
+ por m6, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+mstrideq], m6 ; q3
+%else ; %2 != v
+ mova [rsp+5*16], m6
+%endif ; %2==/!=v
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*1]
+%if ARCH_X86_64
+ paddw m5, P2, Q2
+%else
+ ; tmpq is clobbered, so we use a backup pointer for P2 instead
+ paddw m5, [r4], Q2
+ mov pic_regq, pic_regm
+%endif
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+ paddw m5, P2, Q2
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*0]
+%else ; %2 != v
+ pandn m4, m1, [rsp+11*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; q4
+%else ; %2 != v
+ mova [rsp+11*16], m5
+%endif ; %2==/!=v
+
+ ; sub p1/q3, add q6*2
+ psubw m3, P1
+ paddw m0, m0
+ psubw m3, Q3
+ paddw m3, m0
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*1]
+%else ; %2 != v
+ pandn m4, m1, [rsp+12*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*1], m5 ; q5
+%else ; %2 != v
+ mova [rsp+12*16], m5
+%endif ; %2==/!=v
+
+ mova m4, [rsp+0*16]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%if ARCH_X86_64
+ SWAP 2, 11
+ SWAP 7, 14
+ SWAP 6, 15
+%else ; x86-32
+ mova Q1, m2
+ mova Q2, m7
+%endif ; x86-32/64
+%if WRITE_IN_PLACE
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq]
+%elif ARCH_X86_64
+ mova P2, [rsp+1*16]
+ mova P1, [rsp+2*16]
+ mova P0, [rsp+3*16]
+ mova Q0, [rsp+4*16]
+%else ; !WRITE_IN_PLACE & x86-32
+ mova m0, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m3, [rsp+4*16]
+ mova m7, [rsp+5*16]
+ mova P2, m0
+ mova P1, m1
+ mova P0, m2
+ mova Q0, m3
+ mova Q3, m7
+%endif ; WRITE_IN_PLACE / x86-32/64
+%undef WRITE_IN_PLACE
+%endif ; %1 == 16
+
+%if %1 >= 8
+
+ ; flat8 filter
+ mova m0, P3 ; p3
+ paddw m1, m0, P2 ; p3+p2
+ paddw m2, P1, P0 ; p1+p0
+ paddw m3, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m3, Q0 ; 2*(p3+p2)+q0
+ paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m7, m2, [PIC_sym(pw_4096)]
+ psubw m7, P2
+ pand m7, m4
+
+ paddw m3, P1, Q1 ; p1+q1
+ psubw m2, m1 ; 2*p3+p2+p1+p0+q0
+ paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1
+ pmulhrsw m3, m2, [PIC_sym(pw_4096)]
+ psubw m3, P1
+ pand m3, m4
+
+ paddw m5, m0, P1 ; p3+p1
+ paddw m6, P0, Q2 ; p0+q2
+ psubw m2, m5 ; p3+p2+p1+p0+q0+q1
+ paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2
+ pmulhrsw m5, m2, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m6, m0, P0 ; p3+p0
+ paddw m1, Q0, Q3 ; q0+q3
+ psubw m2, m6 ; p2+p1+p0+q0+q1+q2
+ paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3
+ pmulhrsw m6, m2, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3
+ paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3
+ paddw m1, P2, Q0 ; p2+q0
+ psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3
+ pmulhrsw m1, m2, [PIC_sym(pw_4096)]
+ psubw m1, Q1
+ pand m1, m4
+
+ psubw m2, P1 ; p0+q0+2*q1+q2+2*q3
+ psubw m2, Q1 ; p0+q0+q1+q2+2*q3
+ paddw m0, Q3, Q2 ; q3+q2
+ paddw m2, m0 ; p0+q0+q1+2*q2+3*q3
+ pmulhrsw m2, [PIC_sym(pw_4096)]
+ psubw m2, Q2
+ pand m2, m4
+
+ paddw m7, P2
+ paddw m3, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m1, Q1
+ paddw m2, Q2
+
+%ifidn %2, v
+ mova [tmpq+strideq*1], m7 ; p2
+ mova [tmpq+strideq*2], m3 ; p1
+ mova [tmpq+stride3q ], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m1 ; q1
+ mova [dstq+strideq*2], m2 ; q2
+%else ; %2 != v
+ mova m0, P3
+
+%if %1 == 8
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ SWAP 4, 15
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8
+%else
+ TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \
+ Q3, [tmpq+strideq*1-8], a, u
+%endif
+
+ ; write 8x8
+ movu [dstq+strideq*0-8], m0
+ movu [dstq+strideq*1-8], m7
+ movu [dstq+strideq*2-8], m3
+ movu [dstq+stride3q -8], m5
+ movu [tmpq+strideq*0-8], m6
+%if ARCH_X86_64
+ movu [tmpq+strideq*1-8], m1
+%endif
+ movu [tmpq+strideq*2-8], m2
+ movu [tmpq+stride3q -8], m4
+ lea dstq, [dstq+strideq*8]
+%else ; %1 != 8
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+%else
+ mova [rsp+1*16], m6
+ mova [rsp+2*16], m1
+ mova [rsp+3*16], m2
+%endif
+
+ mova m1, [rsp+ 7*16]
+ mova m2, [rsp+ 8*16]
+ mova m4, [rsp+ 9*16]
+ mova m6, [rsp+10*16]
+ lea tmpq, [dstq+strideq*4]
+%if ARCH_X86_64
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11
+%else
+ mova [rsp+7*16], m5
+ TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \
+ [rsp+7*16], [tmpq+strideq*1-16], a, a
+%endif
+
+ mova [dstq+strideq*0-16], m1
+ mova [dstq+strideq*1-16], m2
+ mova [dstq+strideq*2-16], m4
+ mova [dstq+stride3q -16], m6
+ mova [tmpq+strideq*0-16], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1-16], m7
+%endif
+ mova [tmpq+strideq*2-16], m3
+ mova [tmpq+stride3q -16], m5
+
+%if ARCH_X86_64
+ SWAP 6, 8
+ SWAP 1, 9
+ SWAP 2, 10
+ SWAP 4, 15
+%else
+ mova m6, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m4, Q3
+%endif
+ mova m0, [rsp+11*16]
+ mova m3, [rsp+12*16]
+ mova m5, [rsp+13*16]
+%if ARCH_X86_64
+ mova m7, [rsp+14*16]
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8
+%else
+ TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \
+ [rsp+14*16], [tmpq+strideq*1], a, a
+%endif
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m4
+ mova [tmpq+strideq*0], m0
+%if ARCH_X86_64
+ mova [tmpq+strideq*1], m3
+%endif
+ mova [tmpq+strideq*2], m5
+ mova [tmpq+stride3q ], m7
+ lea dstq, [dstq+strideq*8]
+%endif ; %1==/!=8
+%endif ; %2==/!=v
+%elif %1 == 6
+ ; flat6 filter
+ paddw m3, P1, P0 ; p1+p0
+ paddw m3, P2 ; p2+p1+p0
+ paddw m6, P2, Q0 ; p2+q0
+ paddw m3, m3 ; 2*(p2+p1+p0)
+ paddw m3, m6 ; p2+2*(p2+p1+p0)+q0
+ pmulhrsw m2, m3, [PIC_sym(pw_4096)]
+ psubw m2, P1
+ pand m2, m4
+
+ paddw m3, Q0 ; p2+2*(p2+p1+p0+q0)
+ paddw m6, P2, P2 ; 2*p2
+ paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1
+ psubw m3, m6 ; p2+2*(p1+p0+q0)+q1
+ pmulhrsw m5, m3, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand m5, m4
+
+ paddw m3, Q1 ; p2+2*(p1+p0+q0+q1)
+ paddw m6, P2, P1 ; p2+p1
+ paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2
+ psubw m3, m6 ; p1+2*(p0+q0+q1)+q2
+ pmulhrsw m6, m3, [PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ psubw m3, P1 ; 2*(p0+q0+q1)+q2
+%if ARCH_X86_64
+ paddw Q2, Q2 ; q2*2
+%else
+ mova m0, Q2
+ paddw m0, m0
+%endif
+ psubw m3, P0 ; p0+2*(q0+q1)+q2
+%if ARCH_X86_64
+ paddw m3, Q2 ; p0+2*(q0+q1+q2)+q2
+%else
+ paddw m3, m0
+%endif
+ pmulhrsw m3, [PIC_sym(pw_4096)]
+ psubw m3, Q1
+ pand m3, m4
+
+ paddw m2, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m3, Q1
+
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m2 ; p1
+ mova [dstq+mstrideq*1], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m3 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
+%endif ; %2==/!=v
+%else ; %1 == 4
+%if ARCH_X86_64
+%ifidn %2, v
+ mova [dstq+mstrideq*2], P1 ; p1
+ mova [dstq+mstrideq*1], P0 ; p0
+ mova [dstq+strideq*0], Q0 ; q0
+ mova [dstq+strideq*1], Q1 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
+%endif ; %2==/!=v
+%else ; x86-32
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m3
+ mova [dstq+mstrideq*1], m5
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m7
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
+%endif ; %2==/!=v
+%endif ; x86-32/64
+%endif ; %1
+%undef P3
+%undef P2
+%undef P1
+%undef P0
+%undef Q0
+%undef Q1
+%undef Q2
+%undef Q3
+%endmacro
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16 code
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r1-4 - p2-q0 post-filter16
+; r5 - p3
+; r6 - q3 post-filter16
+; r7 - GPRs [mask_bitsm, mstridem]
+; r8 - m12/pb_mask
+; r9 - bdmulq
+cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 10*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+%define pic_regm dword [esp+7*16+2*gprsize]
+ mov pic_regm, pic_regq
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+9*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mstridem dword [esp+7*16+1*gprsize]
+ mov mstridem, mstrideq
+%define mask_bitsm dword [esp+7*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+8*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16
+; r1-4 - p2-q0 post-filter16 backup
+; r5 - q3 post-filter16 backup
+; r6 - p3
+; r7-10 - p7-4
+; r11-14 - q4-7
+%if ARCH_X86_64
+cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r15 - GPRs [mask_bitsm]
+; r16 - m12/pb_mask
+; r17 - bdmulq
+; r18-24 - p2-q3
+cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 25*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+17*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+15*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+16*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm, mstridem]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 3*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+%define mstridem dword [esp+1*gprsize]
+ mov mask_bitsm, 0x3
+ mov mstridem, mstrideq
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov hd, hm
+ shl l_strideq, 2
+%else
+; stack layout [32bit only]:
+; r0 - GPRs [mask_bitsm]
+; r1 - m12/pb_mask
+; r2 - bdmulq
+; r3-8 - p2-q2
+cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \
+ dst, stride, mask, l, pic_reg, stride3, tmp
+ RELOC_ARGS h, 9*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+2*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+%endif
+ sub lq, 4
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mask_bitsm dword [esp+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+1*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .no_filter
+
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ lea lq, [lq+l_strideq*2]
+ shl mask_bitsd, 2
+ sub hd, 2
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add lq, dword lstridem
+ add lq, dword lstridem
+ shl mask_bitsm, 2
+ sub dword hm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
diff --git a/third_party/dav1d/src/x86/loopfilter_avx2.asm b/third_party/dav1d/src/x86/loopfilter_avx2.asm
new file mode 100644
index 0000000000..84696c758a
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx2.asm
@@ -0,0 +1,1569 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 16 db 7, 1
+pb_3_1: times 16 db 3, 1
+pb_2_1: times 16 db 2, 1
+pb_m1_0: times 16 db -1, 0
+pb_m1_1: times 16 db -1, 1
+pb_m1_2: times 16 db -1, 2
+pb_1: times 32 db 1
+pb_2: times 32 db 2
+pb_3: times 32 db 3
+pb_4: times 32 db 4
+pb_16: times 32 db 16
+pb_63: times 32 db 63
+pb_64: times 32 db 64
+pb_128: times 32 db 0x80
+pb_129: times 32 db 0x81
+pb_240: times 32 db 0xf0
+pb_248: times 32 db 0xf8
+pb_254: times 32 db 0xfe
+
+pw_2048: times 16 dw 2048
+pw_4096: times 16 dw 4096
+
+pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128
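+; pb_mask holds one bit per dword lane; the FILTER macro broadcasts a vmask dword,
+; then pand/pcmpeqd against pb_mask expands each set bit into a full 4-byte lane mask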
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+ ; transpose 16x4
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+
+ ; write out
+ movd [dstq+strideq*0-2], xm%4
+ pextrd [dstq+strideq*1-2], xm%4, 1
+ pextrd [dstq+strideq*2-2], xm%4, 2
+ pextrd [dstq+stride3q-2], xm%4, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%5
+ pextrd [dstq+strideq*1-2], xm%5, 1
+ pextrd [dstq+strideq*2-2], xm%5, 2
+ pextrd [dstq+stride3q-2], xm%5, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%2
+ pextrd [dstq+strideq*1-2], xm%2, 1
+ pextrd [dstq+strideq*2-2], xm%2, 2
+ pextrd [dstq+stride3q-2], xm%2, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%1
+ pextrd [dstq+strideq*1-2], xm%1, 1
+ pextrd [dstq+strideq*2-2], xm%1, 2
+ pextrd [dstq+stride3q-2], xm%1, 3
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm%4, m%4, 1
+ vextracti128 xm%5, m%5, 1
+ vextracti128 xm%2, m%2, 1
+ vextracti128 xm%1, m%1, 1
+
+ movd [dstq+strideq*0-2], xm%4
+ pextrd [dstq+strideq*1-2], xm%4, 1
+ pextrd [dstq+strideq*2-2], xm%4, 2
+ pextrd [dstq+stride3q-2], xm%4, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%5
+ pextrd [dstq+strideq*1-2], xm%5, 1
+ pextrd [dstq+strideq*2-2], xm%5, 2
+ pextrd [dstq+stride3q-2], xm%5, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%2
+ pextrd [dstq+strideq*1-2], xm%2, 1
+ pextrd [dstq+strideq*2-2], xm%2, 2
+ pextrd [dstq+stride3q-2], xm%2, 3
+ lea dstq, [dstq+strideq*4]
+ movd [dstq+strideq*0-2], xm%1
+ pextrd [dstq+strideq*1-2], xm%1, 1
+ pextrd [dstq+strideq*2-2], xm%1, 2
+ pextrd [dstq+stride3q-2], xm%1, 3
+ lea dstq, [dstq+strideq*4]
+%endmacro
+
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+ mova %3, m15
+%endif
+
+ ; input in m0-15
+ punpcklbw m15, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m12, m13
+ punpckhbw m12, m13
+ mova m13, %3
+ mova %3, m12
+ punpcklbw m12, m14, m13
+ punpckhbw m13, m14, m13
+
+ ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
+ punpcklwd m14, m15, m1
+ punpckhwd m15, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ mova m12, %3
+ mova %3, m11
+ punpcklwd m11, m12, m13
+ punpckhwd m12, m13
+
+ ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+ punpckldq m13, m14, m2
+ punpckhdq m14, m2
+ punpckldq m2, m15, m3
+ punpckhdq m15, m3
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ punpckldq m5, m0, m4
+ punpckhdq m0, m4
+ punpckldq m4, m6, m10
+ punpckhdq m6, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m12
+ punpckhdq m8, m12
+ mova m12, %3
+ mova %3, m8
+ punpckldq m8, m7, m12
+ punpckhdq m7, m12
+
+ ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m12, m13, m4
+ punpckhqdq m13, m4
+ punpcklqdq m4, m14, m6
+ punpckhqdq m14, m6
+ punpcklqdq m6, m2, m8
+ punpckhqdq m2, m8
+ punpcklqdq m8, m15, m7
+ punpckhqdq m15, m7
+ punpcklqdq m7, m3, m10
+ punpckhqdq m3, m10
+ punpcklqdq m10, m1, m9
+ punpckhqdq m1, m9
+ punpcklqdq m9, m5, m11
+ punpckhqdq m5, m11
+ mova m11, %3
+ mova %3, m12
+ punpcklqdq m12, m0, m11
+ punpckhqdq m0, m11
+%if %2 == 0
+ mova m11, %3
+%endif
+
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
+ SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
+ SWAP 3, 14, 12, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
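+    ; after the loads below the pixels sit in m3=p1, m4=p0, m5=q0, m6=q1,
+    ; plus m13=p2/m14=q2 for wd>=6 and m12=p3/m15=q3 for wd>=8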
+ ; load data
+%ifidn %2, v
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 != 6
+ mova m12, [tmpq+strideq*0]
+%endif
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+%endif
+%else
+ ; load lines
+%if %1 == 4
+ movd xm3, [dstq+strideq*0-2]
+ movd xm4, [dstq+strideq*1-2]
+ movd xm5, [dstq+strideq*2-2]
+ movd xm6, [dstq+stride3q -2]
+ lea tmpq, [dstq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 2
+ pinsrd xm4, [tmpq+strideq*1-2], 2
+ pinsrd xm5, [tmpq+strideq*2-2], 2
+ pinsrd xm6, [tmpq+stride3q -2], 2
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 1
+ pinsrd xm4, [tmpq+strideq*1-2], 1
+ pinsrd xm5, [tmpq+strideq*2-2], 1
+ pinsrd xm6, [tmpq+stride3q -2], 1
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm3, [tmpq+strideq*0-2], 3
+ pinsrd xm4, [tmpq+strideq*1-2], 3
+ pinsrd xm5, [tmpq+strideq*2-2], 3
+ pinsrd xm6, [tmpq+stride3q -2], 3
+ lea tmpq, [tmpq+strideq*4]
+ movd xm12, [tmpq+strideq*0-2]
+ movd xm13, [tmpq+strideq*1-2]
+ movd xm14, [tmpq+strideq*2-2]
+ movd xm15, [tmpq+stride3q -2]
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 2
+ pinsrd xm13, [tmpq+strideq*1-2], 2
+ pinsrd xm14, [tmpq+strideq*2-2], 2
+ pinsrd xm15, [tmpq+stride3q -2], 2
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 1
+ pinsrd xm13, [tmpq+strideq*1-2], 1
+ pinsrd xm14, [tmpq+strideq*2-2], 1
+ pinsrd xm15, [tmpq+stride3q -2], 1
+ lea tmpq, [tmpq+strideq*4]
+ pinsrd xm12, [tmpq+strideq*0-2], 3
+ pinsrd xm13, [tmpq+strideq*1-2], 3
+ pinsrd xm14, [tmpq+strideq*2-2], 3
+ pinsrd xm15, [tmpq+stride3q -2], 3
+ vinserti128 m3, xm12, 1
+ vinserti128 m4, xm13, 1
+ vinserti128 m5, xm14, 1
+ vinserti128 m6, xm15, 1
+
+ ; transpose 4x16
+ ; xm3: A-D0,A-D8,A-D4,A-D12
+ ; xm4: A-D1,A-D9,A-D5,A-D13
+ ; xm5: A-D2,A-D10,A-D6,A-D14
+ ; xm6: A-D3,A-D11,A-D7,A-D15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+ ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+ ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+ ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+ punpcklwd m6, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ ; xm6: A0-3,B0-3,C0-3,D0-3
+ ; xm7: A8-11,B8-11,C8-11,D8-11
+ ; xm4: A4-7,B4-7,C4-7,D4-7
+ ; xm3: A12-15,B12-15,C12-15,D12-15
+ punpckldq m5, m6, m4
+ punpckhdq m6, m4
+ punpckldq m4, m7, m3
+ punpckhdq m7, m3
+ ; xm5: A0-7,B0-7
+ ; xm6: C0-7,D0-7
+ ; xm4: A8-15,B8-15
+ ; xm7: C8-15,D8-15
+ punpcklqdq m3, m5, m4
+ punpckhqdq m4, m5, m4
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ ; xm3: A0-15
+ ; xm5: B0-15
+ ; xm4: C0-15
+ ; xm6: D0-15
+%elif %1 == 6 || %1 == 8
+ movq xm3, [dstq+strideq*0-%1/2]
+ movq xm4, [dstq+strideq*1-%1/2]
+ movq xm5, [dstq+strideq*2-%1/2]
+ movq xm6, [dstq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*8]
+ movhps xm3, [tmpq+strideq*0-%1/2]
+ movhps xm4, [tmpq+strideq*1-%1/2]
+ movhps xm5, [tmpq+strideq*2-%1/2]
+ movhps xm6, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movq xm7, [tmpq+strideq*0-%1/2]
+ movq xm8, [tmpq+strideq*1-%1/2]
+ movq xm9, [tmpq+strideq*2-%1/2]
+ movq xm11, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm7, [tmpq+strideq*0-%1/2]
+ movhps xm8, [tmpq+strideq*1-%1/2]
+ movhps xm9, [tmpq+strideq*2-%1/2]
+ movhps xm11, [tmpq+stride3q -%1/2]
+ vinserti128 m3, xm7, 1
+ vinserti128 m4, xm8, 1
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm11, 1
+ lea tmpq, [dstq+strideq*4]
+ movq xm12, [tmpq+strideq*0-%1/2]
+ movq xm13, [tmpq+strideq*1-%1/2]
+ movq xm14, [tmpq+strideq*2-%1/2]
+ movq xm15, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm12, [tmpq+strideq*0-%1/2]
+ movhps xm13, [tmpq+strideq*1-%1/2]
+ movhps xm14, [tmpq+strideq*2-%1/2]
+ movhps xm15, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movq xm7, [tmpq+strideq*0-%1/2]
+ movq xm8, [tmpq+strideq*1-%1/2]
+ movq xm9, [tmpq+strideq*2-%1/2]
+ movq xm11, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ movhps xm7, [tmpq+strideq*0-%1/2]
+ movhps xm8, [tmpq+strideq*1-%1/2]
+ movhps xm9, [tmpq+strideq*2-%1/2]
+ movhps xm11, [tmpq+stride3q -%1/2]
+ vinserti128 m12, xm7, 1
+ vinserti128 m13, xm8, 1
+ vinserti128 m14, xm9, 1
+ vinserti128 m15, xm11, 1
+
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm12: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ punpcklbw m6, m12, m13
+ punpckhbw m12, m13
+ punpcklbw m13, m14, m15
+ punpckhbw m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m15, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m13
+ punpckhwd m6, m13
+ punpcklwd m13, m12, m14
+ punpckhwd m12, m14
+ ; xm15: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm13: A12-15,B12-15,C12-15,D12-15
+ ; xm12: E12-15,F12-15,G12-15,H12-15
+ punpckldq m14, m15, m5
+ punpckhdq m15, m5
+ punpckldq m5, m7, m6
+%if %1 != 6
+ punpckhdq m7, m6
+%endif
+ punpckldq m6, m4, m13
+ punpckhdq m4, m13
+ punpckldq m13, m3, m12
+%if %1 != 6
+ punpckhdq m12, m3, m12
+%endif
+ ; xm14: A0-7,B0-7
+ ; xm15: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm13: E8-15,F8-15
+ ; xm12: G8-15,H8-15
+ punpcklqdq m3, m14, m6
+ punpckhqdq m14, m6
+ punpckhqdq m6, m15, m4
+ punpcklqdq m15, m4
+ punpcklqdq m4, m5, m13
+ punpckhqdq m13, m5, m13
+%if %1 == 8
+ punpcklqdq m5, m7, m12
+ punpckhqdq m12, m7, m12
+ ; xm3: A0-15
+ ; xm14: B0-15
+ ; xm15: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm13: F0-15
+ ; xm5: G0-15
+ ; xm12: H0-15
+ SWAP 12, 3, 15
+ SWAP 13, 14, 5, 4, 6
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+%else
+ SWAP 13, 3, 14
+ SWAP 6, 4, 15, 5
+ ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+%endif
+%else
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+ movu xm0, [dstq+strideq*0-8]
+ movu xm1, [dstq+strideq*1-8]
+ movu xm2, [dstq+strideq*2-8]
+ movu xm3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu xm4, [tmpq+strideq*0-8]
+ movu xm5, [tmpq+strideq*1-8]
+ movu xm6, [tmpq+strideq*2-8]
+ movu xm7, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu xm8, [tmpq+strideq*0-8]
+ movu xm9, [tmpq+strideq*1-8]
+ movu xm10, [tmpq+strideq*2-8]
+ movu xm11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu xm12, [tmpq+strideq*0-8]
+ movu xm13, [tmpq+strideq*1-8]
+ movu xm14, [tmpq+strideq*2-8]
+ movu xm15, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m0, [tmpq+strideq*0-8], 1
+ vinserti128 m1, [tmpq+strideq*1-8], 1
+ vinserti128 m2, [tmpq+strideq*2-8], 1
+ vinserti128 m3, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m4, [tmpq+strideq*0-8], 1
+ vinserti128 m5, [tmpq+strideq*1-8], 1
+ vinserti128 m6, [tmpq+strideq*2-8], 1
+ vinserti128 m7, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m8, [tmpq+strideq*0-8], 1
+ vinserti128 m9, [tmpq+strideq*1-8], 1
+ vinserti128 m10, [tmpq+strideq*2-8], 1
+ vinserti128 m11, [tmpq+stride3q -8], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 m12, [tmpq+strideq*0-8], 1
+ vinserti128 m13, [tmpq+strideq*1-8], 1
+ vinserti128 m14, [tmpq+strideq*2-8], 1
+ vinserti128 m15, [tmpq+stride3q -8], 1
+
+ TRANSPOSE_16X16B 0, 1, [rsp+11*32]
+ mova [rsp+12*32], m1
+ mova [rsp+13*32], m2
+ mova [rsp+14*32], m3
+ mova [rsp+15*32], m12
+ mova [rsp+16*32], m13
+ mova [rsp+17*32], m14
+ mova [rsp+18*32], m15
+ ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+ SWAP 12, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 15
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ movq xm1, [lq]
+ movq xm2, [lq+l_strideq*2]
+ movhps xm1, [lq+l_strideq]
+ movhps xm2, [lq+l_stride3q]
+ lea lq, [lq+l_strideq*4]
+ movq xm10, [lq]
+ movq xm0, [lq+l_strideq*2]
+ movhps xm10, [lq+l_strideq]
+ movhps xm0, [lq+l_stride3q]
+ lea lq, [lq+l_strideq*4]
+ vinserti128 m1, xm10, 1
+ vinserti128 m2, xm0, 1
+ shufps m0, m1, m2, q3131
+ shufps m1, m2, q2020
+%endif
+ pxor m2, m2
+ pcmpeqb m10, m2, m0
+ pand m1, m10
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
+ pcmpeqb m10, m2, m0 ; !L
+ psrlq m2, m0, [lutq+128]
+ pand m2, [pb_63]
+ vpbroadcastb m1, [lutq+136]
+ pminub m2, m1
+ pmaxub m2, [pb_1] ; I
+ pand m1, m0, [pb_240]
+ psrlq m1, 4 ; H
+ paddb m0, [pb_2]
+ paddb m0, m0
+ paddb m0, m2 ; E
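+    ; E = 2*L + 4 + I, H = L >> 4; all three get the 0x80 bias below so the
+    ; unsigned thresholds can be checked with signed pcmpgtb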
+ pxor m1, [pb_128]
+ pxor m2, [pb_128]
+ pxor m0, [pb_128]
+
+ ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+ pmaxub m8, m10
+ ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+ pmaxub m8, m9
+%if %1 == 4
+ pxor m8, [pb_128]
+ pcmpgtb m7, m8, m1 ; hev
+%else
+ pxor m7, m8, [pb_128]
+ pcmpgtb m7, m1 ; hev
+
+%if %1 == 6
+ ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+ pmaxub m9, m8
+%else
+ ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
+ pmaxub m9, m8
+ ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+ pmaxub m9, m10
+%endif
+ ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+ pmaxub m9, m10
+%if %1 != 6
+ ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
+ pmaxub m9, m10
+%endif
+ pxor m9, [pb_128]
+ pcmpgtb m9, [pb_129] ; !flat8in
+
+%if %1 == 6
+ ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+%else
+ ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
+ ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m10, m11
+ ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
+ pmaxub m10, m11
+%endif
+ ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m10, m11
+%if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ vpbroadcastd m1, [maskq+4]
+ por m11, m1
+ pand m11, [pb_mask]
+ pcmpeqd m11, [pb_mask]
+ pand m10, m11
+%else
+ vpbroadcastd m11, [maskq+4]
+ pand m11, [pb_mask]
+ pcmpeqd m11, [pb_mask]
+ pand m10, m11 ; only apply fm-wide to wd>4 blocks
+%endif
+ pmaxub m8, m10
+
+ pxor m8, [pb_128]
+%endif
+ pcmpgtb m8, m2
+
+ ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+ ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+ paddusb m11, m11
+ pand m10, [pb_254]
+ psrlq m10, 1
+ paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pxor m10, [pb_128]
+ pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+ por m8, m10
+
+%if %1 == 16
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+12*32]
+%endif
+ ABSSUB m1, m0, m4, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+13*32]
+%endif
+ ABSSUB m2, m0, m4, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+stride3q]
+%else
+ mova m0, [rsp+14*32]
+%endif
+ ABSSUB m2, m0, m4, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+%else
+ mova m0, [rsp+15*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+16*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+17*32]
+%endif
+ ABSSUB m2, m0, m5, m10
+ pmaxub m1, m2
+ pxor m1, [pb_128]
+ pcmpgtb m1, [pb_129] ; !flat8out
+ por m1, m9 ; !flat8in | !flat8out
+ vpbroadcastd m2, [maskq+8]
+ pand m10, m2, [pb_mask]
+ pcmpeqd m10, [pb_mask]
+ pandn m1, m10 ; flat16
+ pandn m1, m8, m1 ; flat16 & fm
+
+ vpbroadcastd m10, [maskq+4]
+ por m10, m2
+ pand m2, m10, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m9, m2 ; flat8in
+ pandn m9, m8, m9
+ vpbroadcastd m2, [maskq+0]
+ por m2, m10
+ pand m2, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m8, m2
+ pandn m8, m9, m8 ; fm & !flat8 & !flat16
+ pandn m9, m1, m9 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ pand m2, m0, [pb_mask]
+ pcmpeqd m2, [pb_mask]
+ pandn m9, m2
+ pandn m9, m8, m9 ; flat8 & fm
+ vpbroadcastd m2, [maskq+0]
+ por m0, m2
+ pand m0, [pb_mask]
+ pcmpeqd m0, [pb_mask]
+ pandn m8, m0
+ pandn m8, m9, m8 ; fm & !flat8
+%else
+ vpbroadcastd m0, [maskq+0]
+ pand m0, [pb_mask]
+ pcmpeqd m0, [pb_mask]
+ pandn m8, m0 ; fm
+%endif
+
+ ; short filter
+
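+    ; the pixels are XORed with 0x80 so the unsigned bytes can be handled with
+    ; signed saturating adds/subs; f1=(f+4)>>3 and f2=(f+3)>>3 are built with the
+    ; pand/psrlq below, and the pxor/psubb pb_16 pair sign-extends the 5-bit result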
+ pxor m3, [pb_128]
+ pxor m6, [pb_128]
+ psubsb m10, m3, m6 ; iclip_diff(p1-q1)
+ pand m10, m7 ; f=iclip_diff(p1-q1)&hev
+ pxor m4, [pb_128]
+ pxor m5, [pb_128]
+ psubsb m11, m5, m4
+ paddsb m10, m11
+ paddsb m10, m11
+ paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
+ pand m8, m10 ; f&=fm
+ paddsb m10, m8, [pb_3]
+ paddsb m8, [pb_4]
+ pand m10, [pb_248]
+ pand m8, [pb_248]
+ psrlq m10, 3
+ psrlq m8, 3
+ pxor m10, [pb_16]
+ pxor m8, [pb_16]
+ psubb m10, [pb_16] ; f2
+ psubb m8, [pb_16] ; f1
+ paddsb m4, m10
+ psubsb m5, m8
+ pxor m4, [pb_128]
+ pxor m5, [pb_128]
+
+ pxor m8, [pb_128]
+ pxor m10, m10
+ pavgb m8, m10 ; f=(f1+1)>>1
+ psubb m8, [pb_64]
+ pandn m8, m7, m8 ; f&=!hev
+ paddsb m3, m8
+ psubsb m6, m8
+ pxor m3, [pb_128]
+ pxor m6, [pb_128]
+
+%if %1 == 16
+ ; flat16 filter
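+    ; each flat16 output is a weighted sum of p6..q6 with total weight 16; the sum
+    ; is updated incrementally (pmaddubsw with pb_m1_1 drops one tap and adds
+    ; another) and pmulhrsw with pw_2048 performs the final (sum+8)>>4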
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+%else
+ mova m0, [rsp+12*32]
+ mova m2, [rsp+13*32]
+ mova m7, [rsp+14*32]
+%endif
+
+ mova [rsp+0*32], m9
+ mova [rsp+1*32], m14
+ mova [rsp+2*32], m15
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ punpcklbw m14, m0, m12
+ punpckhbw m15, m0, m12
+ pmaddubsw m10, m14, [pb_7_1]
+ pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3
+ punpcklbw m8, m2, m7
+ punpckhbw m9, m2, m7
+ pmaddubsw m8, [pb_2]
+ pmaddubsw m9, [pb_2]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+ punpcklbw m8, m13, m3
+ punpckhbw m9, m13, m3
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m9, [pb_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m8, m4, m5
+ punpckhbw m9, m4, m5
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m9, [pb_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ pand m8, m1
+ pandn m9, m1, m2
+ por m8, m9
+%ifidn %2, v
+ mova [tmpq+strideq*2], m8 ; p5
+%else
+ mova [rsp+13*32], m8
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, [pb_m1_1]
+ pmaddubsw m15, [pb_m1_1]
+ paddw m10, m14
+ paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m0, m6
+ punpckhbw m9, m0, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+3*32], m8
+ mova [rsp+4*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m7, m8, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m8 ; p4
+%else
+ mova [rsp+14*32], m8
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ mova m14, [rsp+1*32]
+ punpcklbw m8, m0, m13
+ punpckhbw m9, m0, m13
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m8, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m2, [pb_m1_1]
+ mova [rsp+1*32], m8
+ paddw m10, m8
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m12, m8, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m8 ; p3
+%else
+ mova [rsp+19*32], m8
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ mova m15, [rsp+2*32]
+ punpcklbw m8, m0, m3
+ punpckhbw m9, m0, m3
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m8, m7, m15
+ punpckhbw m7, m15
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m7, [pb_m1_1]
+ mova [rsp+2*32], m8
+ paddw m10, m8
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m13, m8, m1
+ mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+%endif
+ punpcklbw m8, m0, m4
+ punpckhbw m9, m0, m4
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0] ; q4
+%else
+ mova m9, [rsp+15*32]
+%endif
+ punpcklbw m8, m12, m9
+ punpckhbw m9, m12, m9
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+7*32], m8
+ mova [rsp+5*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m3, m8, m1
+ mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, v
+ mova m9, [tmpq+strideq*1] ; q5
+%else
+ mova m9, [rsp+16*32]
+%endif
+ punpcklbw m8, m0, m5
+ punpckhbw m0, m5
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m0, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m0, m13, m9
+ punpckhbw m9, m13, m9
+ mova m13, [rsp+6*32]
+ pmaddubsw m0, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+ 9*32], m0
+ mova [rsp+10*32], m9
+ paddw m10, m0
+ paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ pmulhrsw m0, m10, [pw_2048]
+ pmulhrsw m8, m11, [pw_2048]
+ packuswb m0, m8
+ vpblendvb m0, m4, m0, m1
+ mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else
+ mova m0, [rsp+17*32]
+%endif
+ paddw m10, [rsp+3*32]
+ paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ punpcklbw m8, m3, m0
+ punpckhbw m9, m3, m0
+ mova m3, [rsp+8*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ mova [rsp+3*32], m8
+ mova [rsp+4*32], m9
+ paddw m10, m8
+ paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m5, m8, m1
+ mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ paddw m10, [rsp+1*32]
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+ punpcklbw m8, m4, m0
+ punpckhbw m2, m4, m0
+ mova m4, [rsp+6*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m2, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+ pmulhrsw m2, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m2, m9
+ vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, [rsp+2*32]
+ paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ punpcklbw m8, m5, m0
+ punpckhbw m9, m5, m0
+ mova m5, [rsp+8*32]
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m7, m9
+ vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ paddw m10, [rsp+7*32]
+ paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ punpcklbw m8, m6, m0
+ punpckhbw m9, m6, m0
+ SWAP 2, 6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+ vpblendvb m8, m15, m8, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m8 ; q3
+%else
+ mova [rsp+20*32], m8
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, [rsp+ 9*32]
+ paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ punpcklbw m8, m14, m0
+ punpckhbw m9, m14, m0
+ SWAP 14, 7
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+ paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m8, m10, [pw_2048]
+ pmulhrsw m9, m11, [pw_2048]
+ packuswb m8, m9
+%ifidn %2, v
+ mova m9, [tmpq+strideq*0]
+%else
+ mova m9, [rsp+15*32]
+%endif
+ vpblendvb m8, m9, m8, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m8 ; q4
+%else
+ mova [rsp+15*32], m8
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, [rsp+3*32]
+    paddw                m11, [rsp+4*32] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ punpcklbw m8, m15, m0
+ punpckhbw m9, m15, m0
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m9, [pb_m1_1]
+ paddw m10, m8
+    paddw                m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, [pw_2048]
+ pmulhrsw m11, [pw_2048]
+ packuswb m10, m11
+%ifidn %2, v
+ mova m11, [tmpq+strideq*1]
+%else
+ mova m11, [rsp+16*32]
+%endif
+ vpblendvb m10, m11, m10, m1
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+16*32], m10
+%endif
+
+ mova m9, [rsp+0*32]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%endif
+%if %1 >= 8
+ ; flat8 filter
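+    ; flat8 outputs are 8-weight sums; the +4 rounding term is folded in by
+    ; interleaving q0 with pb_4 below, and psrlw by 3 does the final divide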
+ punpcklbw m0, m12, m3
+ punpckhbw m1, m12, m3
+ pmaddubsw m2, m0, [pb_3_1]
+ pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1
+ punpcklbw m8, m13, m4
+ punpckhbw m11, m13, m4
+ pmaddubsw m8, [pb_2_1]
+ pmaddubsw m11, [pb_2_1]
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m8, m5, [pb_4]
+ punpckhbw m11, m5, [pb_4]
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m10, m13, m8, m9 ; p2
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; p2
+%endif
+
+ pmaddubsw m8, m0, [pb_m1_1]
+ pmaddubsw m11, m1, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m8, m3, m8, m9 ; p1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m8 ; p1
+%else
+ mova [rsp+0*32], m8
+%endif
+
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m8, m4, m14
+ punpckhbw m11, m4, m14
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ paddw m2, m8
+ paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m8, m4, m8, m9 ; p0
+%ifidn %2, v
+ mova [tmpq+stride3q ], m8 ; p0
+%else
+ mova [rsp+1*32], m8
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m8, m0, [pb_1]
+ pmaddubsw m11, m1, [pb_1]
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m4, m12
+ punpckhbw m11, m4, m12
+ pmaddubsw m8, [pb_1]
+ pmaddubsw m11, [pb_1]
+ psubw m2, m8
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendvb m11, m5, m8, m9 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11 ; q0
+%endif
+
+ pmaddubsw m0, [pb_m1_1]
+ pmaddubsw m1, [pb_m1_1]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m8, m13, m6
+ punpckhbw m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m13, [pb_m1_1]
+ paddw m2, m8
+ paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m8, m2, 3
+ psrlw m13, m7, 3
+ packuswb m8, m13
+ vpblendvb m13, m6, m8, m9 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m13 ; q1
+%endif
+
+ punpcklbw m0, m3, m6
+ punpckhbw m1, m3, m6
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m14, m15
+ punpckhbw m1, m14, m15
+ pmaddubsw m0, [pb_1]
+ pmaddubsw m1, [pb_1]
+ paddw m2, m0
+    paddw                m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+ vpblendvb m2, m14, m2, m9 ; q2
+%ifidn %2, v
+ mova [dstq+strideq*2], m2 ; q2
+%else
+ mova m0, [rsp+0*32]
+ mova m1, [rsp+1*32]
+%if %1 == 8
+ ; 16x8 transpose
+ punpcklbw m3, m12, m10
+ punpckhbw m12, m10
+ punpcklbw m10, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m11, m13
+ punpckhbw m11, m13
+ punpcklbw m13, m2, m15
+ punpckhbw m2, m15
+
+ punpcklwd m15, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m10, m12, m0
+ punpckhwd m12, m0
+ punpcklwd m0, m1, m13
+ punpckhwd m1, m13
+ punpcklwd m13, m11, m2
+ punpckhwd m11, m2
+
+ punpckldq m2, m15, m0
+ punpckhdq m15, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ punpckldq m1, m10, m13
+ punpckhdq m10, m13
+ punpckldq m13, m12, m11
+ punpckhdq m12, m11
+
+ ; write 8x32
+ movq [dstq+strideq*0-4], xm2
+ movhps [dstq+strideq*1-4], xm2
+ movq [dstq+strideq*2-4], xm15
+ movhps [dstq+stride3q -4], xm15
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm10
+ movhps [dstq+stride3q -4], xm10
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm13
+ movhps [dstq+strideq*1-4], xm13
+ movq [dstq+strideq*2-4], xm12
+ movhps [dstq+stride3q -4], xm12
+ lea dstq, [dstq+strideq*4]
+
+ vextracti128 xm2, m2, 1
+ vextracti128 xm15, m15, 1
+ vextracti128 xm0, m0, 1
+ vextracti128 xm3, m3, 1
+ vextracti128 xm1, m1, 1
+ vextracti128 xm10, m10, 1
+ vextracti128 xm13, m13, 1
+ vextracti128 xm12, m12, 1
+
+ movq [dstq+strideq*0-4], xm2
+ movhps [dstq+strideq*1-4], xm2
+ movq [dstq+strideq*2-4], xm15
+ movhps [dstq+stride3q -4], xm15
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm10
+ movhps [dstq+stride3q -4], xm10
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm13
+ movhps [dstq+strideq*1-4], xm13
+ movq [dstq+strideq*2-4], xm12
+ movhps [dstq+stride3q -4], xm12
+ lea dstq, [dstq+strideq*4]
+%else
+ ; 16x16 transpose and store
+ SWAP 5, 10, 2
+ SWAP 6, 0
+ SWAP 7, 1
+ SWAP 8, 11
+ SWAP 9, 13
+ mova m0, [rsp+11*32]
+ mova m1, [rsp+12*32]
+ mova m2, [rsp+13*32]
+ mova m3, [rsp+14*32]
+ mova m4, [rsp+19*32]
+ mova m11, [rsp+20*32]
+ mova m12, [rsp+15*32]
+ mova m13, [rsp+16*32]
+ mova m14, [rsp+17*32]
+ TRANSPOSE_16X16B 1, 0, [rsp+18*32]
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm4
+ movu [dstq+strideq*1-8], xm5
+ movu [dstq+strideq*2-8], xm6
+ movu [dstq+stride3q -8], xm7
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm8
+ movu [dstq+strideq*1-8], xm9
+ movu [dstq+strideq*2-8], xm10
+ movu [dstq+stride3q -8], xm11
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm12
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m0, 1
+ vextracti128 [dstq+strideq*1-8], m1, 1
+ vextracti128 [dstq+strideq*2-8], m2, 1
+ vextracti128 [dstq+stride3q -8], m3, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m4, 1
+ vextracti128 [dstq+strideq*1-8], m5, 1
+ vextracti128 [dstq+strideq*2-8], m6, 1
+ vextracti128 [dstq+stride3q -8], m7, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m8, 1
+ vextracti128 [dstq+strideq*1-8], m9, 1
+ vextracti128 [dstq+strideq*2-8], m10, 1
+ vextracti128 [dstq+stride3q -8], m11, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 [dstq+strideq*0-8], m12, 1
+ vextracti128 [dstq+strideq*1-8], m13, 1
+ vextracti128 [dstq+strideq*2-8], m14, 1
+ vextracti128 [dstq+stride3q -8], m15, 1
+ lea dstq, [dstq+strideq*4]
+%endif
+%endif
+%elif %1 == 6
+ ; flat6 filter
+
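+    ; flat6 works like flat8 but on p2..q2 only; the 8-weight sums are rounded
+    ; and divided by pmulhrsw with pw_4096, i.e. (sum+4)>>3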
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, [pb_3_1]
+ pmaddubsw m1, m11, [pb_3_1]
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, [pb_2]
+ pmaddubsw m12, m10, [pb_2]
+ paddw m0, m2
+ paddw m1, m12
+ pmulhrsw m2, m0, [pw_4096]
+ pmulhrsw m12, m1, [pw_4096]
+ packuswb m2, m12
+ vpblendvb m2, m3, m2, m9
+%ifidn %2, v
+ mova [tmpq+strideq*2], m2 ; p1
+%endif
+
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, [pb_m1_1]
+ pmaddubsw m11, [pb_m1_1]
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m12, m0, [pw_4096]
+ pmulhrsw m13, m1, [pw_4096]
+ packuswb m12, m13
+ vpblendvb m12, m4, m12, m9
+%ifidn %2, v
+ mova [tmpq+stride3q], m12 ; p0
+%endif
+
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+ pmaddubsw m14, m8, [pb_m1_1]
+ pmaddubsw m13, m11, [pb_m1_1]
+ paddw m0, m14
+ paddw m1, m13
+ pmulhrsw m14, m0, [pw_4096]
+ pmulhrsw m13, m1, [pw_4096]
+ packuswb m14, m13
+ vpblendvb m14, m5, m14, m9
+%ifidn %2, v
+ mova [dstq+strideq*0], m14 ; q0
+%endif
+
+ pmaddubsw m8, [pb_m1_2]
+ pmaddubsw m11, [pb_m1_2]
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, [pb_m1_0]
+ pmaddubsw m10, [pb_m1_0]
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, [pw_4096]
+ pmulhrsw m1, [pw_4096]
+ packuswb m0, m1
+ vpblendvb m0, m6, m0, m9
+%ifidn %2, v
+ mova [dstq+strideq*1], m0 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+INIT_YMM avx2
+cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+
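+; each iteration handles a 32-pixel strip, i.e. one byte (8 bits) of each mask
+; plane: the vmask[2] byte picks the wd=16 path, vmask[1] wd=8 and vmask[0] wd=4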
+.loop:
+ cmp byte [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .v4
+
+.end:
+ add lq, 32
+ add dstq, 32
+ add maskq, 1
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
+
+INIT_YMM avx2
+cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+
+.loop:
+ cmp byte [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .no_filter
+
+ call .h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ add maskq, 1
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+
+.loop:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4
+
+.end:
+ add lq, 32
+ add dstq, 32
+ add maskq, 1
+ sub wd, 8
+ jg .loop
+ RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+
+.loop:
+ cmp byte [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp byte [maskq+0], 0 ; vmask[0]
+ je .no_filter
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+strideq*8]
+.end:
+ add maskq, 1
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter_avx512.asm b/third_party/dav1d/src/x86/loopfilter_avx512.asm
new file mode 100644
index 0000000000..0218b624d3
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_avx512.asm
@@ -0,0 +1,1534 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+
+pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080
+ dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000
+
+hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60
+hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
+hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49
+hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+pb_1: times 4 db 1
+pb_2: times 4 db 2
+pb_3: times 4 db 3
+pb_4: times 4 db 4
+pb_16: times 4 db 16
+pb_63: times 4 db 63
+pb_64: times 4 db 64
+pb_128: times 4 db 0x80
+pb_240: times 4 db 0xf0
+pb_248: times 4 db 0xf8
+pb_254: times 4 db 0xfe
+pb_2_1: times 2 db 2, 1
+pb_3_1: times 2 db 3, 1
+pb_7_1: times 2 db 7, 1
+pb_m1_0: times 2 db -1, 0
+pb_m1_1: times 2 db -1, 1
+pb_m1_2: times 2 db -1, 2
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+
+SECTION .text
+
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
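+    ; same 16x4 byte transpose as the AVX2 version, but the transposed columns are
+    ; written back with masked vpscatterdd through the offsets preloaded in m19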
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpscatterdd [dstq+m19-2]{k1}, m%4
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpscatterdd [t0 +m19-2]{k1}, m%5
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpscatterdd [t1 +m19-2]{k1}, m%2
+ kmovw k1, k6
+ vpscatterdd [t2 +m19-2]{k1}, m%1
+%endmacro
+
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+ SWAP m16, m22
+%endif
+ punpcklbw m22, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+ punpcklbw m5, m6, m7
+ punpckhbw m6, m7
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m25, m13
+ punpckhbw m25, m13
+%if %1 == 0
+ SWAP m13, m16
+%else
+ mova m13, %3
+%endif
+ SWAP m16, m25
+ punpcklbw m25, m14, m13
+ punpckhbw m13, m14, m13
+ ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+ punpcklwd m14, m22, m26
+ punpckhwd m22, m26
+ punpcklwd m26, m24, m2
+ punpckhwd m24, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m25
+ punpckhwd m11, m25
+ SWAP m25, m16, m11
+ punpcklwd m11, m25, m13
+ punpckhwd m25, m13
+ ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
+ punpckldq m13, m14, m2
+ punpckhdq m14, m2
+ punpckldq m2, m22, m3
+ punpckhdq m22, m3
+ punpckldq m3, m26, m5
+ punpckhdq m26, m5
+ punpckldq m5, m24, m4
+ punpckhdq m24, m4
+ punpckldq m4, m6, m10
+ punpckhdq m6, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m25
+ punpckhdq m8, m25
+ SWAP m25, m16, m8
+ punpckldq m8, m7, m25
+ punpckhdq m7, m25
+ ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m25, m13, m4
+ punpckhqdq m13, m4
+ punpcklqdq m4, m14, m6
+ punpckhqdq m14, m6
+ punpcklqdq m6, m2, m8
+ punpckhqdq m2, m8
+ punpcklqdq m8, m22, m7
+ punpckhqdq m22, m7
+ punpcklqdq m7, m3, m10
+ punpckhqdq m3, m10
+ punpcklqdq m10, m26, m9
+ punpckhqdq m26, m9
+ punpcklqdq m9, m5, m11
+ punpckhqdq m5, m11
+ SWAP m11, m16
+%if %2 == 0
+ SWAP m16, m25
+%else
+ mova %3, m25
+%endif
+ punpcklqdq m25, m24, m11
+ punpckhqdq m24, m11
+%if %2 == 0
+ SWAP m11, m16
+%endif
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+ SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+ SWAP 3, 14, 25, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+ ; load data
+%ifidn %2, v
+%define is_h 0
+%if %1 == 4
+ lea t0, [dstq+mstrideq*2]
+ mova m3, [t0 +strideq*0] ; p1
+ mova m4, [t0 +strideq*1] ; p0
+ mova m5, [t0 +strideq*2] ; q0
+ mova m6, [t0 +stride3q ] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+%if %1 == 16
+ lea t0, [dstq+mstrideq*8]
+ mova m16, [t0 +strideq*1]
+ mova m17, [t0 +strideq*2]
+ mova m18, [t0 +stride3q ]
+%endif
+ lea t0, [dstq+mstrideq*4]
+%if %1 != 6
+ mova m25, [t0 +strideq*0]
+%endif
+ mova m13, [t0 +strideq*1]
+ mova m3, [t0 +strideq*2]
+ mova m4, [t0 +stride3q ]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m22, [dstq+stride3q ]
+%endif
+%if %1 == 16
+ lea t0, [dstq+strideq*4]
+ mova m29, [t0 +strideq*0]
+ mova m30, [t0 +strideq*1]
+ mova m31, [t0 +strideq*2]
+%endif
+%endif
+%else ; h
+%define is_h 1
+ ; load lines
+%if %1 == 4
+ vbroadcasti32x4 m0, [hshuf4]
+ kmovw k1, k6
+ lea t0, [dstq+strideq*4]
+ vpgatherdd m3{k1}, [dstq+m19-2]
+ kmovw k1, k6
+ lea t1, [dstq+strideq*8]
+ vpgatherdd m4{k1}, [t0 +m19-2]
+ kmovw k1, k6
+ lea t2, [t0 +strideq*8]
+ vpgatherdd m5{k1}, [t1 +m19-2]
+ kmovw k1, k6
+ vpgatherdd m6{k1}, [t2 +m19-2]
+ pshufb m3, m0
+ pshufb m4, m0
+ pshufb m5, m0
+ pshufb m6, m0
+ punpckldq m7, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m5, m6
+ punpckhdq m5, m6
+ punpcklqdq m6, m7, m4
+ punpckhqdq m7, m4
+ punpcklqdq m4, m3, m5
+ punpckhqdq m3, m5
+ SWAP 3, 6
+ SWAP 5, 4, 7
+ ; 6,7,4,3 -> 3,4,5,6
+%elif %1 == 6 || %1 == 8
+ kmovb k1, k7
+ lea t0, [dstq+strideq*1]
+ vpgatherdq m3{k1}, [dstq+ym21-%1/2]
+ kmovb k1, k7
+ lea t1, [dstq+strideq*2]
+ vpgatherdq m4{k1}, [t0 +ym21-%1/2]
+ kmovb k1, k7
+ lea t2, [dstq+stride3q ]
+ vpgatherdq m5{k1}, [t1 +ym21-%1/2]
+ kmovb k1, k7
+ vextracti32x8 ym0, m21, 1
+ vpgatherdq m6{k1}, [t2 +ym21-%1/2]
+ kmovb k1, k7
+ vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m13{k1}, [t0 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m14{k1}, [t1 +ym0 -%1/2]
+ kmovb k1, k7
+ vpgatherdq m15{k1}, [t2 +ym0 -%1/2]
+ ; transpose 8x16
+ ; xm3: A-H0,A-H8
+ ; xm4: A-H1,A-H9
+ ; xm5: A-H2,A-H10
+ ; xm6: A-H3,A-H11
+ ; xm12: A-H4,A-H12
+ ; xm13: A-H5,A-H13
+ ; xm14: A-H6,A-H14
+ ; xm15: A-H7,A-H15
+ punpcklbw m7, m3, m4
+ punpckhbw m3, m4
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ punpcklbw m6, m12, m13
+ punpckhbw m12, m13
+ punpcklbw m13, m14, m15
+ punpckhbw m14, m15
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m15, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m13
+ punpckhwd m6, m13
+ punpcklwd m13, m12, m14
+ punpckhwd m12, m14
+ ; xm15: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm13: A12-15,B12-15,C12-15,D12-15
+ ; xm12: E12-15,F12-15,G12-15,H12-15
+ punpckldq m14, m15, m5
+ punpckhdq m15, m5
+ punpckldq m5, m7, m6
+ %if %1 != 6
+ punpckhdq m7, m6
+ %endif
+ punpckldq m6, m4, m13
+ punpckhdq m4, m13
+ punpckldq m13, m3, m12
+ %if %1 != 6
+ punpckhdq m12, m3, m12
+ %endif
+ ; xm14: A0-7,B0-7
+ ; xm15: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm13: E8-15,F8-15
+ ; xm12: G8-15,H8-15
+ punpcklqdq m3, m14, m6
+ punpckhqdq m14, m6
+ punpckhqdq m6, m15, m4
+ punpcklqdq m15, m4
+ punpcklqdq m4, m5, m13
+ punpckhqdq m13, m5, m13
+ %if %1 == 8
+ punpcklqdq m5, m7, m12
+ punpckhqdq m25, m7, m12
+ ; xm3: A0-15
+ ; xm14: B0-15
+ ; xm15: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm13: F0-15
+ ; xm5: G0-15
+ ; xm25: H0-15
+ SWAP 25, 3, 15
+ SWAP 13, 14, 5, 4, 6
+ SWAP 15, 22
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
+ %else
+ SWAP 13, 3, 14
+ SWAP 6, 4, 15, 5
+ ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+ %endif
+%else ; 16, h
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+ movu xm24, [dstq+strideq*0-8]
+ movu xm26, [dstq+strideq*1-8]
+ movu xm2, [dstq+strideq*2-8]
+ movu xm3, [dstq+stride3q -8]
+ lea t0, [dstq+strideq*4]
+ movu xm4, [t0 +strideq*0-8]
+ movu xm5, [t0 +strideq*1-8]
+ movu xm6, [t0 +strideq*2-8]
+ movu xm7, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm8, [t0 +strideq*0-8]
+ movu xm9, [t0 +strideq*1-8]
+ movu xm10, [t0 +strideq*2-8]
+ movu xm11, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ movu xm25, [t0 +strideq*0-8]
+ movu xm13, [t0 +strideq*1-8]
+ movu xm14, [t0 +strideq*2-8]
+ movu xm22, [t0 +stride3q -8]
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym24, [t0 +strideq*0-8], 1
+ vinserti32x4 ym26, [t0 +strideq*1-8], 1
+ vinserti32x4 ym2, [t0 +strideq*2-8], 1
+ vinserti32x4 ym3, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym4, [t0 +strideq*0-8], 1
+ vinserti32x4 ym5, [t0 +strideq*1-8], 1
+ vinserti32x4 ym6, [t0 +strideq*2-8], 1
+ vinserti32x4 ym7, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym8, [t0 +strideq*0-8], 1
+ vinserti32x4 ym9, [t0 +strideq*1-8], 1
+ vinserti32x4 ym10, [t0 +strideq*2-8], 1
+ vinserti32x4 ym11, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 ym25, [t0 +strideq*0-8], 1
+ vinserti32x4 ym13, [t0 +strideq*1-8], 1
+ vinserti32x4 ym14, [t0 +strideq*2-8], 1
+ vinserti32x4 ym22, [t0 +stride3q -8], 1
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 2
+ vinserti32x4 m26, [t0 +strideq*1-8], 2
+ vinserti32x4 m2, [t0 +strideq*2-8], 2
+ vinserti32x4 m3, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 2
+ vinserti32x4 m5, [t0 +strideq*1-8], 2
+ vinserti32x4 m6, [t0 +strideq*2-8], 2
+ vinserti32x4 m7, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 2
+ vinserti32x4 m9, [t0 +strideq*1-8], 2
+ vinserti32x4 m10, [t0 +strideq*2-8], 2
+ vinserti32x4 m11, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 2
+ vinserti32x4 m13, [t0 +strideq*1-8], 2
+ vinserti32x4 m14, [t0 +strideq*2-8], 2
+ vinserti32x4 m22, [t0 +stride3q -8], 2
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m24, [t0 +strideq*0-8], 3
+ vinserti32x4 m26, [t0 +strideq*1-8], 3
+ vinserti32x4 m2, [t0 +strideq*2-8], 3
+ vinserti32x4 m3, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m4, [t0 +strideq*0-8], 3
+ vinserti32x4 m5, [t0 +strideq*1-8], 3
+ vinserti32x4 m6, [t0 +strideq*2-8], 3
+ vinserti32x4 m7, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m8, [t0 +strideq*0-8], 3
+ vinserti32x4 m9, [t0 +strideq*1-8], 3
+ vinserti32x4 m10, [t0 +strideq*2-8], 3
+ vinserti32x4 m11, [t0 +stride3q -8], 3
+ lea t0, [t0 +strideq*4]
+ vinserti32x4 m25, [t0 +strideq*0-8], 3
+ vinserti32x4 m13, [t0 +strideq*1-8], 3
+ vinserti32x4 m14, [t0 +strideq*2-8], 3
+ vinserti32x4 m22, [t0 +stride3q -8], 3
+ ;
+ TRANSPOSE_16X16B 0, 1, [rsp+0*64]
+ SWAP m16, m26
+ SWAP m17, m2
+ SWAP m18, m3
+ SWAP m29, m25
+ SWAP m30, m13
+ SWAP m31, m14
+ mova [rsp+4*64], m22
+ ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+ SWAP 25, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 22
+%endif
+%endif
+
+ ; load L/E/I/H
+ vpbroadcastd m15, [pb_1]
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ kmovw k1, k6
+ vpgatherdd m0{k1}, [lq+m20+4]
+ kmovw k1, k6
+ vpgatherdd m1{k1}, [lq+m20+0]
+%endif
+ pxor m2, m2
+ pcmpeqb k1, m0, m2
+ vmovdqu8 m0{k1}, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, pbshuf ; l[x][0]
+ vpcmpub k3, m0, m2, 4 ; neq ; L
+ psrlq m2, m0, [lutq+128]
+ pand m2, [pb_63]{bcstd}
+ vpbroadcastb m1, [lutq+136]
+ pminub m2, m1
+ pmaxub m2, m15 ; I
+ pand m1, m0, [pb_240]{bcstd}
+ psrlq m1, 4 ; H
+ paddd m0, [pb_2]{bcstd}
+ paddb m0, m0
+ paddb m0, m2 ; E
+
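+    ; unlike the AVX2 path there is no 0x80 bias here: thresholds are compared
+    ; with unsigned vpcmpub and the hev/flat/fm conditions live in k-mask registers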
+ ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
+ ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
+ pmaxub m8, m9
+ vpcmpub k1, m8, m1, 6 ; gt ; hev
+%if %1 != 4
+ %if %1 == 6
+ ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
+ pmaxub m9, m8
+ %else
+ ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
+ pmaxub m9, m8
+ ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
+ pmaxub m9, m10
+ %endif
+ ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
+ pmaxub m9, m10
+ %if %1 != 6
+ ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
+ pmaxub m9, m10
+ %endif
+ vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
+ %if %1 == 6
+ ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
+ %else
+ ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
+ ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m10, m11
+ ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
+ pmaxub m10, m11
+ %endif
+ ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m10, m11
+ %if %1 == 16
+ vpbroadcastd m11, [maskq+8]
+ por m11, [maskq+4]{bcstd}
+ %else
+ vpbroadcastd m11, [maskq+4]
+ %endif
+ vptestmd k4, m11, pbmask
+ vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
+ pmaxub m8, m10
+%endif
+ vpcmpub k3{k3}, m8, m2, 2 ; le
+ ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
+ ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
+ paddusb m11, m11
+ pand m10, [pb_254]{bcstd}
+ psrlq m10, 1
+ paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+
+%if %1 == 16
+ ABSSUB m1, m16, m4, m2
+ ABSSUB m2, m17, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m18, m4, m10
+ pmaxub m1, m2
+ ABSSUB m2, m29, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m30, m5, m10
+ pmaxub m1, m2
+ ABSSUB m2, m31, m5, m10
+ pmaxub m1, m2
+ kandq k2, k2, k3
+ vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
+ vpbroadcastd m2, [maskq+8]
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k4{k4}, m7, m7 ; flat16 & fm
+ por m10, m2, [maskq+4]{bcstd}
+ vptestmd k5, m10, pbmask
+ vpmovm2d m7, k5
+ vptestmb k2{k2}, m7, m7 ; flat8in
+ por m2, m10, [maskq+0]{bcstd}
+ vptestmd k5, m2, pbmask
+ vpmovm2d m7, k5
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8 & !flat16
+ kandnq k2, k4, k2 ; flat8 & !flat16
+%elif %1 != 4
+ vpbroadcastd m0, [maskq+4]
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k2{k2}, m7, m7
+ kandq k2, k2, k3 ; flat8 & fm
+ por m0, [maskq+0]{bcstd}
+ vptestmd k4, m0, pbmask
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7
+ kandnq k3, k2, k3 ; fm & !flat8
+%else
+ %ifidn %2, v
+ vptestmd k4, pbmask, [maskq+0]{bcstd}
+ %else
+ vpbroadcastd m0, [maskq+0]
+ vptestmd k4, m0, pbmask
+ %endif
+ vpmovm2d m7, k4
+ vptestmb k3{k3}, m7, m7 ; fm
+%endif
+
+ ; short filter
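+ ; equivalent scalar form (pixels are XORed with 0x80 so the saturating
+ ; signed-byte ops implement iclip_diff):
+ ;   f  = iclip_diff(p1 - q1) & hev
+ ;   f  = iclip_diff(3 * (q0 - p0) + f) & fm
+ ;   f1 = iclip_diff(f + 4) >> 3, f2 = iclip_diff(f + 3) >> 3
+ ;   p0 += f2, q0 -= f1
+ ;   if !hev: f = (f1 + 1) >> 1, p1 += f, q1 -= f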
+%if %1 >= 8
+ SWAP m23, m15
+%endif
+ vpbroadcastd m15, [pb_3]
+ vpbroadcastd m0, [pb_4]
+ vpbroadcastd m12, [pb_16]
+ vpbroadcastd m1, [pb_64]
+ pxor m3, pb128
+ pxor m6, pb128
+ psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
+ pxor m4, pb128
+ pxor m5, pb128
+ psubsb m11, m5, m4
+ paddsb m10, m11
+ paddsb m10, m11
+ paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
+ paddsb m8, m10, m15
+ paddsb m10, m0
+ pand m8, [pb_248]{bcstd}
+ pand m10, [pb_248]{bcstd}
+ psrlq m8, 3
+ psrlq m10, 3
+ pxor m8, m12
+ pxor m10, m12
+ psubb m8, m12 ; f2
+ psubb m10, m12 ; f1
+ paddsb m4, m8
+ psubsb m5, m10
+ pxor m4, pb128
+ pxor m5, pb128
+ ;
+ pxor m10, pb128
+ pxor m8, m8
+ pavgb m8, m10 ; f=(f1+1)>>1
+ psubb m8, m1
+ knotq k1, k1
+ paddsb m3{k1}, m3, m8
+ psubsb m6{k1}, m6, m8
+ pxor m3, pb128
+ pxor m6, pb128
+
+%if %1 == 16
+ ; flat16 filter
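+ ; the wide filter (taps p6..q6) is evaluated as a running 16-weight sum kept
+ ; in m10/m11 (low/high byte halves widened to words); each step reuses the
+ ; previous sum, with pmaddubsw against pb_m1_1 subtracting the outgoing
+ ; pixel and adding the incoming one in a single op, and pmulhrsw with
+ ; pw_2048 performing the rounded >> 4; results are only written where
+ ; flat16 & fm (k4)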
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*8]
+%endif
+ SWAP m24, m16, m14
+ SWAP m2, m17, m22
+ SWAP m7, m18
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ vpbroadcastd m1, [pb_7_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m14, m24, m25
+ punpckhbw m22, m24, m25
+ pmaddubsw m10, m14, m1
+ pmaddubsw m11, m22, m1 ; p6*7+p3
+ punpcklbw m8, m2, m7
+ punpckhbw m9, m2, m7
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3
+%ifidn %2, h
+ vpbroadcastd m27, [pw_2048]
+ vpbroadcastd m1, [pb_m1_1]
+ %define pw2048 m27
+ %define pbm1_1 m1
+%endif
+ punpcklbw m8, m13, m3
+ punpckhbw m9, m13, m3
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m8, m4, m5
+ punpckhbw m9, m4, m5
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
+ paddw m10, m8
+ paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5
+%else
+ vpblendmb m8{k4}, m2, m8
+ mova [rsp+1*64], m8
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, pbm1_1
+ pmaddubsw m22, pbm1_1
+ paddw m10, m14
+ paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m24, m6
+ punpckhbw m9, m24, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ SWAP m18, m8
+ SWAP m23, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+stride3q]{k4}, m8 ; p4
+%else
+ vpblendmb m8{k4}, m7, m8
+ mova [rsp+2*64], m8
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ SWAP m14, m16
+ punpcklbw m8, m24, m13
+ punpckhbw m9, m24, m13
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m8, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ SWAP m16, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
+%else
+ vpblendmb m8{k4}, m25, m8
+ mova [rsp+3*64], m8
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ SWAP m22, m17
+ punpcklbw m8, m24, m3
+ punpckhbw m9, m24, m3
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m8, m7, m22
+ punpckhbw m7, m22
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m7, pbm1_1
+ paddw m10, m8
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ SWAP m17, m8
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+%ifidn %2, v
+ lea t0, [dstq+strideq*4]
+%endif
+ punpcklbw m8, m24, m4
+ punpckhbw m9, m24, m4
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+ punpcklbw m8, m25, m29
+ punpckhbw m9, m25, m29
+ SWAP m26, m29
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ SWAP m29, m8
+ SWAP m0, m9
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+ vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, h
+ SWAP m28, m24
+ punpcklbw m8, m28, m5
+ punpckhbw m24, m28, m5
+%else
+ punpcklbw m8, m24, m5
+ punpckhbw m24, m5
+%endif
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m24, pbm1_1
+ paddw m10, m8
+ paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m24, m13, m30
+ punpckhbw m9, m13, m30
+%ifidn %2, h
+ SWAP m27, m30
+%endif
+ SWAP m13, m15
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m24
+ paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ SWAP m30, m24
+ SWAP m15, m9
+%ifidn %2, h
+ SWAP m9, m24
+ %define pw2048 m9
+%endif
+ pmulhrsw m24, m10, pw2048
+ pmulhrsw m8, m11, pw2048
+ paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ paddw m11, m23
+ packuswb m24, m8
+ punpcklbw m8, m3, m31
+ pmaddubsw m8, pbm1_1
+ paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m18, m8
+ pmulhrsw m8, m10, pw2048
+ paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m16, m9
+ %define pw2048 m16
+%endif
+ punpckhbw m9, m3, m31
+ SWAP m3, m12
+ pmaddubsw m9, pbm1_1
+ paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ SWAP m23, m9
+ pmulhrsw m9, m11, pw2048
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+%ifidn %2, h
+ SWAP m2, m1
+ %define pbm1_1 m2
+%endif
+ vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+ SWAP m24, m31 ; q6
+ packuswb m8, m9
+%ifidn %2, h
+ SWAP m31, m2
+ %define pbm1_1 m31
+%endif
+ vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ punpcklbw m8, m4, m24
+ punpckhbw m2, m4, m24
+ SWAP m4, m1
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m2, pbm1_1
+ paddw m10, m8
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+ pmulhrsw m2, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m2, m9
+ vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ paddw m11, m7
+ punpcklbw m8, m5, m24
+ punpckhbw m9, m5, m24
+ SWAP m5, m12
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m7, m9
+ vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ paddw m11, m0
+ punpcklbw m8, m6, m24
+ punpckhbw m9, m6, m24
+ SWAP 2, 6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+mstrideq]{k4}, m8
+%else
+ SWAP m29, m16
+ %define pw2048 m29
+ vpblendmb m16{k4}, m22, m8
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ paddw m11, m15
+%ifidn %2, h
+ SWAP m15, m8
+%endif
+ punpcklbw m8, m14, m24
+ punpckhbw m9, m14, m24
+ SWAP 14, 7
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m8, m10, pw2048
+ pmulhrsw m9, m11, pw2048
+ packuswb m8, m9
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
+%else
+ vpblendmb m17{k4}, m26, m8
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, m18 ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ paddw m11, m23
+ punpcklbw m8, m22, m24
+ punpckhbw m9, m22, m24
+ SWAP m30, m24
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m9, pbm1_1
+ paddw m10, m8
+ paddw m11, m9 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, pw2048
+ pmulhrsw m11, pw2048
+ packuswb m10, m11
+%ifidn %2, v
+ vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5
+%else
+ vmovdqu8 m27{k4}, m10
+%endif
+
+%ifidn %2, v
+ lea t0, [dstq+mstrideq*4]
+%endif
+%endif
+
+%if %1 >= 8
+ ; flat8 filter
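+ ; reference output formulas, each (sum + 4) >> 3, the rounding 4 folded in
+ ; by pairing q0 with pb_4 in the first accumulation:
+ ;   p2' = 3*p3 + 2*p2 +   p1 +   p0 +   q0
+ ;   p1' = 2*p3 +   p2 + 2*p1 +   p0 +   q0 +   q1
+ ;   p0' =   p3 +   p2 +   p1 + 2*p0 +   q0 +   q1 +   q2
+ ;   q0' =   p2 +   p1 +   p0 + 2*q0 +   q1 +   q2 +   q3
+ ;   q1' =   p1 +   p0 +   q0 + 2*q1 +   q2 + 2*q3
+ ;   q2' =   p0 +   q0 +   q1 + 2*q2 + 3*q3
+ ; applied only where the flat8 mask (k2) is set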
+ vpbroadcastd m9, [pb_3_1]
+ vpbroadcastd m10, [pb_2_1]
+%if %1 == 16
+ vpbroadcastd m23, [pb_1]
+ vpbroadcastd m0, [pb_4]
+%elifidn %2, h
+ vpbroadcastd m31, [pb_m1_1]
+ %define pbm1_1 m31
+%endif
+ punpcklbw m24, m25, m3
+ punpckhbw m26, m25, m3
+ pmaddubsw m2, m24, m9
+ pmaddubsw m7, m26, m9 ; 3 * p3 + p1
+ punpcklbw m8, m13, m4
+ punpckhbw m11, m13, m4
+ pmaddubsw m8, m10
+ pmaddubsw m11, m10
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m8, m5, m0
+ punpckhbw m11, m5, m0
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+%if is_h || %1 == 16
+ vpblendmb m10{k2}, m13, m8 ; p2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [t0+strideq*1]{k2}, m8
+ %else
+ mova [t0+strideq*1], m10
+ %endif
+%endif
+
+ pmaddubsw m8, m24, pbm1_1
+ pmaddubsw m11, m26, pbm1_1
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m2, m8
+ paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m3, m8 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m8
+%else
+ SWAP m18, m8
+%endif
+
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m8, m4, m14
+ punpckhbw m11, m4, m14
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ paddw m2, m8
+ paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m8{k2}, m4, m8 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m8
+%else
+ SWAP m29, m8
+%endif
+
+ punpcklbw m24, m5, m22
+ punpckhbw m26, m5, m22
+ pmaddubsw m8, m24, m23
+ pmaddubsw m11, m26, m23
+ paddw m2, m8
+ paddw m7, m11
+ punpcklbw m8, m4, m25
+ punpckhbw m11, m4, m25
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
+ psubw m2, m8
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m8, m2, 3
+ psrlw m11, m7, 3
+ packuswb m8, m11
+ vpblendmb m11{k2}, m5, m8 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%endif
+
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m26, pbm1_1
+ paddw m2, m24
+ paddw m7, m26
+ punpcklbw m8, m13, m6
+ punpckhbw m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m13, pbm1_1
+ paddw m2, m8
+ paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m8, m2, 3
+ psrlw m13, m7, 3
+ packuswb m8, m13
+ vpblendmb m13{k2}, m6, m8 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m13
+%endif
+
+ punpcklbw m24, m3, m6
+ punpckhbw m26, m3, m6
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m24, m14, m22
+ punpckhbw m26, m14, m22
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ paddw m2, m24
+ paddw m7, m26 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+%if is_h || %1 == 16
+ vpblendmb m2{k2}, m14, m2 ; q2
+%endif
+%ifidn %2, v
+ %if %1 == 8
+ vmovdqu8 [dstq+strideq*2]{k2}, m2
+ %else
+ mova [dstq+strideq*2], m2
+ %endif
+%endif
+
+%ifidn %2, h
+ SWAP m24, m18
+ SWAP m26, m29
+%if %1 == 8
+ ; 16x8 transpose
+ punpcklbw m3, m25, m10
+ punpckhbw m25, m10
+ punpcklbw m10, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m11, m13
+ punpckhbw m11, m13
+ punpcklbw m13, m2, m22
+ punpckhbw m2, m22
+ ;
+ punpcklwd m22, m3, m10
+ punpckhwd m3, m10
+ punpcklwd m10, m25, m24
+ punpckhwd m25, m24
+ punpcklwd m24, m26, m13
+ punpckhwd m26, m13
+ punpcklwd m13, m11, m2
+ punpckhwd m11, m2
+ ;
+ punpckldq m2, m22, m24
+ punpckhdq m22, m24
+ punpckldq m24, m3, m26
+ punpckhdq m3, m26
+ punpckldq m26, m10, m13
+ punpckhdq m10, m13
+ punpckldq m13, m25, m11
+ punpckhdq m25, m11
+ ; write 8x32
+ vpbroadcastd ym16, strided
+ pmulld ym16, [hmulD]
+ lea t1, [dstq+strideq*2]
+ lea t2, [dstq+strideq*4]
+ lea t3, [t1 +strideq*4]
+ lea t0, [dstq+strideq*8]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [dstq+ym16-4]{k1}, m2
+ vpscatterdq [t1 +ym16-4]{k2}, m22
+ vpscatterdq [t2 +ym16-4]{k3}, m24
+ vpscatterdq [t3 +ym16-4]{k4}, m3
+ lea t1, [t0+strideq*2]
+ lea t2, [t0+strideq*4]
+ lea t3, [t1+strideq*4]
+ kmovb k1, k6
+ kmovb k2, k6
+ kmovb k3, k6
+ kmovb k4, k6
+ vpscatterdq [t0+ym16-4]{k1}, m26
+ vpscatterdq [t1+ym16-4]{k2}, m10
+ vpscatterdq [t2+ym16-4]{k3}, m13
+ vpscatterdq [t3+ym16-4]{k4}, m25
+%else
+ ; 16x16 transpose and store
+ SWAP 5, 10, 2
+ SWAP 6, 24
+ SWAP 7, 26
+ SWAP 8, 11
+ SWAP 9, 13
+ mova m24, [rsp+0*64]
+ SWAP m26, m28
+ mova m2, [rsp+1*64]
+ mova m3, [rsp+2*64]
+ mova m4, [rsp+3*64]
+ SWAP m11, m16
+ SWAP m25, m17
+ SWAP m13, m27
+ SWAP m14, m30
+ TRANSPOSE_16X16B 1, 0, [rsp+4*64]
+ movu [dstq+strideq*0-8], xm24
+ movu [dstq+strideq*1-8], xm26
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea t0, [dstq+strideq*4]
+ movu [t0+strideq*0-8], xm4
+ movu [t0+strideq*1-8], xm5
+ movu [t0+strideq*2-8], xm6
+ movu [t0+stride3q -8], xm7
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm8
+ movu [t0+strideq*1-8], xm9
+ movu [t0+strideq*2-8], xm10
+ movu [t0+stride3q -8], xm11
+ lea t0, [t0+strideq*4]
+ movu [t0+strideq*0-8], xm25
+ movu [t0+strideq*1-8], xm13
+ movu [t0+strideq*2-8], xm14
+ movu [t0+stride3q -8], xm22
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym24, 1
+ vextracti128 [t0+strideq*1-8], ym26, 1
+ vextracti128 [t0+strideq*2-8], ym2, 1
+ vextracti128 [t0+stride3q -8], ym3, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym4, 1
+ vextracti128 [t0+strideq*1-8], ym5, 1
+ vextracti128 [t0+strideq*2-8], ym6, 1
+ vextracti128 [t0+stride3q -8], ym7, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym8, 1
+ vextracti128 [t0+strideq*1-8], ym9, 1
+ vextracti128 [t0+strideq*2-8], ym10, 1
+ vextracti128 [t0+stride3q -8], ym11, 1
+ lea t0, [t0+strideq*4]
+ vextracti128 [t0+strideq*0-8], ym25, 1
+ vextracti128 [t0+strideq*1-8], ym13, 1
+ vextracti128 [t0+strideq*2-8], ym14, 1
+ vextracti128 [t0+stride3q -8], ym22, 1
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 2
+ vextracti32x4 [t0+strideq*1-8], m26, 2
+ vextracti32x4 [t0+strideq*2-8], m2, 2
+ vextracti32x4 [t0+stride3q -8], m3, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 2
+ vextracti32x4 [t0+strideq*1-8], m5, 2
+ vextracti32x4 [t0+strideq*2-8], m6, 2
+ vextracti32x4 [t0+stride3q -8], m7, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 2
+ vextracti32x4 [t0+strideq*1-8], m9, 2
+ vextracti32x4 [t0+strideq*2-8], m10, 2
+ vextracti32x4 [t0+stride3q -8], m11, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 2
+ vextracti32x4 [t0+strideq*1-8], m13, 2
+ vextracti32x4 [t0+strideq*2-8], m14, 2
+ vextracti32x4 [t0+stride3q -8], m22, 2
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m24, 3
+ vextracti32x4 [t0+strideq*1-8], m26, 3
+ vextracti32x4 [t0+strideq*2-8], m2, 3
+ vextracti32x4 [t0+stride3q -8], m3, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m4, 3
+ vextracti32x4 [t0+strideq*1-8], m5, 3
+ vextracti32x4 [t0+strideq*2-8], m6, 3
+ vextracti32x4 [t0+stride3q -8], m7, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m8, 3
+ vextracti32x4 [t0+strideq*1-8], m9, 3
+ vextracti32x4 [t0+strideq*2-8], m10, 3
+ vextracti32x4 [t0+stride3q -8], m11, 3
+ lea t0, [t0+strideq*4]
+ vextracti32x4 [t0+strideq*0-8], m25, 3
+ vextracti32x4 [t0+strideq*1-8], m13, 3
+ vextracti32x4 [t0+strideq*2-8], m14, 3
+ vextracti32x4 [t0+stride3q -8], m22, 3
+%endif
+%endif
+
+%elif %1 == 6
+ ; flat6 filter
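+ ; reference output formulas, each (sum + 4) >> 3 via pmulhrsw with pw_4096
+ ; (held in m16 by the uv entry points):
+ ;   p1' = 3*p2 + 2*p1 + 2*p0 +   q0
+ ;   p0' =   p2 + 2*p1 + 2*p0 + 2*q0 +   q1
+ ;   q0' =   p1 + 2*p0 + 2*q0 + 2*q1 +   q2
+ ;   q1' =   p0 + 2*q0 + 2*q1 + 3*q2
+ ; applied only where the flat8 mask (k2) is set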
+ vpbroadcastd m15, [pb_3_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, m15
+ pmaddubsw m1, m11, m15
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, m12
+ pmaddubsw m12, m10, m12
+%ifidn %2, h
+ vpbroadcastd m15, [pb_m1_1]
+ %define pbm1_1 m15
+%endif
+ paddw m0, m2
+ paddw m1, m12
+ pmulhrsw m2, m0, m16
+ pmulhrsw m12, m1, m16
+ packuswb m2, m12
+ vpblendmb m2{k2}, m3, m2 ; p1
+%ifidn %2, v
+ mova [t0+strideq*2], m2
+%endif
+
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+ pmaddubsw m8, pbm1_1
+ pmaddubsw m11, pbm1_1
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m12, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m12, m13
+ vpblendmb m12{k2}, m4, m12 ; p0
+%ifidn %2, v
+ mova [t0+stride3q], m12
+%endif
+
+ vpbroadcastd m9, [pb_m1_2]
+ vpbroadcastd m4, [pb_m1_0]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+ pmaddubsw m14, m8, pbm1_1
+ pmaddubsw m13, m11, pbm1_1
+ paddw m0, m14
+ paddw m1, m13
+ pmulhrsw m14, m0, m16
+ pmulhrsw m13, m1, m16
+ packuswb m14, m13
+ vpblendmb m14{k2}, m5, m14 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m14
+%endif
+
+ pmaddubsw m8, m9
+ pmaddubsw m11, m9
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, m4
+ pmaddubsw m10, m4
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, m16
+ pmulhrsw m1, m16
+ packuswb m0, m1
+ vpblendmb m0{k2}, m6, m0 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else ; %1 == 4
+%ifidn %2, v
+ mova [t0+strideq*0], m3 ; p1
+ mova [t0+strideq*1], m4 ; p0
+ mova [t0+strideq*2], m5 ; q0
+ mova [t0+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
+%endif
+%endif
+%endmacro
+
+%define k7 k6
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m28, [pb_m1_1]
+ vpbroadcastd m27, [pw_2048]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m28
+ %define pw2048 m27
+
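+ ; each iteration filters a 64-pixel-wide span (16 four-pixel units); the
+ ; vmask words pick the widest filter needed anywhere in the span, and the
+ ; per-unit masks inside FILTER narrow that down per column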
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11, 12
+ shl l_strideq, 2
+ sub lq, 4
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ kxnorw k6, k6, k6
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
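+ ; m21/m20/m19 = the broadcast strides multiplied per lane by the
+ ; hmulA/hmulB/hmulC tables, i.e. per-lane offsets for gather-based
+ ; addressing in the horizontal path (m20, for instance, indexes the
+ ; L values inside FILTER)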
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask [pb_mask]
+ %define pb128 [pb_128]{bcstd}
+ shl l_strideq, 1
+
+.loop:
+ cmp word [maskq+8], 0 ; vmask[2]
+ je .no_flat16
+
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call .h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.h4:
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride
+ DECLARE_REG_TMP 9
+ shl l_strideq, 2
+ sub lq, l_strideq
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
+ vpbroadcastd m17, [pb_m1_1]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
+ %define pbm1_1 m17
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
+
+.end:
+ add lq, 64
+ add dstq, 64
+ add maskq, 2
+ sub wd, 16
+ jg .loop
+ RET
+
+%undef k7
+cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11
+ mov r7d, 0xffff
+ movzx r8d, r7b
+ cmp hd, 9
+ cmovb r7d, r8d
+ kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff
+ shl l_strideq, 2
+ sub lq, 4
+ kshiftrw k7, k6, 4 ; h > 8 ? 0x0fff : 0x000f
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
+ mova m18, [pb_mask]
+ vpbroadcastd m17, [pb_128]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask m18
+ %define pb128 m17
+ add l_strideq, l_strideq
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter_sse.asm b/third_party/dav1d/src/x86/loopfilter_sse.asm
new file mode 100644
index 0000000000..cd0eb54702
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_sse.asm
@@ -0,0 +1,2348 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+pb_7_1: times 8 db 7, 1
+pb_3_1: times 8 db 3, 1
+pb_2_1: times 8 db 2, 1
+pb_m1_0: times 8 db -1, 0
+pb_m1_1: times 8 db -1, 1
+pb_m1_2: times 8 db -1, 2
+pb_1: times 16 db 1
+pb_2: times 16 db 2
+pb_3: times 16 db 3
+pb_4: times 16 db 4
+pb_16: times 16 db 16
+pb_63: times 16 db 63
+pb_64: times 16 db 64
+pb_128: times 16 db 0x80
+pb_129: times 16 db 0x81
+pb_240: times 16 db 0xf0
+pb_248: times 16 db 0xf8
+pb_254: times 16 db 0xfe
+
+pw_2048: times 8 dw 2048
+pw_4096: times 8 dw 4096
+
+pd_mask: dd 1, 2, 4, 8
+
+SECTION .text
+
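+; ABSSUB dst, a, b, tmp: dst = |a - b| for unsigned bytes, via the two
+; saturating differences ORed together (one of them is always zero)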
+%macro ABSSUB 4 ; dst, a, b, tmp
+ psubusb %1, %2, %3
+ psubusb %4, %3, %2
+ por %1, %4
+%endmacro
+
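+; transpose four registers holding 16 rows x 4 columns of pixels and store
+; them back as 4-byte rows at dst-2 (the h-direction store path for the
+; 4-pixel-wide outputs)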
+%macro TRANSPOSE_16x4_AND_WRITE_4x16 5
+ ; transpose 16x4
+ punpcklbw m%5, m%1, m%2
+ punpckhbw m%1, m%2
+ punpcklbw m%2, m%3, m%4
+ punpckhbw m%3, m%4
+ punpcklwd m%4, m%5, m%2
+ punpckhwd m%5, m%2
+ punpcklwd m%2, m%1, m%3
+ punpckhwd m%1, m%3
+
+ ; write out
+%assign %%n 0
+%rep 4
+ movd [dstq+strideq *0-2], xm%4
+ movd [dstq+strideq *4-2], xm%5
+ movd [dstq+strideq *8-2], xm%2
+ movd [dstq+stride3q*4-2], xm%1
+ add dstq, strideq
+%if %%n < 3
+ psrldq xm%4, 4
+ psrldq xm%5, 4
+ psrldq xm%2, 4
+ psrldq xm%1, 4
+%endif
+%assign %%n (%%n+1)
+%endrep
+ lea dstq, [dstq+stride3q*4]
+%endmacro
+
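+; full 16x16 byte transpose of m0-m15 in four punpck stages (bytes, words,
+; dwords, qwords); on x86-32, where only 8 XMM registers are available, the
+; second half of the rows is kept on the stack and shuffled through memory
+; between stages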
+%macro TRANSPOSE_16X16B 2 ; output_transpose, mem
+%if %1 == 0
+ mova %2, m15 ; m7 in 32-bit
+%endif
+
+ ; input in m0-7
+ punpcklbw m15, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ punpcklbw m3, m4, m5
+ punpckhbw m4, m5
+%if ARCH_X86_64
+ SWAP 4, 5, 7
+%else
+ %if %1 == 0
+ mova m5, %2
+ %else
+ mova m5, [esp+1*16]
+ %endif
+ mova %2, m4
+%endif
+ punpcklbw m4, m6, m5
+ punpckhbw m6, m5
+
+ ; interleaved in m15,0,1,2,3,7,4,6
+ punpcklwd m5, m15, m1
+ punpckhwd m15, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+%if ARCH_X86_64
+ SWAP 3, 4, 7
+%else
+ mova m4, %2
+ mova %2, m3
+%endif
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+
+ ; interleaved in m5,15,1,0,2,7,3,4
+ punpckldq m6, m5, m2
+ punpckhdq m5, m2
+%if ARCH_X86_64
+ SWAP 2, 7, 5
+%else
+ mova m2, %2
+ mova [esp+1*16], m5
+%endif
+ punpckldq m5, m15, m2
+ punpckhdq m15, m2
+ punpckldq m2, m1, m3
+ punpckhdq m1, m3
+ punpckldq m3, m0, m4
+ punpckhdq m0, m4
+
+%if ARCH_X86_32
+ mova [esp+0*16], m6
+ mova [esp+2*16], m5
+ mova [esp+3*16], m15
+ mova [esp+4*16], m2
+ mova [esp+5*16], m1
+ mova [esp+6*16], m3
+ mova [esp+7*16], m0
+ mova m8, [esp+ 8*16]
+ mova m9, [esp+ 9*16]
+ mova m10, [esp+10*16]
+ %if %1 == 0
+ mova m11, [esp+11*16]
+ mova m12, [esp+12*16]
+ mova m13, [esp+13*16]
+ mova m14, [esp+14*16]
+ %else
+ mova m11, [esp+20*16]
+ mova m12, [esp+15*16]
+ mova m13, [esp+16*16]
+ mova m14, [esp+17*16]
+ %endif
+%endif
+
+ ; input in m8-m15
+%if ARCH_X86_64
+ SWAP 7, 4
+%endif
+ punpcklbw m7, m8, m9
+ punpckhbw m8, m9
+ punpcklbw m9, m10, m11
+ punpckhbw m10, m11
+ punpcklbw m11, m12, m13
+ punpckhbw m12, m13
+%if ARCH_X86_64
+ mova m13, %2
+%else
+ %if %1 == 0
+ mova m13, [esp+15*16]
+ %else
+ mova m13, [esp+18*16]
+ %endif
+%endif
+ mova %2, m12
+ punpcklbw m12, m14, m13
+ punpckhbw m14, m14, m13
+
+ ; interleaved in m7,8,9,10,11,rsp%2,12,14
+ punpcklwd m13, m7, m9
+ punpckhwd m7, m9
+ punpcklwd m9, m8, m10
+ punpckhwd m8, m10
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ mova m12, %2
+ mova %2, m11
+ punpcklwd m11, m12, m14
+ punpckhwd m12, m14
+
+ ; interleaved in m13,7,9,8,10,rsp%2,11,12
+ punpckldq m14, m13, m10
+ punpckhdq m13, m10
+ punpckldq m10, m9, m11
+ punpckhdq m9, m11
+ punpckldq m11, m8, m12
+ punpckhdq m8, m12
+ mova m12, %2
+ mova %2, m8
+ punpckldq m8, m7, m12
+ punpckhdq m7, m12
+
+%if ARCH_X86_32
+ mova [esp+ 8*16], m10
+ mova [esp+ 9*16], m9
+ mova [esp+10*16], m11
+ SWAP 6, 1
+ SWAP 4, 2
+ SWAP 5, 3
+ mova m6, [esp+0*16]
+ mova m4, [esp+1*16]
+ mova m5, [esp+2*16]
+%endif
+
+ ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7
+ punpcklqdq m12, m6, m14
+ punpckhqdq m6, m14
+ punpcklqdq m14, m4, m13
+ punpckhqdq m4, m13
+ punpcklqdq m13, m5, m8
+ punpckhqdq m5, m8
+%if ARCH_X86_64
+ SWAP 8, 5
+%else
+ mova m8, [esp+3*16]
+ mova [esp+27*16], m5
+ %define m15 m8
+%endif
+ punpcklqdq m5, m15, m7
+ punpckhqdq m15, m7
+
+%if ARCH_X86_32
+ mova [esp+11*16], m12
+ mova [esp+12*16], m6
+ mova [esp+13*16], m14
+ mova [esp+14*16], m4
+ mova [esp+26*16], m13
+ mova [esp+ 0*16], m5
+ mova [esp+ 1*16], m15
+ mova m2, [esp+ 4*16]
+ mova m10, [esp+ 8*16]
+ mova m1, [esp+ 5*16]
+ mova m9, [esp+ 9*16]
+ mova m3, [esp+ 6*16]
+ mova m11, [esp+10*16]
+ mova m0, [esp+ 7*16]
+%endif
+
+ punpcklqdq m7, m2, m10
+ punpckhqdq m2, m10
+ punpcklqdq m10, m1, m9
+ punpckhqdq m1, m9
+ punpcklqdq m9, m3, m11
+ punpckhqdq m3, m11
+ mova m11, %2
+%if ARCH_X86_32
+ %define m12 m3
+%endif
+ mova %2, m12
+ punpcklqdq m12, m0, m11
+ punpckhqdq m0, m11
+%if %1 == 1
+ mova m11, %2
+%endif
+
+%if ARCH_X86_64
+ ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0
+ SWAP 0, 11, 1, 6, 5, 8, 7, 15
+ SWAP 2, 14, 12, 9
+ SWAP 3, 4, 13
+%else
+ %if %1 == 0
+ mova [esp+15*16], m9
+ mova [esp+17*16], m12
+ mova [esp+18*16], m0
+ mova [esp+28*16], m10
+ mova [esp+29*16], m1
+ mova m3, [esp+0*16]
+ mova m4, [esp+1*16]
+ SWAP m5, m7
+ SWAP m6, m2
+ %else
+ SWAP 0, 7
+ SWAP 3, 1, 2, 4, 6
+ %endif
+%endif
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%if ARCH_X86_64
+ %define %%flat8mem [rsp+0*16]
+ %define %%q2mem [rsp+1*16]
+ %define %%q3mem [rsp+2*16]
+%else
+ %if %1 == 4 || %1 == 6
+ %define %%p2mem [esp+ 8*16]
+ %define %%q2mem [esp+ 9*16]
+ %define %%flat8mem [esp+10*16]
+ %else
+ %ifidn %2, v
+ %define %%p2mem [esp+16*16]
+ %define %%q2mem [esp+ 1*16]
+ %define %%q3mem [esp+18*16]
+ %define %%flat8mem [esp+ 0*16]
+ %define %%flat16mem [esp+20*16]
+ %else
+ %define %%p2mem [esp+27*16]
+ %define %%q2mem [esp+28*16]
+ %define %%q3mem [esp+29*16]
+ %define %%flat8mem [esp+21*16]
+ %define %%flat16mem [esp+30*16]
+ %endif
+ %endif
+ %xdefine m12reg m12
+%endif
+
+%if ARCH_X86_32
+ lea stride3q, [strideq*3]
+%endif
+ ; load data
+%ifidn %2, v
+%if ARCH_X86_32
+ mov mstrideq, strideq
+ neg mstrideq
+%endif
+%if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ mova m3, [tmpq+strideq*0] ; p1
+ mova m4, [tmpq+strideq*1] ; p0
+ mova m5, [tmpq+strideq*2] ; q0
+ mova m6, [tmpq+stride3q] ; q1
+%else
+ ; load 6-8 pixels, remainder (for wd=16) will be read inline
+ lea tmpq, [dstq+mstrideq*4]
+ ; we load p3 later
+%define %%p3mem [dstq+mstrideq*4]
+ %if ARCH_X86_32
+ %define m13 m0
+ %define m14 m1
+ %define m15 m2
+ %endif
+ mova m13, [tmpq+strideq*1]
+ mova m3, [tmpq+strideq*2]
+ mova m4, [tmpq+stride3q]
+ mova m5, [dstq+strideq*0]
+ mova m6, [dstq+strideq*1]
+ mova m14, [dstq+strideq*2]
+%if %1 != 6
+ mova m15, [dstq+stride3q]
+%endif
+ %if ARCH_X86_32
+ mova %%p2mem, m13
+ mova %%q2mem, m14
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %if %1 != 6
+ mova %%q3mem, m15
+ %define m15 %%q3mem
+ %endif
+ %endif
+%endif
+%else ; %2 == h
+ ; load lines
+%if %1 == 4
+ ; transpose 4x16
+ movd m7, [dstq+strideq*0-2]
+ movd m3, [dstq+strideq*1-2]
+ movd m4, [dstq+strideq*2-2]
+ movd m5, [dstq+stride3q -2]
+ lea tmpq, [dstq+strideq*4]
+ punpcklbw m7, m3
+ punpcklbw m4, m5
+ movd m3, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ movd m5, [tmpq+strideq*2-2]
+ movd m6, [tmpq+stride3q -2]
+ lea tmpq, [tmpq+strideq*4]
+ punpcklbw m3, m1
+ punpcklbw m5, m6
+ movd m0, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ punpcklbw m0, m1
+ movd m1, [tmpq+strideq*2-2]
+ movd m2, [tmpq+stride3q -2]
+ punpcklbw m1, m2
+ punpcklqdq m7, m0
+ punpcklqdq m4, m1
+ lea tmpq, [tmpq+strideq*4]
+ movd m0, [tmpq+strideq*0-2]
+ movd m1, [tmpq+strideq*1-2]
+ punpcklbw m0, m1
+ movd m1, [tmpq+strideq*2-2]
+ movd m2, [tmpq+stride3q -2]
+ punpcklbw m1, m2
+ punpcklqdq m3, m0
+ punpcklqdq m5, m1
+ ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+ ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+ ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+ ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+ punpcklwd m6, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ ; xm6: A0-3,B0-3,C0-3,D0-3
+ ; xm7: A8-11,B8-11,C8-11,D8-11
+ ; xm4: A4-7,B4-7,C4-7,D4-7
+ ; xm3: A12-15,B12-15,C12-15,D12-15
+ punpckldq m5, m6, m4
+ punpckhdq m6, m4
+ punpckldq m4, m7, m3
+ punpckhdq m7, m3
+ ; xm5: A0-7,B0-7
+ ; xm6: C0-7,D0-7
+ ; xm4: A8-15,B8-15
+ ; xm7: C8-15,D8-15
+ punpcklqdq m3, m5, m4
+ punpckhqdq m5, m5, m4
+ punpcklqdq m4, m6, m7
+ punpckhqdq m6, m7
+ ; xm3: A0-15
+ ; xm5: B0-15
+ ; xm4: C0-15
+ ; xm6: D0-15
+ SWAP 4, 5
+%elif %1 == 6 || %1 == 8
+ ; transpose 8x16
+ movq m7, [dstq+strideq*0-%1/2]
+ movq m3, [dstq+strideq*1-%1/2]
+ movq m4, [dstq+strideq*2-%1/2]
+ movq m5, [dstq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*8]
+ punpcklbw m7, m3
+ punpcklbw m4, m5
+ movq m3, [tmpq+strideq*0-%1/2]
+ movq m1, [tmpq+strideq*1-%1/2]
+ movq m5, [tmpq+strideq*2-%1/2]
+ movq m6, [tmpq+stride3q -%1/2]
+ lea tmpq, [dstq+strideq*4]
+ punpcklbw m3, m1
+ punpcklbw m5, m6
+ movq m6, [tmpq+strideq*0-%1/2]
+ movq m0, [tmpq+strideq*1-%1/2]
+ movq m1, [tmpq+strideq*2-%1/2]
+ movq m2, [tmpq+stride3q -%1/2]
+ lea tmpq, [tmpq+strideq*8]
+ punpcklbw m6, m0
+ punpcklbw m1, m2
+ movq m2, [tmpq+strideq*2-%1/2]
+ movq m0, [tmpq+stride3q -%1/2]
+ punpcklbw m2, m0
+%if ARCH_X86_64
+ SWAP m15, m2
+%else
+ %define m15 [esp+3*16]
+ mova m15, m2
+%endif
+ movq m0, [tmpq+strideq*0-%1/2]
+ movq m2, [tmpq+strideq*1-%1/2]
+ punpcklbw m0, m2
+ ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+ ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+ ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+ ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+ ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+ ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+ ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+ ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+ punpcklwd m2, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m6, m1
+ punpckhwd m6, m1
+ punpcklwd m1, m0, m15
+ punpckhwd m0, m15
+%if ARCH_X86_64
+ SWAP m15, m0
+%else
+ mova m15, m0
+%endif
+ ; xm2: A0-3,B0-3,C0-3,D0-3
+ ; xm7: E0-3,F0-3,G0-3,H0-3
+ ; xm4: A8-11,B8-11,C8-11,D8-11
+ ; xm3: E8-11,F8-11,G8-11,H8-11
+ ; xm5: A4-7,B4-7,C4-7,D4-7
+ ; xm6: E4-7,F4-7,G4-7,H4-7
+ ; xm1: A12-15,B12-15,C12-15,D12-15
+ ; xm0: E12-15,F12-15,G12-15,H12-15
+ punpckldq m0, m2, m5
+ punpckhdq m2, m5
+ punpckldq m5, m7, m6
+%if %1 != 6
+ punpckhdq m7, m6
+%endif
+ punpckldq m6, m4, m1
+ punpckhdq m4, m1
+ punpckldq m1, m3, m15
+%if %1 != 6
+ punpckhdq m3, m15
+ %if ARCH_X86_64
+ SWAP m15, m3
+ %else
+ mova m15, m3
+ %endif
+%endif
+ ; xm0: A0-7,B0-7
+ ; xm2: C0-7,D0-7
+ ; xm5: E0-7,F0-7
+ ; xm7: G0-7,H0-7
+ ; xm6: A8-15,B8-15
+ ; xm4: C8-15,D8-15
+ ; xm1: E8-15,F8-15
+ ; xm3: G8-15,H8-15
+ punpcklqdq m3, m0, m6
+ punpckhqdq m0, m6
+ punpckhqdq m6, m2, m4
+ punpcklqdq m2, m4
+ punpcklqdq m4, m5, m1
+ punpckhqdq m5, m1
+%if %1 == 8
+ punpcklqdq m1, m7, m15
+ punpckhqdq m7, m15
+ ; xm3: A0-15
+ ; xm0: B0-15
+ ; xm2: C0-15
+ ; xm6: D0-15
+ ; xm4: E0-15
+ ; xm5: F0-15
+ ; xm1: G0-15
+ ; xm7: H0-15
+%if ARCH_X86_64
+ SWAP 11, 3, 2
+ SWAP 13, 0
+ SWAP 6, 5, 4
+ SWAP 14, 1
+ SWAP 15, 7
+ ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15
+ mova [rsp+21*16], m11
+ %define %%p3mem [rsp+21*16]
+%else
+ %define m11 [esp+26*16]
+ %define m13 [esp+27*16]
+ %define m14 [esp+28*16]
+ %define m15 [esp+29*16]
+ mova m11, m3
+ mova m13, m0
+ SWAP 3, 2
+ SWAP 6, 5, 4
+ mova m14, m1
+ mova m15, m7
+ %define %%p3mem [esp+26*16]
+%endif
+%else
+ %if ARCH_X86_64
+ SWAP 13, 3, 0
+ SWAP 14, 5, 6, 4, 2
+ ; 3,0,2,6,4,5 -> 13,3,4,5,6,14
+ %else
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ mova m13, m3
+ mova m14, m5
+ SWAP 3, 0
+ SWAP 5, 6, 4, 2
+ ; 0,2,6,4 -> 3,4,5,6
+ %endif
+%endif
+%else
+%if ARCH_X86_64
+ mova [rsp+20*16], m12
+%endif
+ ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+ ; remainder at the end for the second transpose
+%if ARCH_X86_32
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ lea tmpq, [dstq+strideq*8]
+ movu m8, [tmpq+strideq*0-8]
+ movu m9, [tmpq+strideq*1-8]
+ movu m10, [tmpq+strideq*2-8]
+ movu m11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu m12, [tmpq+strideq*0-8]
+ movu m13, [tmpq+strideq*1-8]
+ movu m14, [tmpq+strideq*2-8]
+ movu m15, [tmpq+stride3q -8]
+ mova [esp+ 8*16], m8
+ mova [esp+ 9*16], m9
+ mova [esp+10*16], m10
+ mova [esp+11*16], m11
+ mova [esp+12*16], m12
+ mova [esp+13*16], m13
+ mova [esp+14*16], m14
+ mova [esp+15*16], m15
+%endif
+ movu m0, [dstq+strideq*0-8]
+ movu m1, [dstq+strideq*1-8]
+ movu m2, [dstq+strideq*2-8]
+ movu m3, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4]
+ movu m4, [tmpq+strideq*0-8]
+ movu m5, [tmpq+strideq*1-8]
+ movu m6, [tmpq+strideq*2-8]
+ movu m7, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+%if ARCH_X86_64
+ movu m8, [tmpq+strideq*0-8]
+ movu m9, [tmpq+strideq*1-8]
+ movu m10, [tmpq+strideq*2-8]
+ movu m11, [tmpq+stride3q -8]
+ lea tmpq, [tmpq+strideq*4]
+ movu m12, [tmpq+strideq*0-8]
+ movu m13, [tmpq+strideq*1-8]
+ movu m14, [tmpq+strideq*2-8]
+ movu m15, [tmpq+stride3q -8]
+%endif
+
+%if ARCH_X86_64
+ TRANSPOSE_16X16B 0, [rsp+11*16]
+ mova [rsp+12*16], m1
+ mova [rsp+13*16], m2
+ mova [rsp+14*16], m3
+ mova [rsp+15*16], m12
+ mova [rsp+16*16], m13
+ mova [rsp+17*16], m14
+ mova [rsp+18*16], m15
+ ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+ SWAP 12, 4, 7
+ SWAP 13, 5, 8
+ SWAP 3, 6, 9
+ SWAP 10, 14
+ SWAP 11, 15
+ mova [rsp+21*16], m12
+ %define %%p3mem [rsp+21*16]
+ mova m12, [rsp+20*16]
+%else
+ TRANSPOSE_16X16B 0, [esp+16*16]
+ %define %%p3mem [esp+26*16]
+ %define m11 %%p3mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+%endif ; if 4 elif 6 or 8 else 16
+%endif ; if v else h
+
+ ; load L/E/I/H
+%if ARCH_X86_32
+ mov l_strideq, l_stridem
+%endif
+%ifidn %2, v
+ movu m1, [lq]
+ movu m0, [lq+l_strideq]
+%else
+ %if ARCH_X86_32
+ lea l_stride3q, [l_strideq*3]
+ %endif
+ movq xm1, [lq]
+ movq xm2, [lq+l_strideq*2]
+ movhps xm1, [lq+l_strideq]
+ movhps xm2, [lq+l_stride3q]
+ shufps m0, m1, m2, q3131
+ shufps m1, m2, q2020
+ %if ARCH_X86_32
+ lea stride3q, [strideq*3]
+ %endif
+%endif
+
+%if ARCH_X86_32
+ %ifidn %2, v
+ mov lutd, lutm
+ %endif
+%endif
+ pxor m2, m2
+ pcmpeqb m7, m2, m0
+ pand m1, m7
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
+ pcmpeqb m2, m0 ; !L
+ psrlq m7, m0, [lutq+128]
+ pand m7, [PIC_sym(pb_63)]
+ pminub m7, minlvl
+ pmaxub m7, [PIC_sym(pb_1)] ; I
+ pand m1, m0, [PIC_sym(pb_240)]
+ psrlq m1, 4 ; H
+ paddb m0, [PIC_sym(pb_2)]
+ paddb m0, m0
+ paddb m0, m7 ; E
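+ ; SSE has no unsigned byte compare, so E/I/H (and later the absolute
+ ; differences) are biased by 0x80 and compared with signed pcmpgtb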
+ pxor m1, [PIC_sym(pb_128)]
+ pxor m7, [PIC_sym(pb_128)]
+ pxor m0, [PIC_sym(pb_128)]
+ SWAP 2, 7
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 2, 10
+%else
+ %ifidn %2, v
+ mov mstrideq, strideq
+ neg mstrideq
+ %if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ %elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+ %endif
+ %endif
+ mova [esp+3*16], m0
+ mova [esp+4*16], m2
+%endif
+
+ ABSSUB m0, m3, m4, m2 ; abs(p1-p0)
+ pmaxub m0, m7
+ ABSSUB m2, m5, m6, m7 ; abs(q1-q0)
+ pmaxub m0, m2
+%if %1 == 4
+ pxor m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m0, m1 ; hev
+ %if ARCH_X86_64
+ SWAP 7, 11
+ %else
+ mova [esp+5*16], m7
+ %endif
+%else
+ pxor m7, m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m1 ; hev
+%if ARCH_X86_64
+ SWAP 7, 11
+%else
+ mova [esp+5*16], m7
+%endif
+
+%if %1 == 6
+ ABSSUB m1, m13, m4, m7 ; abs(p2-p0)
+ pmaxub m1, m0
+%else
+ mova m2, %%p3mem
+ ABSSUB m1, m2, m4, m7 ; abs(p3-p0)
+ pmaxub m1, m0
+ ABSSUB m7, m13, m4, m2 ; abs(p2-p0)
+ pmaxub m1, m7
+%endif
+ ABSSUB m7, m5, m14, m2 ; abs(q2-q0)
+ pmaxub m1, m7
+%if %1 != 6
+ ABSSUB m7, m5, m15, m2 ; abs(q3-q0)
+ pmaxub m1, m7
+%endif
+ pxor m1, [PIC_sym(pb_128)]
+ pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in
+%if ARCH_X86_64
+ SWAP 1, 9
+%else
+ mova [esp+6*16], m1
+%endif
+
+%if %1 == 6
+ ABSSUB m7, m13, m3, m1 ; abs(p2-p1)
+%else
+ mova m2, %%p3mem
+ ABSSUB m7, m2, m13, m1 ; abs(p3-p2)
+ ABSSUB m2, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m7, m2
+ ABSSUB m2, m14, m15, m1 ; abs(q3-q2)
+ pmaxub m7, m2
+%endif
+ ABSSUB m2, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m7, m2
+%if ARCH_X86_32
+ %define m12 m1
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+ pand m7, m2 ; only apply fm-wide to wd>4 blocks
+ pmaxub m0, m7
+
+ pxor m0, [PIC_sym(pb_128)]
+%endif ; %if %1 == 4 else
+%if ARCH_X86_64
+ SWAP 2, 10
+ pcmpgtb m0, m2
+%else
+ pcmpgtb m0, [esp+4*16]
+%endif
+
+ ABSSUB m1, m3, m6, m7 ; abs(p1-q1)
+ ABSSUB m7, m4, m5, m2 ; abs(p0-q0)
+ paddusb m7, m7
+ pand m1, [PIC_sym(pb_254)]
+ psrlq m1, 1
+ paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pxor m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+ pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+%else
+ pcmpgtb m1, [esp+3*16]
+%endif
+ por m0, m1
+
+%if %1 == 16
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [esp+3*16], m0
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+12*16]
+%endif
+ ABSSUB m1, m0, m4, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+13*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+stride3q]
+%else
+ mova m0, [rsp+14*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+%else
+ mova m0, [rsp+15*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+16*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+17*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+ pxor m1, [PIC_sym(pb_128)]
+ pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out
+%if ARCH_X86_64
+ por m1, m9 ; !flat8in | !flat8out
+%else
+ por m1, [esp+6*16]
+ %define m12 m7
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask2
+ pcmpeqd m2, m12
+ pandn m1, m2 ; flat16
+%if ARCH_X86_64
+ pandn m2, m8, m1 ; flat16 & fm
+%else
+ pandn m2, [esp+3*16], m1 ; flat16 & fm
+ mova %%flat16mem, m2
+%endif
+ SWAP 1, 2
+
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+%if ARCH_X86_64
+ pandn m9, m2 ; flat8in
+ pandn m2, m8, m9
+ SWAP 2, 9
+%else
+ pandn m0, [esp+6*16], m2
+ pandn m2, [esp+3*16], m0
+ mova [esp+6*16], m2
+%endif
+ pand m2, m12, mask0
+ pcmpeqd m2, m12
+%if ARCH_X86_64
+ pandn m8, m2
+ pandn m2, m9, m8 ; fm & !flat8 & !flat16
+ SWAP 2, 8
+ pandn m2, m1, m9 ; flat8 & !flat16
+ SWAP 2, 9
+ SWAP 0, 8
+ SWAP 1, 10
+%else
+ pandn m0, [esp+3*16], m2
+ pandn m2, [esp+6*16], m0
+ SWAP 2, 0
+ pandn m2, m1, [esp+6*16]
+ mova %%flat8mem, m2
+%endif
+%elif %1 != 4
+ %if ARCH_X86_64
+ SWAP 1, 9
+ %else
+ %define m12 m7
+ mova m12, maskmem
+ mova m1, [esp+6*16]
+ %endif
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+ pandn m1, m2
+ pandn m2, m0, m1 ; flat8 & fm
+ pand m1, m12, mask0
+ pcmpeqd m1, m12
+ pandn m0, m1
+ pandn m1, m2, m0 ; fm & !flat8
+ SWAP 1, 2, 0
+ %if ARCH_X86_64
+ SWAP 1, 9
+ %else
+ mova %%flat8mem, m1
+ %endif
+%else
+%if ARCH_X86_32
+ %define m12 m1
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask0
+ pcmpeqd m2, m12
+ pandn m0, m2 ; fm
+%endif
+
+ ; short filter
+
+ mova m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+ SWAP 7, 11
+%else
+ mova m7, [esp+5*16]
+%endif
+ pxor m3, m1
+ pxor m6, m1
+ pxor m4, m1
+ pxor m5, m1
+ psubsb m1, m3, m6 ; iclip_diff(p1-q1)
+ pand m1, m7 ; f=iclip_diff(p1-q1)&hev
+ psubsb m2, m5, m4
+ paddsb m1, m2
+ paddsb m1, m2
+ paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f)
+ mova m2, [PIC_sym(pb_16)]
+ pand m0, m1 ; f&=fm
+ paddsb m1, m0, [PIC_sym(pb_3)]
+ paddsb m0, [PIC_sym(pb_4)]
+ pand m1, [PIC_sym(pb_248)]
+ pand m0, [PIC_sym(pb_248)]
+ psrlq m1, 3
+ psrlq m0, 3
+ pxor m1, m2
+ pxor m0, m2
+ psubb m1, m2 ; f2
+ psubb m0, m2 ; f1
+ mova m2, [PIC_sym(pb_128)]
+ paddsb m4, m1
+ psubsb m5, m0
+ pxor m4, m2
+ pxor m5, m2
+
+ pxor m0, m2
+ pxor m1, m1
+ pavgb m0, m1 ; f=(f1+1)>>1
+ psubb m0, [PIC_sym(pb_64)]
+ pandn m7, m0 ; f&=!hev
+ paddsb m3, m7
+ psubsb m6, m7
+ pxor m3, m2
+ pxor m6, m2
+
+%if %1 == 16
+ ; flat16 filter
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1] ; p6
+ mova m2, [tmpq+strideq*2] ; p5
+ mova m7, [tmpq+stride3q] ; p4
+%else
+ mova m0, [rsp+12*16]
+ mova m2, [rsp+13*16]
+ mova m7, [rsp+14*16]
+%endif
+
+%if ARCH_X86_64
+ SWAP 1, 10
+ mova %%flat8mem, m9
+ mova %%q2mem, m14
+ mova %%q3mem, m15
+ SWAP 0, 8
+ SWAP 1, 9
+%else
+ %ifidn %2, v
+ mova [esp+17*16], m0
+ mova [esp+19*16], m3
+ mova [esp+21*16], m4
+ mova [esp+22*16], m5
+ mova [esp+23*16], m6
+ %xdefine m11 m3
+ %xdefine m14 m4
+ %xdefine m15 m5
+ %xdefine m10 m6
+ %define m13 %%p2mem
+ %define m8 [esp+17*16]
+ %define m9 %%flat16mem
+ %define m3 [esp+19*16]
+ %define m4 [esp+21*16]
+ %define m5 [esp+22*16]
+ %define m6 [esp+23*16]
+ %else
+ mova [esp+31*16], m0
+ mova [esp+32*16], m3
+ mova [esp+33*16], m4
+ mova [esp+34*16], m5
+ mova [esp+35*16], m6
+ %xdefine m11 m3
+ %xdefine m14 m4
+ %xdefine m15 m5
+ %xdefine m10 m6
+ %define m13 %%p2mem
+ %define m8 [esp+31*16]
+ %define m9 %%flat16mem
+ %define m3 [esp+32*16]
+ %define m4 [esp+33*16]
+ %define m5 [esp+34*16]
+ %define m6 [esp+35*16]
+ %endif
+%endif
+
+ ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
+ ; write -6
+ mova m11, %%p3mem
+%if ARCH_X86_64
+ punpcklbw m14, m8, m11
+ punpckhbw m15, m8, m11
+%else
+ punpcklbw m14, m0, m11
+ punpckhbw m15, m0, m11
+%endif
+%ifidn %2, v
+ mova [rsp+5*16], m11
+%endif
+ pmaddubsw m10, m14, [PIC_sym(pb_7_1)]
+ pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3
+ punpcklbw m0, m2, m7
+ punpckhbw m1, m2, m7
+ pmaddubsw m0, [PIC_sym(pb_2)]
+ pmaddubsw m1, [PIC_sym(pb_2)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3
+ punpcklbw m0, m13, m3
+ punpckhbw m1, m13, m3
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1
+ punpcklbw m0, m4, m5
+ punpckhbw m1, m4, m5
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m2
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m0 ; p5
+%else
+ mova [rsp+13*16], m0
+%endif
+
+ ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
+ ; write -5
+ pmaddubsw m14, [PIC_sym(pb_m1_1)]
+ pmaddubsw m15, [PIC_sym(pb_m1_1)]
+ paddw m10, m14
+ paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m0, m8, m6
+ punpckhbw m1, m8, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+3*16], m0
+ mova [rsp+4*16], m1
+ paddw m10, m0
+ paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m7
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+stride3q], m0 ; p4
+%else
+ mova [rsp+14*16], m0
+%endif
+
+ ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
+ ; write -4
+ mova m14, %%q2mem
+ punpcklbw m0, m8, m13
+ punpckhbw m1, m8, m13
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+ punpcklbw m0, m2, m14
+ punpckhbw m2, m14
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m2, [PIC_sym(pb_m1_1)]
+ mova [rsp+1*16], m0
+ paddw m10, m0
+ paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, %%p3mem
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*4], m0 ; p3
+%else
+ mova [rsp+19*16], m0
+%endif
+
+ ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
+ ; write -3
+ mova m15, %%q3mem
+ punpcklbw m0, m8, m3
+ punpckhbw m1, m8, m3
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+ punpcklbw m0, m7, m15
+ punpckhbw m7, m15
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ mova [rsp+2*16], m0
+%if ARCH_X86_32
+ %ifidn %2, v
+ mova [esp+24*16], m7
+ %else
+ mova [esp+36*16], m7
+ %endif
+%endif
+ paddw m10, m0
+ paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m13
+ por m0, m1
+ mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F
+
+ ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
+ ; write -2
+ punpcklbw m0, m8, m4
+ punpckhbw m1, m8, m4
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%if ARCH_X86_64
+ SWAP 7, 8
+%endif
+%ifidn %2, v
+ mova m1, [dstq+strideq*4] ; q4
+ mova m7, [rsp+5*16] ; (pre-filter) p3
+%else
+ mova m1, [rsp+15*16]
+ mova m7, %%p3mem ; (pre-filter) p3
+%endif
+ punpcklbw m0, m1, m7
+ punpckhbw m1, m1, m7
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+7*16], m0
+ mova [rsp+5*16], m1
+ psubw m10, m0
+ psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m3
+ por m0, m1
+ mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G
+
+ ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
+ ; write -1
+%ifidn %2, v
+ mova m7, [tmpq+strideq*1] ; p6
+ lea tmpq, [dstq+strideq*4]
+ mova m1, [tmpq+strideq*1] ; q5
+%else
+ mova m7, [rsp+12*16] ; p6
+ mova m1, [rsp+16*16]
+%endif
+ punpcklbw m0, m7, m5
+ punpckhbw m7, m5
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m7, m13, m1
+ pmaddubsw m7, [PIC_sym(pb_m1_1)]
+ mova [rsp+9*16], m7
+ paddw m10, m7
+%if ARCH_X86_64
+ punpckhbw m13, m1
+ mova m1, [rsp+6*16]
+ SWAP 1, 13
+%else
+ punpckhbw m7, m13, m1
+ mova m1, [esp+6*16]
+ mova m13, m1
+ SWAP 1, 7
+%endif
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+10*16], m1
+ paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+ pmulhrsw m7, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m0, m11, [PIC_sym(pw_2048)]
+ packuswb m7, m0
+ pand m7, m9
+ pandn m0, m9, m4
+ por m7, m0
+ mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H
+
+ ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
+ ; write +0
+%ifidn %2, v
+ mova m7, [tmpq+strideq*2] ; q6
+%else
+ mova m7, [rsp+17*16]
+%endif
+ paddw m10, [rsp+3*16]
+ paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
+ punpcklbw m0, m3, m7
+ punpckhbw m1, m3, m7
+%if ARCH_X86_64
+ mova m3, [rsp+8*16]
+%endif
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ mova [rsp+3*16], m0
+ mova [rsp+4*16], m1
+ paddw m10, m0
+ paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m5
+ por m0, m1
+%if ARCH_X86_32
+ mova m1, [esp+8*16]
+ mova m3, m1
+%endif
+ mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I
+
+ ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
+ ; write +1
+ paddw m10, [rsp+1*16]
+ paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
+ punpcklbw m0, m4, m7
+ punpckhbw m2, m4, m7
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m2, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
+%if ARCH_X86_64
+ mova m4, [rsp+6*16]
+%else
+ %define m4 [esp+6*16]
+%endif
+ pmulhrsw m2, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m2, m1
+ pand m2, m9
+ pandn m1, m9, m6
+ por m2, m1 ; don't clobber q1/m6 since we need it in K
+
+ ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
+ ; write +2
+ paddw m10, [rsp+2*16]
+%if ARCH_X86_64
+ SWAP 7, 8
+ paddw m11, m7
+%else
+ mova m8, m7
+ %ifidn %2, v
+ paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %else
+ paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %endif
+%endif
+ punpcklbw m0, m5, m8
+ punpckhbw m1, m5, m8
+%if ARCH_X86_64
+ mova m5, [rsp+8*16]
+%else
+ %define m5 [esp+8*16]
+%endif
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m7, m1
+ pand m7, m9
+ pandn m1, m9, m14
+ por m7, m1 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ psubw m10, [rsp+7*16]
+ psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ punpcklbw m0, m6, m8
+ punpckhbw m1, m6, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m15
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m0 ; q3
+%else
+ mova [rsp+20*16], m0
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, [rsp+ 9*16]
+ paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ punpcklbw m0, m14, m8
+ punpckhbw m1, m14, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+%ifidn %2, v
+ pandn m1, m9, [tmpq+strideq*0]
+%else
+ pandn m1, m9, [rsp+15*16]
+%endif
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m0 ; q4
+%else
+ mova [rsp+15*16], m0
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, [rsp+3*16]
+ paddw m11, [rsp+4*16] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ punpcklbw m0, m15, m8
+ punpckhbw m1, m15, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, [PIC_sym(pw_2048)]
+ pmulhrsw m11, [PIC_sym(pw_2048)]
+ packuswb m10, m11
+ pand m10, m9
+%ifidn %2, v
+ pandn m11, m9, [tmpq+strideq*1]
+%else
+ pandn m11, m9, [rsp+16*16]
+%endif
+ por m10, m11
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+16*16], m10
+%endif
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 14, 7
+%else
+ %xdefine m3 m11
+ %xdefine m4 m14
+ %xdefine m5 m15
+ %xdefine m6 m10
+ mova %%q2mem, m7
+ %ifidn %2, v
+ mova m3, [esp+19*16]
+ %else
+ mova m3, [esp+32*16]
+ %endif
+ mova m4, [esp+ 6*16]
+ mova m5, [esp+ 8*16]
+%endif
+ SWAP m6, m2
+
+%if ARCH_X86_64
+ mova m9, %%flat8mem
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%endif ; if %1 == 16
+%if %1 >= 8
+ ; flat8 filter
+%if ARCH_X86_32
+ %define m9 %%flat8mem
+ %define m11 m1
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+ mova m11, %%p3mem
+ punpcklbw m0, m11, m3
+ punpcklbw m7, m13, m4
+ pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ pmaddubsw m7, [PIC_sym(pb_2_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m7, m5, [PIC_sym(pb_4)]
+ pmaddubsw m7, [PIC_sym(pb_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ punpckhbw m1, m11, m3
+ pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ punpckhbw m0, m13, m4
+ pmaddubsw m0, [PIC_sym(pb_2_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpckhbw m0, m5, [PIC_sym(pb_4)]
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m13
+ por m0, m1 ; p2
+%ifidn %2, v
+ mova [tmpq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 10
+ %else
+ mova [esp+2*16], m0
+ %endif
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m3
+ por m0, m1 ; p1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m0
+%else
+ mova [rsp+0*16], m0
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m4, m14
+ punpckhbw m1, m4, m14
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m4
+ por m0, m1 ; p0
+%ifidn %2, v
+ mova [tmpq+stride3q], m0
+%else
+ mova [rsp+1*16], m0
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m4
+ punpckhbw m11, m11, m4
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m11, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m0, m2, 3
+ psrlw m11, m7, 3
+ packuswb m0, m11
+ pand m0, m9
+ pandn m11, m9, m5
+ por m11, m0 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%elif ARCH_X86_32
+ mova [esp+8*16], m11
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m6
+ por m0, m1 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 13
+ %else
+ mova [esp+9*16], m0
+ %endif
+%endif
+
+ punpcklbw m0, m3, m6
+ punpckhbw m1, m3, m6
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m14, m15
+ punpckhbw m1, m14, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+ pand m2, m9
+ pandn m7, m9, m14
+ por m2, m7 ; q2
+%ifidn %2, v
+ mova [dstq+strideq*2], m2
+%else
+ mova m0, [rsp+0*16]
+%if %1 == 8
+ mova m1, [rsp+1*16]
+ mova m4, %%p3mem
+
+%if ARCH_X86_32
+ %define m10 [esp+2*16]
+ %define m11 [esp+8*16]
+ %define m13 [esp+9*16]
+%endif
+
+ ; 16x8 transpose
+ punpcklbw m3, m4, m10
+ punpckhbw m4, m10
+ punpcklbw m5, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m11, m13
+ punpckhbw m6, m11, m13
+ punpcklbw m7, m2, m15
+ punpckhbw m2, m15
+%if ARCH_X86_64
+ SWAP 2, 15
+%else
+ mova m15, m2
+%endif
+
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m7
+ punpckhwd m1, m7
+ punpcklwd m7, m6, m15
+ punpckhwd m6, m15
+%if ARCH_X86_64
+ SWAP 6, 15
+%else
+ mova m15, m6
+%endif
+
+ punpckldq m6, m2, m0
+ punpckhdq m2, m0
+ punpckldq m0, m3, m1
+ punpckhdq m3, m1
+ punpckldq m1, m5, m7
+ punpckhdq m5, m7
+ punpckldq m7, m4, m15
+ punpckhdq m4, m15
+
+ ; write 8x16
+ movq [dstq+strideq*0-4], xm6
+ movhps [dstq+strideq*1-4], xm6
+ movq [dstq+strideq*2-4], xm2
+ movhps [dstq+stride3q -4], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm0
+ movhps [dstq+strideq*1-4], xm0
+ movq [dstq+strideq*2-4], xm3
+ movhps [dstq+stride3q -4], xm3
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm1
+ movhps [dstq+strideq*1-4], xm1
+ movq [dstq+strideq*2-4], xm5
+ movhps [dstq+stride3q -4], xm5
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0-4], xm7
+ movhps [dstq+strideq*1-4], xm7
+ movq [dstq+strideq*2-4], xm4
+ movhps [dstq+stride3q -4], xm4
+ lea dstq, [dstq+strideq*4]
+%else
+ ; 16x16 transpose and store
+ SWAP 6, 0
+ SWAP 7, 1
+ %if ARCH_X86_64
+ SWAP 5, 10, 2
+ SWAP 8, 11
+ SWAP 9, 13
+ mova [rsp+21*16], m12
+ %else
+ mova [esp+10*16], m2
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ %endif
+ mova m0, [rsp+11*16]
+ mova m1, [rsp+12*16]
+ mova m2, [rsp+13*16]
+ mova m3, [rsp+14*16]
+ mova m4, [rsp+19*16]
+%if ARCH_X86_64
+ mova m7, [rsp+ 1*16]
+ mova m11, [rsp+20*16]
+ mova m12, [rsp+15*16]
+ mova m13, [rsp+16*16]
+ mova m14, [rsp+17*16]
+ TRANSPOSE_16X16B 1, [rsp+18*16]
+%else
+ mova m5, [esp+ 2*16]
+ TRANSPOSE_16X16B 1, [esp+32*16]
+ mov tmpq, dstq
+ lea dstq, [dstq+strideq*8]
+%endif
+ movu [dstq+strideq*0-8], xm0
+ movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*2-8], xm2
+ movu [dstq+stride3q -8], xm3
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm4
+ movu [dstq+strideq*1-8], xm5
+ movu [dstq+strideq*2-8], xm6
+ movu [dstq+stride3q -8], xm7
+%if ARCH_X86_64
+ lea dstq, [dstq+strideq*4]
+%else
+ %xdefine m8 m0
+ %xdefine m9 m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+ mova m8, [esp+11*16]
+ mova m9, [esp+12*16]
+ mova m10, [esp+13*16]
+ mova m11, [esp+14*16]
+ mova m12, [esp+26*16]
+ mova m13, [esp+27*16]
+ mova m14, [esp+ 0*16]
+ mova m15, [esp+ 1*16]
+ mov dstq, tmpq
+%endif
+ movu [dstq+strideq*0-8], xm8
+ movu [dstq+strideq*1-8], xm9
+ movu [dstq+strideq*2-8], xm10
+ movu [dstq+stride3q -8], xm11
+ lea dstq, [dstq+strideq*4]
+ movu [dstq+strideq*0-8], xm12
+ movu [dstq+strideq*1-8], xm13
+ movu [dstq+strideq*2-8], xm14
+ movu [dstq+stride3q -8], xm15
+ lea dstq, [dstq+strideq*4]
+%if ARCH_X86_32
+ lea dstq, [dstq+strideq*8]
+%else
+ mova m12, [rsp+21*16]
+%endif
+
+%endif ; if %1 == 8
+%endif ; ifidn %2, v
+%elif %1 == 6
+ ; flat6 filter
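+ ; flat6 produces p1/p0/q0/q1 from the p2..q2 neighborhood: each output is
+ ; an 8-weight pmaddubsw sum rounded via pmulhrsw with pw_4096 (equivalent
+ ; to (sum + 4) >> 3) and blended with the original pixel through the flat
+ ; mask in m9 (%%flat8mem).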
+%if ARCH_X86_32
+ mova [esp+3*16], m3
+ mova [esp+4*16], m4
+ mova [esp+5*16], m5
+ mova [esp+6*16], m6
+ %xdefine m8 m3
+ %xdefine m10 m4
+ %xdefine m11 m5
+ %xdefine m15 m6
+ %define m3 [esp+3*16]
+ %define m4 [esp+4*16]
+ %define m5 [esp+5*16]
+ %define m6 [esp+6*16]
+ %define m9 %%flat8mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+%endif
+
+ punpcklbw m8, m13, m5
+ punpckhbw m11, m13, m5
+ pmaddubsw m0, m8, [PIC_sym(pb_3_1)]
+ pmaddubsw m1, m11, [PIC_sym(pb_3_1)]
+ punpcklbw m7, m4, m3
+ punpckhbw m10, m4, m3
+ pmaddubsw m2, m7, [PIC_sym(pb_2)]
+ pmaddubsw m15, m10, [PIC_sym(pb_2)]
+ paddw m0, m2
+ paddw m1, m15
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m3
+ por m2, m15
+%ifidn %2, v
+ mova [tmpq+strideq*2], m2 ; p1
+%elif ARCH_X86_32
+ mova [esp+11*16], m2
+%endif
+
+ pmaddubsw m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m13, m6
+ punpckhbw m11, m13, m6
+%if ARCH_X86_64
+ SWAP 2, 13
+%endif
+ pmaddubsw m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m8
+ paddw m1, m11
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m4
+ por m2, m15
+%ifidn %2, v
+ mova [tmpq+stride3q], m2 ; p0
+%elif ARCH_X86_32
+ mova [esp+8*16], m2
+%endif
+
+ paddw m0, m8
+ paddw m1, m11
+ punpcklbw m8, m3, m14
+ punpckhbw m11, m3, m14
+%if ARCH_X86_64
+ SWAP 2, 14
+%endif
+ pmaddubsw m2, m8, [PIC_sym(pb_m1_1)]
+ pmaddubsw m15, m11, [PIC_sym(pb_m1_1)]
+ paddw m0, m2
+ paddw m1, m15
+ pmulhrsw m2, m0, [PIC_sym(pw_4096)]
+ pmulhrsw m15, m1, [PIC_sym(pw_4096)]
+ packuswb m2, m15
+ pand m2, m9
+ pandn m15, m9, m5
+ por m2, m15
+%ifidn %2, v
+ mova [dstq+strideq*0], m2 ; q0
+%endif
+
+ pmaddubsw m8, [PIC_sym(pb_m1_2)]
+ pmaddubsw m11, [PIC_sym(pb_m1_2)]
+ paddw m0, m8
+ paddw m1, m11
+ pmaddubsw m7, [PIC_sym(pb_m1_0)]
+ pmaddubsw m10, [PIC_sym(pb_m1_0)]
+ paddw m0, m7
+ paddw m1, m10
+ pmulhrsw m0, [PIC_sym(pw_4096)]
+ pmulhrsw m1, [PIC_sym(pw_4096)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m6
+ por m0, m1
+%if ARCH_X86_32
+ %xdefine m3 m8
+ %xdefine m4 m10
+ %xdefine m5 m11
+ %xdefine m6 m15
+%endif
+%ifidn %2, v
+ mova [dstq+strideq*1], m0 ; q1
+%else
+ %if ARCH_X86_64
+ SWAP 3, 13
+ SWAP 4, 14
+ %else
+ mova m3, [esp+11*16]
+ mova m4, [esp+ 8*16]
+ %endif
+ SWAP 5, 2
+ SWAP 6, 0
+ TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%else ; if %1 == 4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m3 ; p1
+ mova [tmpq+strideq*1], m4 ; p0
+ mova [tmpq+strideq*2], m5 ; q0
+ mova [tmpq+stride3q ], m6 ; q1
+%else
+ TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
+%endif
+%endif
+%if ARCH_X86_32
+ %define m12 m12reg
+%endif
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 32-bit PIC helpers ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
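+; On x86-32 all RODATA constants are addressed relative to PIC_reg (r2):
+; SETUP_PIC loads it with the section base ($$) so the code also works in
+; position-independent builds, and PIC_sym(sym) rebases each reference onto
+; it. Since r2 doubles as the mask pointer inside the filter loops,
+; XCHG_PIC_REG spills/reloads the PIC base through a stack slot. On x86-64
+; PIC_sym expands to a plain symbol reference.
+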
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 0 ; PIC_reg
+ %define PIC_reg r2
+ %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
+ LEA PIC_reg, $$
+ %endmacro
+
+ %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
+ %if %1 == 0
+ mov [esp+PIC_reg_stk_offset], PIC_reg
+ mov PIC_reg, maskm
+ %else
+ mov PIC_reg, [esp+PIC_reg_stk_offset]
+ %endif
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 1
+ %endmacro
+ %define PIC_sym(sym) (sym)
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 1
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
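+; When the incoming stack alignment is insufficient (copy_args), the mask,
+; lut and w/h arguments are copied into slots at the top of the aligned
+; stack frame (l_stride gets a slot as well) so they stay addressable after
+; realignment; otherwise %1m simply aliases r6m.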
+%macro RELOC_ARGS 1
+ %if copy_args
+ %define maskm [esp+stack_size-gprsize*1]
+ %define l_stridem [esp+stack_size-gprsize*2]
+ %define lutm [esp+stack_size-gprsize*3]
+ %define %1m [esp+stack_size-gprsize*4]
+ mov r6d, r6m
+ mov maskm, maskd
+ mov lutm, lutd
+ mov %1m, r6d
+ %else
+ %define %1m r6m
+ %endif
+%endmacro
+
+%if ARCH_X86_32
+ %define tmpq r4
+ %define mstrideq r5
+ %define stride3q r6
+ %define l_stride3q r6
+%endif
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS w
+ SETUP_PIC
+ %define m12 m5
+%endif
+ shl l_strideq, 2
+ sub lq, l_strideq
+%if ARCH_X86_64
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movu m0, [maskq]
+ pxor m4, m4
+ movd m3, [lutq+136]
+ pshufb m3, m4
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m1, m2
+ por m0, m1
+ mova [rsp+11*16], m0
+ mova [rsp+12*16], m1
+ mova [rsp+13*16], m2
+ mova [rsp+14*16], m3
+
+%define maskmem [esp+15*16]
+%define mask0 [rsp+11*16]
+%define mask1 [rsp+12*16]
+%define mask2 [rsp+13*16]
+%define minlvl [rsp+14*16]
+
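+; Each iteration handles a group of 4 columns: the vmask[2]/vmask[1]/vmask[0]
+; bits select the FILTER 16, FILTER 8 or FILTER 4 path, after which mask_bits
+; and the lane mask in m12 are shifted left by 4 and dst/l advance to the
+; next group.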
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ je .no_flat16
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+25*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_32
+ mova m12, maskmem
+ mov mask_bitsd, [esp+25*16]
+%endif
+.no_filter:
+ pslld m12, 4
+ shl mask_bitsd, 4
+ add lq, 16
+ add dstq, 16
+%if ARCH_X86_64
+ sub wd, 4
+%else
+ sub dword wm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS h
+ SETUP_PIC
+ %define m12 m5
+%endif
+ sub lq, 4
+ shl l_strideq, 2
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movu m0, [maskq]
+ pxor m4, m4
+ movd m3, [lutq+136]
+ pshufb m3, m4
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m1, m2
+ por m0, m1
+ mova [rsp+22*16], m0
+ mova [rsp+23*16], m1
+ mova [rsp+24*16], m2
+ mova [rsp+25*16], m3
+
+%define maskmem [esp+37*16]
+%define mask0 [rsp+22*16]
+%define mask1 [rsp+23*16]
+%define mask2 [rsp+24*16]
+%define minlvl [rsp+25*16]
+
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ je .no_flat16
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 16, h
+ jmp .end
+
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 8, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+38*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+ jmp .end_noload
+.end:
+ mova m12, maskmem
+ mov l_strideq, l_stridem
+ mov mask_bitsd, [esp+38*16]
+.end_noload:
+%else
+.end:
+%endif
+ lea lq, [lq+l_strideq*4]
+ pslld m12, 4
+ shl mask_bitsd, 4
+%if ARCH_X86_64
+ sub hd, 4
+%else
+ sub dword hm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS w
+ SETUP_PIC
+ %define m12 m4
+%endif
+ shl l_strideq, 2
+ sub lq, l_strideq
+%if ARCH_X86_64
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_32
+ mova m12, maskmem
+ mov mask_bitsd, [esp+11*16]
+%endif
+.no_filter:
+ pslld m12, 4
+ shl mask_bitsd, 4
+ add lq, 16
+ add dstq, 16
+%if ARCH_X86_64
+ sub wd, 4
+%else
+ sub dword wm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS h
+ SETUP_PIC
+ %define m12 m4
+%endif
+ sub lq, 4
+ shl l_strideq, 2
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+ jmp .end_noload
+.end:
+ mova m12, maskmem
+ mov l_strided, l_stridem
+ mov mask_bitsd, [esp+12*16]
+.end_noload:
+%else
+.end:
+%endif
+ lea lq, [lq+l_strideq*4]
+ pslld m12, 4
+ shl mask_bitsd, 4
+%if ARCH_X86_64
+ sub hd, 4
+%else
+ sub dword hm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
diff --git a/third_party/dav1d/src/x86/looprestoration.h b/third_party/dav1d/src/x86/looprestoration.h
new file mode 100644
index 0000000000..de23be8866
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#include "common/intops.h"
+
+#define decl_wiener_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
+decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
+
+#define decl_sgr_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))
+
+decl_wiener_filter_fns(sse2);
+decl_wiener_filter_fns(ssse3);
+decl_wiener_filter_fns(avx2);
+decl_wiener_filter_fns(avx512icl);
+decl_sgr_filter_fns(ssse3);
+decl_sgr_filter_fns(avx2);
+decl_sgr_filter_fns(avx512icl);
+
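+/* Function pointers are assigned in order of increasing ISA level and each
+ * check returns early when the corresponding CPU flag is missing, so the
+ * fastest available implementation is the one left in the table. SSE2 only
+ * provides the 8-bit Wiener filters, and SGR is only enabled for 8 and
+ * 10 bpc content. */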
+static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+#if BITDEPTH == 8
+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
+ }
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl);
+#if BITDEPTH == 8
+ /* With VNNI we don't need a 5-tap version. */
+ c->wiener[1] = c->wiener[0];
+#else
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl);
+#endif
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl);
+ }
+#endif
+}
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
new file mode 100644
index 0000000000..ef25c28474
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
@@ -0,0 +1,2540 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+
+wiener_hshift: dw 4, 4, 1, 1
+wiener_vshift: dw 1024, 1024, 4096, 4096
+wiener_round: dd 1049600, 1048832
+
+pb_m10_m9: times 2 db -10, -9
+pb_m6_m5: times 2 db -6, -5
+pb_m2_m1: times 2 db -2, -1
+pb_2_3: times 2 db 2, 3
+pb_6_7: times 2 db 6, 7
+pw_1023: times 2 dw 1023
+pd_8: dd 8
+pd_25: dd 25
+pd_4096: dd 4096
+pd_34816: dd 34816
+pd_m262128: dd -262128
+pd_0xf00800a4: dd 0xf00800a4
+pd_0xf00801c7: dd 0xf00801c7
+
+%define pw_256 sgr_lshuf5
+
+cextern sgr_x_by_x_avx2
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
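+; t0-t6 form a ring of pointers to the horizontally filtered rows kept in
+; the stack buffer (384*2 bytes per row); the .hv passes rotate them after
+; each output row so t1 always points at the newest intermediate row and
+; the highest-numbered pointer at the oldest.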
+
+INIT_YMM avx2
+cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastd m12, [fltq+ 0] ; x0 x1
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufB]
+ add wd, wd
+ vpbroadcastd m13, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add lpfq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ add dstq, wq
+ vbroadcasti128 m8, [wiener_shufC]
+ lea t1, [rsp+wq+16]
+ vbroadcasti128 m9, [wiener_shufD]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ vpbroadcastd m10, [base+wiener_round+t3*4]
+ vpbroadcastd m11, [base+wiener_vshift+t3*4]
+ pmullw m12, m0 ; upshift filter coefs to make the
+ pmullw m13, m0 ; horizontal downshift constant
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
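+ ; no pixels are available past the right edge (LR_HAVE_RIGHT unset), so
+ ; build pshufb masks from the remaining width in r10, clamped against
+ ; pb_0to31, and use them to pad the tails of m3-m5 with the rightmost
+ ; valid pixels.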
+ movd xm1, r10d
+ vpbroadcastd m0, [pb_6_7]
+ movu m2, [pb_0to31]
+ vpbroadcastb m1, xm1
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m3, m0
+ vpbroadcastd m0, [pb_m2_m1]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m4, m0
+ vpbroadcastd m0, [pb_m10_m9]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m5, m0
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm3, [leftq]
+ vpblendd m3, [lpfq+r10-8], 0xfc
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ shufpd m3, m4, 0x05
+ pshufb m3, [wiener_lshuf7]
+ jmp .h_main2
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-8]
+.h_main:
+ mova m4, [lpfq+r10+0]
+.h_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18)
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm3, [leftq]
+ vpblendd m3, [lpfq+r10-8], 0xfc
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ movu m3, [lpfq+r10-8]
+ pshufb m3, [wiener_lshuf7]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-8]
+.hv_main:
+ mova m4, [lpfq+r10+0]
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -36
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ vpbroadcastd m2, [pd_m262128]
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m5, [t3+r10]
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova m4, [t5+r10]
+ paddw m4, [t1+r10]
+ psraw m0, 1
+ paddw m3, m0, [t6+r10]
+ mova [t0+r10], m0
+ punpcklwd m0, m2, m5
+ pmaddwd m0, m15
+ punpckhwd m2, m5
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m4
+ pmaddwd m1, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m2, m10
+ paddd m0, m1
+ paddd m2, m3
+ psrad m0, 5
+ psrad m2, 5
+ packusdw m0, m2
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m1, [t4+r10]
+ paddw m1, [t2+r10]
+ mova m2, [t3+r10]
+ mova m4, [t1+r10]
+ paddw m3, m4, [t6+r10]
+ paddw m4, [t5+r10]
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m15
+ punpckhwd m1, m2
+ pmaddwd m1, m15
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m1, m10
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m5, [wiener_shufE]
+ vpbroadcastw m11, [fltq+ 2] ; x1
+ vbroadcasti128 m6, [wiener_shufB]
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufD]
+ add wd, wd
+ vpbroadcastd m12, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ vpbroadcastw m13, [fltq+18] ; y1
+ add dstq, wq
+ vpbroadcastd m14, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq+16]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ vpbroadcastd m9, [base+wiener_round+t3*4]
+ vpbroadcastd m10, [base+wiener_vshift+t3*4]
+ movu xm15, [wiener_lshuf5]
+ pmullw m11, m0
+ vinserti128 m15, [pb_0to31], 1
+ pmullw m12, m0
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.extend_right:
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_2_3]
+ vpbroadcastd m1, [pb_m6_m5]
+ vpbroadcastb m2, xm2
+ psubb m0, m2
+ psubb m1, m2
+ movu m2, [pb_0to31]
+ pminub m0, m2
+ pminub m1, m2
+ pshufb m3, m0
+ pshufb m4, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm3, [leftq+4]
+ vpblendd m3, [lpfq+r10-4], 0xfe
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located
+ mova m3, [lpfq+r10] ; before the start of the buffer
+ palignr m3, m4, 12
+ pshufb m3, m15
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-4]
+.h_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm3, [leftq+4]
+ vpblendd m3, [lpfq+r10-4], 0xfe
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ movu m3, [lpfq+r10-4]
+ pshufb m3, m15
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-4]
+.hv_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ mova m2, [t3+r10]
+ paddw m2, [t1+r10]
+ paddd m1, m3
+ mova m4, [t2+r10]
+ punpckhwd m3, m2, m4
+ pmaddwd m3, m14
+ punpcklwd m2, m4
+ mova m4, [t4+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ pmaddwd m2, m14
+ psraw m0, 1
+ mova [t0+r10], m0
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m0, [t1+r10]
+ paddw m2, m0, [t3+r10]
+ mova m1, [t2+r10]
+ mova m4, [t4+r10]
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
+
+cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ movifnidn hd, hm
+ mov edged, r7m
+ add wd, wd
+ vpbroadcastw m7, [paramsq+8] ; w0
+ add lpfq, wq
+ vpbroadcastd m8, [pd_8]
+ add dstq, wq
+ vpbroadcastd m9, [pd_25]
+ lea t3, [rsp+wq*2+400*12+16]
+ vpbroadcastd m10, [paramsq+0] ; s0
+ lea t4, [rsp+wq+400*20+16]
+ vpbroadcastd m11, [pd_0xf00800a4]
+ lea t1, [rsp+wq+20]
+ mova xm12, [sgr_lshuf5]
+ neg wq
+ vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15)
+ pxor m6, m6
+ vpbroadcastd m14, [pw_1023]
+ psllw m7, 4
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+ vpbroadcastw m0, [lpfq-2]
+ movu m1, [r13+r10+ 0]
+ movu m2, [r13+r10+16]
+ vpblendvb m4, m0, m1
+ vpblendvb m5, m0, m2
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10- 2]
+.h_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ shufpd m5, m4, m5, 0x05
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10+400*0]
+ paddd m1, [t1+r10+400*2]
+ paddd m2, [t1+r10+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+r10+400*0], m0
+ mova [t1+r10+400*2], m1
+ mova [t1+r10+400*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-4]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10+400*0]
+ mova m1, [t1+r10+400*2]
+ mova m2, [t1+r10+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m1
+ mova [t2+r10+400*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
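+ ; per 32-byte block: m1 accumulates the 5x5 pixel sum (b) and m4/m5 the
+ ; sums of squares (a); p = max(a*25 - b*b, 0) scaled by the strength s
+ ; yields z, which indexes sgr_x_by_x_avx2 through vpgatherdd to obtain x,
+ ; and the stored b output becomes (x * b * 164 + (1 << 11) + (1 << 15)) >> 12,
+ ; with x written to t4 and b to t3.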
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv_main
+.hv_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10- 2]
+.hv_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -36
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ shufpd m5, m4, m5, 0x05
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+r10+400*0]
+ paddd m4, m2, [t1+r10+400*2]
+ paddd m5, m3, [t1+r10+400*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10+400*0] ; hv sum
+ paddd m4, [t2+r10+400*2] ; hv sumsq
+ paddd m5, [t2+r10+400*4]
+ mova [t0+r10+400*0], m0
+ mova [t0+r10+400*2], m2
+ mova [t0+r10+400*4], m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ punpcklwd m2, m3, m6
+ psrld m5, 4
+ punpckhwd m3, m6
+ pmulld m4, m9 ; a * 25
+ pmulld m5, m9
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ pmaxud m5, m3
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m10 ; p * s
+ pmulld m5, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+r10+4], m2
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+r10*2+ 8], xm0
+ vextracti128 [t3+r10*2+40], m0, 1
+ mova [t3+r10*2+24], xm1
+ vextracti128 [t3+r10*2+56], m1, 1
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10+400*0], m1
+ paddw m1, m0
+ mova [t1+r10+400*2], m4
+ paddd m4, m2
+ mova [t1+r10+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-4]
+.v_loop:
+ mova m0, [t1+r10+400*0]
+ mova m2, [t1+r10+400*2]
+ mova m3, [t1+r10+400*4]
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m4, m2, [t2+r10+400*2]
+ paddd m5, m3, [t2+r10+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ punpcklwd m2, m3, m6
+ psrld m5, 4
+ punpckhwd m3, m6
+ pmulld m4, m9 ; a * 25
+ pmulld m5, m9
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ pmaxud m5, m3
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m10 ; p * s
+ pmulld m5, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+r10+4], m2
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+r10*2+ 8], xm0
+ vextracti128 [t3+r10*2+40], m0, 1
+ mova [t3+r10*2+24], xm1
+ vextracti128 [t3+r10*2+56], m1, 1
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+36]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m4, m1, [t3+r10*2+ 0]
+ paddd m5, m2, [t3+r10*2+32]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m4, [t3+r10*2+ 8]
+ paddd m5, [t3+r10*2+40]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+r10*1+400*2+ 0], m0
+ mova [t3+r10*2+400*4+ 0], m1
+ mova [t3+r10*2+400*4+32], m2
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+36]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m4, m1, [t3+r10*2+ 0]
+ paddd m5, m2, [t3+r10*2+32]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m4, [t3+r10*2+ 8]
+ paddd m5, [t3+r10*2+40]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+r10*1+400*2+ 0]
+ paddd m4, m1, [t3+r10*2+400*4+ 0]
+ paddd m5, m2, [t3+r10*2+400*4+32]
+ mova [t4+r10*1+400*2+ 0], m0
+ mova [t3+r10*2+400*4+ 0], m1
+ mova [t3+r10*2+400*4+32], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m0, [dstq+r10]
+ mova m3, [t4+r10*1+400*2+ 0]
+ mova m4, [t3+r10*2+400*4+ 0]
+ mova m5, [t3+r10*2+400*4+32]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 7)
+ psubd m4, m3
+ psrad m1, 8
+ psrad m4, 8
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ add wd, wd
+ movifnidn hd, hm
+ mov edged, r7m
+ add lpfq, wq
+ vpbroadcastw m7, [paramsq+10] ; w1
+ add dstq, wq
+ vpbroadcastd m9, [paramsq+ 4] ; s1
+ lea t3, [rsp+wq*2+400*12+8]
+ vpbroadcastd m8, [pd_8]
+ lea t4, [rsp+wq+400*32+8]
+ vpbroadcastd m10, [pd_0xf00801c7]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m11, [pd_34816]
+ neg wq
+ mova xm12, [sgr_lshuf3]
+ pxor m6, m6
+ vpbroadcastd m13, [pw_1023]
+ psllw m7, 4
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+r10+400*0]
+ mova m1, [t1+r10+400*2]
+ mova m2, [t1+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m1
+ mova [t2+r10+400*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.extend_right:
+ vpbroadcastw m0, [lpfq-2]
+ movu m1, [r13+r10+ 2]
+ movu m2, [r13+r10+18]
+ vpblendvb m4, m0, m1
+ vpblendvb m5, m0, m2
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10+ 0]
+.h_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+r10+400*0], m1
+ mova [t1+r10+400*2], m2
+ mova [t1+r10+400*4], m3
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m4, [lpfq+r10+ 0]
+.hv0_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -34
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+r10+400*0]
+ paddd m4, m2, [t1+r10+400*2]
+ paddd m5, m3, [t1+r10+400*4]
+ mova [t1+r10+400*0], m1
+ mova [t1+r10+400*2], m2
+ mova [t1+r10+400*4], m3
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m2
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 12
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, xm12
+ vinserti128 m4, [lpfq+wq+12], 1
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m4, [lpfq+r10+ 0]
+.hv1_main:
+ movu m5, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -34
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m4, m2, [t2+r10+400*2]
+ paddd m5, m3, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m2
+ mova [t2+r10+400*4], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ psrld m5, 4
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2 +4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m2
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m0, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ paddw m1, m0, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ mova [t2+r10+400*0], m0
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaxud m4, m2
+ psubd m4, m2 ; p
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m9 ; p * s
+ pmulld m5, m9
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
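+ ; the per-row a/b sums are pre-combined with 4,4,4 and 3,4,3 horizontal
+ ; neighbor weights (the '444'/'343' values below); .n0/.n1 then sum three
+ ; consecutive rows of these (one 444-weighted, two 343-weighted) to form
+ ; the weights applied to each output pixel.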
+ mov r10, wq
+.prep_n_loop:
+ mova xm0, [t4+r10*1+400*0+0]
+ paddw xm0, [t4+r10*1+400*0+4]
+ paddw xm2, xm0, [t4+r10*1+400*0+2]
+ mova m1, [t3+r10*2+400*0+0]
+ paddd m1, [t3+r10*2+400*0+8]
+ paddd m3, m1, [t3+r10*2+400*0+4]
+ psllw xm2, 2 ; a[-1] 444
+ pslld m3, 2 ; b[-1] 444
+ psubw xm2, xm0 ; a[-1] 343
+ psubd m3, m1 ; b[-1] 343
+ mova [t4+r10*1+400* 4], xm2
+ mova [t3+r10*2+400* 8], m3
+ mova xm0, [t4+r10*1+400*2+0]
+ paddw xm0, [t4+r10*1+400*2+4]
+ paddw xm2, xm0, [t4+r10*1+400*2+2]
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m3, m1, [t3+r10*2+400*4+4]
+ psllw xm2, 2 ; a[ 0] 444
+ pslld m3, 2 ; b[ 0] 444
+ mova [t4+r10*1+400* 6], xm2
+ mova [t3+r10*2+400*12], m3
+ psubw xm2, xm0 ; a[ 0] 343
+ psubd m3, m1 ; b[ 0] 343
+ mova [t4+r10*1+400* 8], xm2
+ mova [t3+r10*2+400*16], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ mova m3, [t4+r10*1+400*0+0]
+ paddw m3, [t4+r10*1+400*0+4]
+ paddw m1, m3, [t4+r10*1+400*0+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+400*4]
+ paddw m3, [t4+r10*1+400*6]
+ mova [t4+r10*1+400*4], m2
+ mova [t4+r10*1+400*6], m1
+ mova m4, [t3+r10*2+400*0+0]
+ paddd m4, [t3+r10*2+400*0+8]
+ paddd m1, m4, [t3+r10*2+400*0+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+r10*2+400* 8+ 0]
+ paddd m4, [t3+r10*2+400*12+ 0]
+ mova [t3+r10*2+400* 8+ 0], m2
+ mova [t3+r10*2+400*12+ 0], m1
+ mova m5, [t3+r10*2+400*0+32]
+ paddd m5, [t3+r10*2+400*0+40]
+ paddd m1, m5, [t3+r10*2+400*0+36]
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+r10*2+400* 8+32]
+ paddd m5, [t3+r10*2+400*12+32]
+ mova [t3+r10*2+400* 8+32], m2
+ mova [t3+r10*2+400*12+32], m1
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t4+r10*1+400*2+0]
+ paddw m3, [t4+r10*1+400*2+4]
+ paddw m1, m3, [t4+r10*1+400*2+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+400*6]
+ paddw m3, [t4+r10*1+400*8]
+ mova [t4+r10*1+400*6], m1
+ mova [t4+r10*1+400*8], m2
+ mova m4, [t3+r10*2+400*4+0]
+ paddd m4, [t3+r10*2+400*4+8]
+ paddd m1, m4, [t3+r10*2+400*4+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+r10*2+400*12+ 0]
+ paddd m4, [t3+r10*2+400*16+ 0]
+ mova [t3+r10*2+400*12+ 0], m1
+ mova [t3+r10*2+400*16+ 0], m2
+ mova m5, [t3+r10*2+400*4+32]
+ paddd m5, [t3+r10*2+400*4+40]
+ paddd m1, m5, [t3+r10*2+400*4+36]
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+r10*2+400*12+32]
+ paddd m5, [t3+r10*2+400*16+32]
+ mova [t3+r10*2+400*12+32], m1
+ mova [t3+r10*2+400*16+32], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vinserti128 m1, m4, xm5, 1
+ vperm2i128 m4, m5, 0x31
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m4, m3
+ psrad m1, 9
+ psrad m4, 9
+ packssdw m1, m4
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
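+; Mixed self-guided filter: runs the 5x5 and 3x3 box passes over the same
+; rows and blends the two filtered outputs with the w0/w1 weights in .n0/.n1.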
+cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x_avx2+256*4]
+ add wd, wd
+ movifnidn hd, hm
+ mov edged, r7m
+ add lpfq, wq
+ vpbroadcastd m15, [paramsq+8] ; w0 w1
+ add dstq, wq
+ vpbroadcastd m13, [paramsq+0] ; s0
+ lea t3, [rsp+wq*2+400*24+8]
+ vpbroadcastd m14, [paramsq+4] ; s1
+ lea t4, [rsp+wq+400*52+8]
+ vpbroadcastd m9, [pd_8]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m10, [pd_34816]
+ neg wq
+ vpbroadcastd m11, [pd_4096]
+ pxor m7, m7
+ vpbroadcastd m12, [pd_0xf00801c7]
+ psllw m15, 2
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup
+ add t1, 400*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+r10+400* 0]
+ mova m1, [t1+r10+400* 2]
+ mova m2, [t1+r10+400* 4]
+ paddw m0, m0
+ mova m3, [t1+r10+400* 6]
+ paddd m1, m1
+ mova m4, [t1+r10+400* 8]
+ paddd m2, m2
+ mova m5, [t1+r10+400*10]
+ mova [t2+r10+400* 0], m0
+ mova [t2+r10+400* 2], m1
+ mova [t2+r10+400* 4], m2
+ mova [t2+r10+400* 6], m3
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10- 2]
+.h_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -36
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.h_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m6, m0, m7
+ pmaddwd m6, m6
+ punpckhwd m0, m7
+ pmaddwd m0, m0
+ paddd m2, m6 ; sumsq3
+ shufpd m6, m4, m5, 0x05
+ punpcklwd m5, m6, m4
+ paddw m8, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ paddd m3, m0
+ mova [t1+r10+400* 6], m1
+ mova [t1+r10+400* 8], m2
+ mova [t1+r10+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m5, m2 ; sumsq5
+ paddd m6, m3
+ mova [t1+r10+400* 0], m8
+ mova [t1+r10+400* 2], m5
+ mova [t1+r10+400* 4], m6
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m4, [lpfq+r10- 2]
+.hv0_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -36
+ jl .hv0_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.hv0_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m6, m0, m7
+ pmaddwd m6, m6
+ punpckhwd m0, m7
+ pmaddwd m0, m0
+ paddd m2, m6 ; h sumsq3
+ shufpd m6, m4, m5, 0x05
+ punpcklwd m5, m6, m4
+ paddw m8, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m5, m2 ; h sumsq5
+ paddd m6, m3
+ mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4?
+ mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd
+ mova [t3+r10*2+400*0+40], m6
+ paddw m8, [t1+r10+400* 0]
+ paddd m5, [t1+r10+400* 2]
+ paddd m6, [t1+r10+400* 4]
+ mova [t1+r10+400* 0], m8
+ mova [t1+r10+400* 2], m5
+ mova [t1+r10+400* 4], m6
+ paddw m0, m1, [t1+r10+400* 6]
+ paddd m4, m2, [t1+r10+400* 8]
+ paddd m5, m3, [t1+r10+400*10]
+ mova [t1+r10+400* 6], m1
+ mova [t1+r10+400* 8], m2
+ mova [t1+r10+400*10], m3
+ paddw m1, m0, [t2+r10+400* 6]
+ paddd m2, m4, [t2+r10+400* 8]
+ paddd m3, m5, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m0
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastq xm5, [leftq]
+ vinserti128 m5, [lpfq+wq], 1
+ mova m4, [lpfq+wq]
+ add leftq, 8
+ palignr m4, m5, 10
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm4, [lpfq+wq]
+ pshufb xm4, [sgr_lshuf5]
+ vinserti128 m4, [lpfq+wq+10], 1
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m4, [lpfq+r10- 2]
+.hv1_main:
+ movu m5, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -36
+ jl .hv1_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right
+.hv1_have_right:
+ palignr m6, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m6, m3
+ punpcklwd m0, m6, m3
+ pmaddwd m0, m0
+ punpckhwd m6, m3
+ pmaddwd m6, m6
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m7
+ pmaddwd m1, m1
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ shufpd m1, m4, m5, 0x05
+ punpckhwd m5, m4, m1
+ paddw m8, m4, m1
+ pmaddwd m5, m5
+ punpcklwd m4, m1
+ pmaddwd m4, m4
+ paddd m6, m3
+ paddw m1, m2, [t2+r10+400* 6]
+ mova [t2+r10+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+r10+400* 8]
+ paddd m3, m6, [t2+r10+400*10]
+ mova [t2+r10+400* 8], m0
+ mova [t2+r10+400*10], m6
+ paddd m4, m0 ; h sumsq5
+ paddd m5, m6
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m0, m2, 3
+ pslld m6, m3, 3
+ paddd m2, m0 ; ((a3 + 8) >> 4) * 9
+ paddd m3, m6
+ psrlw m6, m1, 1
+ pavgw m6, m7 ; (b3 + 2) >> 2
+ punpcklwd m0, m6, m7
+ pmaddwd m0, m0
+ punpckhwd m6, m7
+ pmaddwd m6, m6
+ pmaxud m2, m0
+ psubd m2, m0 ; p3
+ pmaxud m3, m6
+ psubd m3, m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmulld m2, m14 ; p3 * s1
+ pmulld m3, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrad m7, m2, 20 ; min(z3, 255) - 256
+ vpgatherdd m6, [r13+m7*4], m2 ; x3
+ psrad m2, m3, 20
+ vpgatherdd m7, [r13+m2*4], m3
+ pmulld m0, m6
+ packssdw m6, m7
+ pmulld m7, m1
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m7, m10
+ psrld m0, 12
+ psrld m7, 12
+ paddw m1, m8, [t2+r10+400*0]
+ paddd m2, m4, [t2+r10+400*2]
+ paddd m3, m5, [t2+r10+400*4]
+ paddw m1, [t1+r10+400*0]
+ paddd m2, [t1+r10+400*2]
+ paddd m3, [t1+r10+400*4]
+ mova [t2+r10+400*0], m8
+ mova [t2+r10+400*2], m4
+ mova [t2+r10+400*4], m5
+ mova [t4+r10*1+400*4 +4], m6
+ mova [t3+r10*2+400*8+ 8], xm0
+ vextracti128 [t3+r10*2+400*8+40], m0, 1
+ mova [t3+r10*2+400*8+24], xm7
+ vextracti128 [t3+r10*2+400*8+56], m7, 1
+ vpbroadcastd m4, [pd_25]
+ pxor m7, m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
+ pmulld m3, m4
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmaxud m2, m4
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [pd_0xf00800a4]
+ pmaxud m3, m5
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r13+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r13+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ packssdw m4, m5
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m4
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+400* 6]
+ mova m4, [t1+r10+400* 8]
+ mova m5, [t1+r10+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10+400* 6]
+ paddd m2, m4, [t2+r10+400* 8]
+ paddd m3, m5, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m0
+ mova [t2+r10+400* 8], m4
+ mova [t2+r10+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+r10+400*0]
+ mova m4, [t1+r10+400*2]
+ mova m5, [t1+r10+400*4]
+ mova [t3+r10*2+400*8+ 8], m3
+ mova [t3+r10*2+400*0+ 8], m4
+ mova [t3+r10*2+400*0+40], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+r10+400*0], m3
+ mova [t1+r10+400*2], m4
+ mova [t1+r10+400*4], m5
+ mova [t4+r10*1+400*2+ 4], m2
+ mova [t3+r10*2+400*4+ 8], xm0
+ vextracti128 [t3+r10*2+400*4+40], m0, 1
+ mova [t3+r10*2+400*4+24], xm1
+ vextracti128 [t3+r10*2+400*4+56], m1, 1
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m4, [t1+r10+400* 6]
+ mova m5, [t1+r10+400* 8]
+ mova m6, [t1+r10+400*10]
+ paddw m1, m4, [t2+r10+400* 6]
+ paddd m2, m5, [t2+r10+400* 8]
+ paddd m3, m6, [t2+r10+400*10]
+ mova [t2+r10+400* 6], m4
+ mova [t2+r10+400* 8], m5
+ mova [t2+r10+400*10], m6
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pmaxud m4, m2
+ psubd m4, m2 ; p3
+ pmaxud m5, m3
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m12 ; b3 * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r13+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r13+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ packssdw m2, m3
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova [t4+r10*1+400*4+4], m2
+ mova m4, [t3+r10*2+400*8+ 8]
+ mova m5, [t3+r10*2+400*0+ 8]
+ mova m6, [t3+r10*2+400*0+40]
+ paddw m1, m4, [t2+r10+400*0]
+ paddd m2, m5, [t2+r10+400*2]
+ paddd m3, m6, [t2+r10+400*4]
+ paddw m1, [t1+r10+400*0]
+ paddd m2, [t1+r10+400*2]
+ paddd m3, [t1+r10+400*4]
+ mova [t2+r10+400*0], m4
+ mova [t2+r10+400*2], m5
+ mova [t2+r10+400*4], m6
+ vpbroadcastd m4, [pd_25]
+ mova [t3+r10*2+400*8+ 8], xm0
+ vextracti128 [t3+r10*2+400*8+40], m0, 1
+ mova [t3+r10*2+400*8+24], xm8
+ vextracti128 [t3+r10*2+400*8+56], m8, 1
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m4 ; ((a5 + 8) >> 4) * 25
+ pmulld m3, m4
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmaxud m2, m4
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [pd_0xf00800a4]
+ pmaxud m3, m5
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r13+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r13+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ packssdw m4, m5
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ psrld m0, 12
+ psrld m1, 12
+ mova [t4+r10*1+400*0+ 4], m4
+ mova [t3+r10*2+400*0+ 8], xm0
+ vextracti128 [t3+r10*2+400*0+40], m0, 1
+ mova [t3+r10*2+400*0+24], xm1
+ vextracti128 [t3+r10*2+400*0+56], m1, 1
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu xm0, [t4+r10*1+400*0+2]
+ paddw xm2, xm0, [t4+r10*1+400*0+0]
+ paddw xm2, [t4+r10*1+400*0+4]
+ movu m1, [t3+r10*2+400*0+4]
+ paddd m3, m1, [t3+r10*2+400*0+0]
+ paddd m3, [t3+r10*2+400*0+8]
+ paddw xm0, xm2
+ paddd m1, m3
+ psllw xm2, 2
+ pslld m3, 2
+ paddw xm0, xm2 ; a5 565
+ paddd m1, m3 ; b5 565
+ mova [t4+r10*1+400* 6], xm0
+ mova [t3+r10*2+400*12], m1
+ mova xm0, [t4+r10*1+400*2+0]
+ paddw xm0, [t4+r10*1+400*2+4]
+ paddw xm2, xm0, [t4+r10*1+400*2+2]
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m3, m1, [t3+r10*2+400*4+4]
+ psllw xm2, 2 ; a3[-1] 444
+ pslld m3, 2 ; b3[-1] 444
+ psubw xm2, xm0 ; a3[-1] 343
+ psubd m3, m1 ; b3[-1] 343
+ mova [t4+r10*1+400* 8], xm2
+ mova [t3+r10*2+400*16], m3
+ mova xm0, [t4+r10*1+400*4+0]
+ paddw xm0, [t4+r10*1+400*4+4]
+ paddw xm2, xm0, [t4+r10*1+400*4+2]
+ mova m1, [t3+r10*2+400*8+0]
+ paddd m1, [t3+r10*2+400*8+8]
+ paddd m3, m1, [t3+r10*2+400*8+4]
+ psllw xm2, 2 ; a3[ 0] 444
+ pslld m3, 2 ; b3[ 0] 444
+ mova [t4+r10*1+400*10], xm2
+ mova [t3+r10*2+400*20], m3
+ psubw xm2, xm0 ; a3[ 0] 343
+ psubd m3, m1 ; b3[ 0] 343
+ mova [t4+r10*1+400*12], xm2
+ mova [t3+r10*2+400*24], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu xm2, [t4+r10*1+2]
+ paddw xm0, xm2, [t4+r10*1+0]
+ paddw xm0, [t4+r10*1+4]
+ paddw xm2, xm0
+ psllw xm0, 2
+ paddw xm0, xm2 ; a5
+ movu m1, [t3+r10*2+4]
+ paddd m4, m1, [t3+r10*2+0]
+ paddd m4, [t3+r10*2+8]
+ paddd m1, m4
+ pslld m4, 2
+ paddd m4, m1 ; b5
+ paddw xm2, xm0, [t4+r10*1+400* 6]
+ mova [t4+r10*1+400* 6], xm0
+ paddd m0, m4, [t3+r10*2+400*12]
+ mova [t3+r10*2+400*12], m4
+ mova xm3, [t4+r10*1+400*2+0]
+ paddw xm3, [t4+r10*1+400*2+4]
+ paddw xm5, xm3, [t4+r10*1+400*2+2]
+ psllw xm5, 2 ; a3[ 1] 444
+ psubw xm4, xm5, xm3 ; a3[ 1] 343
+ paddw xm3, xm4, [t4+r10*1+400* 8]
+ paddw xm3, [t4+r10*1+400*10]
+ mova [t4+r10*1+400* 8], xm4
+ mova [t4+r10*1+400*10], xm5
+ mova m1, [t3+r10*2+400*4+0]
+ paddd m1, [t3+r10*2+400*4+8]
+ paddd m5, m1, [t3+r10*2+400*4+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+400*16]
+ paddd m1, [t3+r10*2+400*20]
+ mova [t3+r10*2+400*16], m4
+ mova [t3+r10*2+400*20], m5
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, xm2 ; a5
+ pmovzxwd m3, xm3 ; a3
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ pslld m4, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ paddd m4, m11
+ paddd m0, m4
+ psrad m0, 7
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1 ; clip
+ psrlw xm0, 6
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova xm3, [t4+r10*1+400*4+0]
+ paddw xm3, [t4+r10*1+400*4+4]
+ paddw xm5, xm3, [t4+r10*1+400*4+2]
+ psllw xm5, 2 ; a3[ 1] 444
+ psubw xm4, xm5, xm3 ; a3[ 1] 343
+ paddw xm3, xm4, [t4+r10*1+400*12]
+ paddw xm3, [t4+r10*1+400*10]
+ mova [t4+r10*1+400*10], xm5
+ mova [t4+r10*1+400*12], xm4
+ mova m1, [t3+r10*2+400*8+0]
+ paddd m1, [t3+r10*2+400*8+8]
+ paddd m5, m1, [t3+r10*2+400*8+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+400*24]
+ paddd m1, [t3+r10*2+400*20]
+ mova [t3+r10*2+400*20], m5
+ mova [t3+r10*2+400*24], m4
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, [t4+r10*1+400* 6]
+ pmovzxwd m3, xm3
+ mova m0, [t3+r10*2+400*12]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ pslld m4, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 8
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ paddd m4, m11
+ paddd m0, m4
+ psrad m0, 7
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1 ; clip
+ psrlw xm0, 6
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx512.asm b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
new file mode 100644
index 0000000000..e560c54a40
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_avx512.asm
@@ -0,0 +1,2524 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+r_ext_mask: times 72 db -1
+ times 8 db 0
+wiener_hshift: dw 4, 4, 1, 1
+wiener_vshift: dw 1024, 1024, 4096, 4096
+wiener_round: dd 1049600, 1048832
+
+pw_164_455: dw 164, 455
+pw_1023: times 2 dw 1023
+pw_61448: times 2 dw 61448
+pd_m262128: dd -262128
+pd_m34816: dd -34816
+pd_m25: dd -25
+pd_m9: dd -9
+pd_8: dd 8
+pd_2147483648: dd 2147483648
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
+INIT_ZMM avx512icl
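+; 7-tap Wiener filter: .h/.hv store horizontally filtered rows in a ring
+; buffer addressed by t0-t6; the vertical pass combines them per output row.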
+cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base t4-wiener_hshift
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastd m12, [fltq+ 0] ; x0 x1
+ lea t4, [wiener_hshift]
+ vbroadcasti128 m7, [wiener_shufB]
+ add wd, wd
+ vpbroadcastd m13, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add lpfq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ add dstq, wq
+ vbroadcasti128 m8, [wiener_shufC]
+ lea t1, [rsp+wq+16]
+ vbroadcasti128 m9, [wiener_shufD]
+ neg wq
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ mov r10d, 0xfe
+ vpbroadcastd m10, [base+wiener_round+t3*4]
+ kmovb k1, r10d
+ vpbroadcastd m11, [base+wiener_vshift+t3*4]
+ pmullw m12, m0 ; upshift filter coefs to make the
+ vpbroadcastd m16, [pd_m262128]
+ pmullw m13, m0 ; horizontal downshift constant
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm3, [leftq]
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .h_main2
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-8]
+.h_main:
+ mova m4, [lpfq+r10+0]
+.h_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ push r0
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.h_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16
+ vpdpwssd m0, m2, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13
+ pshufb m2, m5, m7
+ paddw m2, m1
+ mova m1, m16
+ pshufb m4, m8
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm3, [leftq]
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m4, [lpfq+r10+0]
+ vpbroadcastw xm3, xm4
+ vmovdqu64 m3{k1}, [lpfq+r10-8]
+ jmp .hv_main2
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-8]
+.hv_main:
+ mova m4, [lpfq+r10+0]
+.hv_main2:
+ movu m5, [lpfq+r10+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right
+ push r0
+ lea r0, [r_ext_mask+66]
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r0+r10+ 0], 0xe4
+ vpternlogd m4, m0, [r0+r10+ 8], 0xe4
+ vpternlogd m5, m0, [r0+r10+16], 0xe4
+ pop r0
+.hv_have_right:
+ pshufb m2, m3, m6
+ pshufb m1, m4, m7
+ paddw m2, m1
+ pshufb m3, m8
+ mova m0, m16
+ vpdpwssd m0, m2, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ vpdpwssd m0, m3, m13
+ pshufb m2, m5, m7
+ paddw m2, m1
+ pshufb m4, m8
+ mova m1, m16
+ vpdpwssd m1, m2, m12
+ pshufb m5, m9
+ paddw m4, m5
+ vpdpwssd m1, m4, m13
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m5, [t3+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova m4, [t5+r10]
+ paddw m4, [t1+r10]
+ psraw m0, 1
+ paddw m3, m0, [t6+r10]
+ mova [t0+r10], m0
+ punpcklwd m1, m2, m5
+ mova m0, m10
+ vpdpwssd m0, m1, m15
+ punpckhwd m2, m5
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10]
+ paddw m2, [t2+r10]
+ mova m3, [t3+r10]
+ punpcklwd m1, m2, m3
+ mova m0, m10
+ vpdpwssd m0, m1, m15
+ punpckhwd m2, m3
+ mova m1, m10
+ vpdpwssd m1, m2, m15
+ mova m4, [t1+r10]
+ paddw m3, m4, [t6+r10]
+ paddw m4, [t5+r10]
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ psrad m0, 5
+ psrad m1, 5
+ packusdw m0, m1
+ pmulhuw m0, m11
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
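+; 5-tap Wiener filter: same structure as the 7-tap version above, but with a
+; smaller ring buffer (t0-t4).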
+cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+%define base r13-r_ext_mask-70
+ mov fltq, r6mp
+ movifnidn wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ vbroadcasti128 m5, [wiener_shufE]
+ vpbroadcastw m11, [fltq+ 2] ; x1
+ vbroadcasti128 m6, [wiener_shufB]
+ lea r13, [r_ext_mask+70]
+ vbroadcasti128 m7, [wiener_shufD]
+ add wd, wd
+ vpbroadcastd m12, [fltq+ 4] ; x2 x3
+ shr t3d, 11
+ vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ vpbroadcastw m13, [fltq+18] ; y1
+ add dstq, wq
+ vpbroadcastd m14, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq+16]
+ vpbroadcastd m0, [base+wiener_hshift+t3*4]
+ neg wq
+ vpbroadcastd m9, [base+wiener_round+t3*4]
+ mov r10d, 0xfffe
+ vpbroadcastd m10, [base+wiener_vshift+t3*4]
+ kmovw k1, r10d
+ pmullw m11, m0
+ pmullw m12, m0
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm3, [leftq+4]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm3, [lpfq+r10]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+r10-4]
+.h_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.h_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8
+ vpdpwssd m0, m1, m11
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ vpdpwssd m1, m3, m12
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+r10], m0
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm3, [leftq+4]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm3, [lpfq+r10]
+ vmovdqu32 m3{k1}, [lpfq+r10-4]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+r10-4]
+.hv_main:
+ movu m4, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -66
+ jl .hv_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m3, m0, [r13+r10+0], 0xe4
+ vpternlogd m4, m0, [r13+r10+8], 0xe4
+.hv_have_right:
+ pshufb m1, m3, m5
+ mova m0, m8
+ vpdpwssd m0, m1, m11
+ pshufb m2, m4, m5
+ mova m1, m8
+ vpdpwssd m1, m2, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ vpdpwssd m0, m2, m12
+ pshufb m4, m7
+ paddw m4, m3
+ vpdpwssd m1, m4, m12
+ mova m2, [t3+r10]
+ paddw m2, [t1+r10]
+ mova m3, [t2+r10]
+ punpcklwd m4, m2, m3
+ punpckhwd m2, m3
+ mova m3, m9
+ vpdpwssd m3, m2, m14
+ mova m2, m9
+ vpdpwssd m2, m4, m14
+ mova m4, [t4+r10]
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t0+r10], m0
+ punpcklwd m1, m0, m4
+ vpdpwssd m2, m1, m13
+ punpckhwd m0, m4
+ vpdpwssd m3, m0, m13
+ psrad m2, 5
+ psrad m3, 5
+ packusdw m2, m3
+ pmulhuw m2, m10
+ mova [dstq+r10], m2
+ add r10, 64
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m0, [t1+r10]
+ paddw m2, m0, [t3+r10]
+ mova m1, [t2+r10]
+ mova m4, [t4+r10]
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 5
+ psrad m0, 5
+ packusdw m0, m1
+ pmulhuw m0, m10
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ ret
+
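+; Self-guided 5x5 filter: .h computes per-row box sums and sums of squares,
+; .hv/.v accumulate them vertically and look up a/b via sgr_x_by_x, and
+; .n0/.n1 apply the 565-weighted neighbor sums to produce the output rows.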
+cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r13-r_ext_mask-72
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+8] ; w0
+ add wd, wd
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq
+ vpbroadcastd m9, [base+pd_m25]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*12+8]
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*20+8]
+ vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3)
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15))
+ neg wq
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10d, 0xfffffff8
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m2, m17, m16, 2
+ paddw m0, m16, m2
+ palignr m3, m17, m16, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m3, m16, m17
+ vpdpwssd m1, m3, m3
+ punpckhwd m3, m16, m17
+ vpdpwssd m2, m3, m3
+ shufps m16, m17, q2121
+ paddw m0, m16 ; sum
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10+416*0]
+ paddd m1, [t1+r10+416*2]
+ paddd m2, [t1+r10+416*4]
+.h_loop_end:
+ punpcklwd m17, m16, m6
+ vpdpwssd m1, m17, m17 ; sumsq
+ punpckhwd m16, m6
+ vpdpwssd m2, m16, m16
+ mova [t1+r10+416*0], m0
+ mova [t1+r10+416*2], m1
+ mova [t1+r10+416*4], m2
+ add r10, 64
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-4]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10+416*0]
+ mova m1, [t1+r10+416*2]
+ mova m2, [t1+r10+416*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m16, [lpfq+r10- 2]
+.hv_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -68
+ jl .hv_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv_have_right:
+ palignr m3, m17, m16, 2
+ paddw m0, m16, m3
+ palignr m1, m17, m16, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ shufpd m17, m16, m17, 0x55
+ paddw m0, m17
+ punpcklwd m1, m16, m17
+ vpdpwssd m2, m1, m1
+ punpckhwd m1, m16, m17
+ vpdpwssd m3, m1, m1
+ shufps m16, m17, q2121
+ paddw m0, m16 ; h sum
+ punpcklwd m17, m16, m6
+ vpdpwssd m2, m17, m17 ; h sumsq
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ paddw m1, m0, [t1+r10+416*0]
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10+416*0] ; hv sum
+ paddd m16, [t2+r10+416*2] ; hv sumsq
+ paddd m17, [t2+r10+416*4]
+ mova [t0+r10+416*0], m0
+ mova [t0+r10+416*2], m2
+ mova [t0+r10+416*4], m3
+ psrlw m3, m1, 1
+ paddd m16, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ pmaxsw m17, m6
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10+416*0], m1
+ paddw m1, m0
+ mova [t1+r10+416*2], m16
+ paddd m16, m2
+ mova [t1+r10+416*4], m17
+ paddd m17, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-4]
+.v_loop:
+ mova m2, [t1+r10+416*2]
+ mova m3, [t1+r10+416*4]
+ mova m0, [t1+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2 ; hv sumsq
+ paddd m17, m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ paddw m0, m0
+ paddw m1, m0 ; hv sum
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ pmaxsw m17, m6
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10+4], m16
+ psrld m16, m0, 12 ; b
+ psrld m17, m1, 12
+ mova [t3+r10*2+ 8], xm16
+ mova [t3+r10*2+ 24], xm17
+ vextracti128 [t3+r10*2+ 40], ym16, 1
+ vextracti128 [t3+r10*2+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+104], m16, 3
+ vextracti32x4 [t3+r10*2+120], m17, 3
+ add r10, 64
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ mova [t4+r10*1+416*2+ 0], m0
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ add r10, 64
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t4+r10*1+ 2]
+ movu m1, [t3+r10*2+ 4]
+ movu m2, [t3+r10*2+68]
+ paddw m3, m0, [t4+r10*1+ 0]
+ paddd m16, m1, [t3+r10*2+ 0]
+ paddd m17, m2, [t3+r10*2+64]
+ paddw m3, [t4+r10*1+ 4]
+ paddd m16, [t3+r10*2+ 8]
+ paddd m17, [t3+r10*2+72]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m16
+ pslld m16, 2
+ paddd m2, m17
+ pslld m17, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m16 ; b 565
+ paddd m2, m17
+ paddw m3, m0, [t4+r10*1+416*2+ 0]
+ paddd m16, m1, [t3+r10*2+416*4+ 0]
+ paddd m17, m2, [t3+r10*2+416*4+64]
+ mova [t4+r10*1+416*2+ 0], m0
+ mova [t3+r10*2+416*4+ 0], m1
+ mova [t3+r10*2+416*4+64], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m0, [dstq+r10]
+ mova m3, [t4+r10*1+416*2+ 0]
+ mova m16, [t3+r10*2+416*4+ 0]
+ mova m17, [t3+r10*2+416*4+64]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 7)
+ psubd m16, m3
+ psrad m1, 8
+ psrad m16, 8
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
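+; Self-guided 3x3 filter: processes two rows per iteration (.hv0/.hv1) and
+; uses the 343/444 neighbor weighting in .prep_n/.n0/.n1.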
+cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+10] ; w1
+ add wd, wd
+ vpbroadcastd m8, [base+pd_8]
+ add lpfq, wq
+ vpbroadcastd m9, [base+pd_m9]
+ add dstq, wq
+ vpsubd m10, m6, [paramsq+4] {1to16} ; -s1
+ lea t3, [rsp+wq*2+416*12+8]
+ vpbroadcastd m11, [base+pw_164_455]
+ lea t4, [rsp+wq+416*32+8]
+ vpbroadcastd m12, [base+pw_61448]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m13, [base+pd_m34816]
+ neg wq
+ vpbroadcastd m14, [base+pw_1023]
+ psllw m7, 4
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10d, 0xfffffffc
+ mova m19, [sgr_x_by_x+64*1]
+ kmovd k1, r10d
+ mova m20, [sgr_x_by_x+64*2]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*3]
+ kmovq k2, r10
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*6]
+.top_fixup_loop:
+ mova m0, [t1+r10+416*0]
+ mova m1, [t1+r10+416*2]
+ mova m2, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m1
+ mova [t2+r10+416*4], m2
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10+ 0]
+.h_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv0_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -66
+ jl .hv0_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m0, m17, m16, 2
+ paddw m1, m16, m0
+ punpcklwd m2, m16, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m0
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m1, m17 ; sum
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m0, m1, [t1+r10+416*0]
+ paddd m16, m2, [t1+r10+416*2]
+ paddd m17, m3, [t1+r10+416*4]
+ mova [t1+r10+416*0], m1
+ mova [t1+r10+416*2], m2
+ mova [t1+r10+416*4], m3
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movd xm16, [leftq+4]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ add leftq, 8
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-4]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m16, [lpfq+r10+ 0]
+.hv1_main:
+ movu m17, [lpfq+r10+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -66
+ jl .hv1_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2
+ paddw m0, m16, m1
+ punpcklwd m2, m16, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m16, m1
+ pmaddwd m3, m3
+ palignr m17, m16, 4
+ paddw m0, m17 ; h sum
+ punpcklwd m1, m17, m6
+ vpdpwssd m2, m1, m1 ; h sumsq
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m16, m2, [t2+r10+416*2]
+ paddd m17, m3, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m2
+ mova [t2+r10+416*4], m3
+ paddd m16, m8
+ paddd m17, m8
+ psrld m16, 4 ; (a + 8) >> 4
+ psrld m17, 4
+ pmulld m16, m9 ; -((a + 8) >> 4) * 9
+ pmulld m17, m9
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m16, m2, m2 ; -p
+ punpckhwd m3, m6
+ vpdpwssd m17, m3, m3
+ punpcklwd m0, m6, m1 ; b
+ punpckhwd m1, m6, m1
+ pminsd m16, m6
+ pminsd m17, m6
+ pmulld m16, m10 ; p * s
+ pmulld m17, m10
+ pmaddwd m0, m11 ; b * 455
+ pmaddwd m1, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ packssdw m16, m17
+ psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m1, m13
+ mova [t4+r10*1+416*2+4], m16
+ psrld m16, m0, 12
+ psrld m17, m1, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m0, [t1+r10+416*0]
+ mova m16, [t1+r10+416*2]
+ mova m17, [t1+r10+416*4]
+ paddw m0, m0
+ paddd m16, m16
+ paddd m17, m17
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*0+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m0, [t1+r10+416*0]
+ mova m16, [t1+r10+416*2]
+ mova m17, [t1+r10+416*4]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m16, [t2+r10+416*2]
+ paddd m3, m17, [t2+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m16
+ mova [t2+r10+416*4], m17
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m17, m1, 1
+ pavgw m17, m6 ; (b + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m6, m1 ; b
+ punpckhwd m17, m6, m1
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m10 ; p * s
+ pmulld m3, m10
+ pmaddwd m16, m11 ; b * 455
+ pmaddwd m17, m11
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m12
+ psraw m3, 4 ; min(z, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x
+ pandn m2, m13, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m13
+ mova [t4+r10*1+416*2+4], m2
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ mova ym16, [t4+r10*1+416*0+0]
+ paddw ym16, [t4+r10*1+416*0+4]
+ paddw ym17, ym16, [t4+r10*1+416*0+2]
+ mova m0, [t3+r10*2+416*0+0]
+ paddd m0, [t3+r10*2+416*0+8]
+ paddd m1, m0, [t3+r10*2+416*0+4]
+ psllw ym17, 2 ; a[-1] 444
+ pslld m1, 2 ; b[-1] 444
+ psubw ym17, ym16 ; a[-1] 343
+ psubd m1, m0 ; b[-1] 343
+ vmovdqa32 [t4+r10*1+416* 4], ym17
+ vmovdqa32 [t3+r10*2+416* 8], m1
+ mova ym16, [t4+r10*1+416*2+0]
+ paddw ym16, [t4+r10*1+416*2+4]
+ paddw ym17, ym16, [t4+r10*1+416*2+2]
+ mova m0, [t3+r10*2+416*4+0]
+ paddd m0, [t3+r10*2+416*4+8]
+ paddd m1, m0, [t3+r10*2+416*4+4]
+ psllw ym17, 2 ; a[ 0] 444
+ pslld m1, 2 ; b[ 0] 444
+ vmovdqa32 [t4+r10*1+416* 6], ym17
+ vmovdqa32 [t3+r10*2+416*12], m1
+ psubw ym17, ym16 ; a[ 0] 343
+ psubd m1, m0 ; b[ 0] 343
+ vmovdqa32 [t4+r10*1+416* 8], ym17
+ vmovdqa32 [t3+r10*2+416*16], m1
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ mova m3, [t4+r10*1+416*0+0]
+ paddw m3, [t4+r10*1+416*0+4]
+ paddw m1, m3, [t4+r10*1+416*0+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*4]
+ paddw m3, [t4+r10*1+416*6]
+ mova [t4+r10*1+416*4], m2
+ mova [t4+r10*1+416*6], m1
+ mova m16, [t3+r10*2+416*0+0]
+ paddd m16, [t3+r10*2+416*0+8]
+ paddd m1, m16, [t3+r10*2+416*0+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416* 8+ 0]
+ paddd m16, [t3+r10*2+416*12+ 0]
+ mova [t3+r10*2+416* 8+ 0], m2
+ mova [t3+r10*2+416*12+ 0], m1
+ mova m17, [t3+r10*2+416*0+64]
+ paddd m17, [t3+r10*2+416*0+72]
+ paddd m1, m17, [t3+r10*2+416*0+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416* 8+64]
+ paddd m17, [t3+r10*2+416*12+64]
+ mova [t3+r10*2+416* 8+64], m2
+ mova [t3+r10*2+416*12+64], m1
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t4+r10*1+416*2+0]
+ paddw m3, [t4+r10*1+416*2+4]
+ paddw m1, m3, [t4+r10*1+416*2+2]
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+r10*1+416*6]
+ paddw m3, [t4+r10*1+416*8]
+ mova [t4+r10*1+416*6], m1
+ mova [t4+r10*1+416*8], m2
+ mova m16, [t3+r10*2+416*4+0]
+ paddd m16, [t3+r10*2+416*4+8]
+ paddd m1, m16, [t3+r10*2+416*4+4]
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m16 ; b[ 1] 343
+ paddd m16, m2, [t3+r10*2+416*12+ 0]
+ paddd m16, [t3+r10*2+416*16+ 0]
+ mova [t3+r10*2+416*12+ 0], m1
+ mova [t3+r10*2+416*16+ 0], m2
+ mova m17, [t3+r10*2+416*4+64]
+ paddd m17, [t3+r10*2+416*4+72]
+ paddd m1, m17, [t3+r10*2+416*4+68]
+ pslld m1, 2
+ psubd m2, m1, m17
+ paddd m17, m2, [t3+r10*2+416*12+64]
+ paddd m17, [t3+r10*2+416*16+64]
+ mova [t3+r10*2+416*12+64], m1
+ mova [t3+r10*2+416*16+64], m2
+ mova m0, [dstq+r10]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ vshufi32x4 m1, m16, m17, q2020
+ vshufi32x4 m16, m17, q3131
+ psubd m1, m2 ; b - a * src + (1 << 8)
+ psubd m16, m3
+ psrad m1, 9
+ psrad m16, 9
+ packssdw m1, m16
+ pmulhrsw m1, m7
+ paddw m0, m1
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+r10], m0
+ add r10, 64
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ movifnidn wd, wm
+ mov paramsq, r6mp
+ lea r13, [r_ext_mask+72]
+ mov edged, r7m
+ movifnidn hd, hm
+ vpbroadcastd m7, [paramsq+8] ; w0 w1
+ pxor m6, m6
+ vpbroadcastd m8, [base+pd_8]
+ add wd, wd
+ vpbroadcastd m9, [base+pd_m9]
+ add lpfq, wq
+ vpbroadcastd m10, [base+pd_m25]
+ add dstq, wq
+ vpsubd m11, m6, [paramsq+0] {1to16} ; -s0
+ lea t3, [rsp+wq*2+416*24+8]
+ vpsubd m12, m6, [paramsq+4] {1to16} ; -s1
+ lea t4, [rsp+wq+416*52+8]
+ vpbroadcastd m13, [base+pw_164_455]
+ lea t1, [rsp+wq+12]
+ vpbroadcastd m14, [base+pw_61448]
+ neg wq
+ vpbroadcastd m15, [base+pd_m34816]
+ psllw m7, 2
+ vpbroadcastd m22, [base+pd_2147483648]
+ mov r10d, 0xfffffff8
+ mova m18, [sgr_x_by_x+64*0]
+ kmovd k1, r10d
+ mova m19, [sgr_x_by_x+64*1]
+ mov r10, 0x3333333333333333
+ mova m20, [sgr_x_by_x+64*2]
+ kmovq k2, r10
+ mova m21, [sgr_x_by_x+64*3]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup
+ add t1, 416*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea r10, [wq-4]
+ lea t2, [t1+416*12]
+.top_fixup_loop:
+ mova m0, [t1+r10+416* 0]
+ mova m1, [t1+r10+416* 2]
+ mova m2, [t1+r10+416* 4]
+ paddw m0, m0
+ mova m3, [t1+r10+416* 6]
+ paddd m1, m1
+ mova m4, [t1+r10+416* 8]
+ paddd m2, m2
+ mova m5, [t1+r10+416*10]
+ mova [t2+r10+416* 0], m0
+ mova [t2+r10+416* 2], m1
+ mova [t2+r10+416* 4], m2
+ mova [t2+r10+416* 6], m3
+ mova [t2+r10+416* 8], m4
+ mova [t2+r10+416*10], m5
+ add r10, 64
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10- 2]
+.h_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -68
+ jl .h_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.h_have_right:
+ palignr m3, m17, m16, 2
+ palignr m0, m17, m16, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m4, m16, m17, 0x55
+ punpcklwd m17, m4, m16
+ paddw m0, m16, m4
+ punpckhwd m4, m16
+ mova [t1+r10+416* 6], m1
+ mova [t1+r10+416* 8], m2
+ mova [t1+r10+416*10], m3
+ paddw m1, m0 ; sum5
+ vpdpwssd m2, m17, m17 ; sumsq5
+ vpdpwssd m3, m4, m4
+ mova [t1+r10+416* 0], m1
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu m16, [lpfq+r10- 2]
+.hv0_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -68
+ jl .hv0_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv0_have_right:
+ palignr m3, m17, m16, 2
+ palignr m0, m17, m16, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m17, m16, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m4, m0, m6
+ vpdpwssd m2, m4, m4 ; h sumsq3
+ punpckhwd m0, m6
+ vpdpwssd m3, m0, m0
+ shufpd m17, m16, m17, 0x55
+ paddw m4, m1, [t1+r10+416* 6]
+ paddd m5, m2, [t1+r10+416* 8]
+ mova [t1+r10+416* 6], m1
+ mova [t1+r10+416* 8], m2
+ paddw m1, m16
+ paddw m1, m17 ; h sum5
+ punpcklwd m0, m17, m16
+ vpdpwssd m2, m0, m0 ; h sumsq5
+ paddd m0, m3, [t1+r10+416*10]
+ mova [t1+r10+416*10], m3
+ punpckhwd m17, m16
+ vpdpwssd m3, m17, m17
+ mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row
+ mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd
+ mova [t3+r10*2+416*0+72], m3
+ paddw m1, [t1+r10+416* 0]
+ paddd m2, [t1+r10+416* 2]
+ paddd m3, [t1+r10+416* 4]
+ mova [t1+r10+416* 0], m1
+ mova [t1+r10+416* 2], m2
+ mova [t1+r10+416* 4], m3
+ paddw m17, m4, [t2+r10+416* 6]
+ paddd m2, m5, [t2+r10+416* 8]
+ paddd m3, m0, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m4
+ mova [t2+r10+416* 8], m5
+ mova [t2+r10+416*10], m0
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m5, m17, 1
+ pavgw m5, m6 ; (b3 + 2) >> 2
+ punpcklwd m4, m5, m6
+ vpdpwssd m2, m4, m4 ; -p3
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m14
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*2+4], m2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movq xm16, [leftq+2]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ add leftq, 8
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastw xm16, [lpfq+wq]
+ vmovdqu16 m16{k1}, [lpfq+wq-6]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu m16, [lpfq+r10- 2]
+.hv1_main:
+ movu m17, [lpfq+r10+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -68
+ jl .hv1_have_right
+ vpbroadcastw m0, [lpfq-2]
+ vpternlogd m16, m0, [r13+r10+ 0], 0xe4
+ vpternlogd m17, m0, [r13+r10+16], 0xe4
+.hv1_have_right:
+ palignr m1, m17, m16, 2
+ palignr m3, m17, m16, 4
+ paddw m2, m1, m3
+ punpcklwd m0, m1, m3
+ pmaddwd m0, m0
+ punpckhwd m1, m3
+ pmaddwd m1, m1
+ palignr m3, m17, m16, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m5, m3, m6
+ vpdpwssd m0, m5, m5 ; h sumsq3
+ punpckhwd m3, m6
+ vpdpwssd m1, m3, m3
+ shufpd m3, m16, m17, 0x55
+ punpcklwd m5, m16, m3
+ paddw m4, m16, m3
+ punpckhwd m16, m3
+ paddw m17, m2, [t2+r10+416* 6]
+ mova [t2+r10+416* 6], m2
+ paddw m4, m2 ; h sum5
+ paddd m2, m0, [t2+r10+416* 8]
+ paddd m3, m1, [t2+r10+416*10]
+ mova [t2+r10+416* 8], m0
+ mova [t2+r10+416*10], m1
+ vpdpwssd m0, m5, m5 ; h sumsq5
+ vpdpwssd m1, m16, m16
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m3, m9
+ psrlw m16, m17, 1
+ pavgw m16, m6 ; (b3 + 2) >> 2
+ punpcklwd m5, m16, m6
+ vpdpwssd m2, m5, m5 ; -p3
+ punpckhwd m16, m6
+ vpdpwssd m3, m16, m16
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m2, m6
+ pminsd m3, m6
+ pmulld m2, m12 ; p3 * s1
+ pmulld m3, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ paddusw m3, m14
+ psraw m3, 4 ; min(z3, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x3
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*4+4], m2
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ paddw m5, m4, [t2+r10+416*0]
+ paddd m2, m0, [t2+r10+416*2]
+ paddd m3, m1, [t2+r10+416*4]
+ paddw m5, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m4
+ mova [t2+r10+416*2], m0
+ mova [t2+r10+416*4], m1
+ mova [t3+r10*2+416*8+ 8], xm16
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m17, m5, 1
+ pavgw m17, m6 ; (b5 + 2) >> 2
+ punpcklwd m16, m17, m6
+ vpdpwssd m2, m16, m16 ; -p5
+ punpckhwd m17, m6
+ vpdpwssd m3, m17, m17
+ punpcklwd m16, m5, m6 ; b5
+ punpckhwd m17, m5, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ pmaxsw m3, m6
+ paddusw m3, m14
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*0+4], m2
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-4]
+.v0_loop:
+ mova m16, [t1+r10+416* 6]
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m16, m16
+ paddd m2, m2
+ paddd m3, m3
+ paddw m17, m16, [t2+r10+416* 6]
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2
+ mova m4, m20
+ paddusw m5, m14
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5
+ psrld m5, 16
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5
+ mova [t4+r10*1+416*2+4], m4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m3, [t1+r10+416*0]
+ mova m4, [t1+r10+416*2]
+ mova m5, [t1+r10+416*4]
+ mova [t3+r10*2+416*8+ 8], m3
+ mova [t3+r10*2+416*0+ 8], m4
+ mova [t3+r10*2+416*0+72], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+r10+416*0], m3
+ mova [t1+r10+416*2], m4
+ mova [t1+r10+416*4], m5
+ mova [t3+r10*2+416*4+ 8], xm16
+ mova [t3+r10*2+416*4+ 24], xm17
+ vextracti128 [t3+r10*2+416*4+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*4+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*4+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*4+120], m17, 3
+ add r10, 64
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-4]
+.v1_loop:
+ mova m16, [t1+r10+416* 6]
+ mova m2, [t1+r10+416* 8]
+ mova m3, [t1+r10+416*10]
+ paddw m17, m16, [t2+r10+416* 6]
+ paddd m4, m2, [t2+r10+416* 8]
+ paddd m5, m3, [t2+r10+416*10]
+ mova [t2+r10+416* 6], m16
+ mova [t2+r10+416* 8], m2
+ mova [t2+r10+416*10], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a3 + 8) >> 4
+ psrld m5, 4
+ pmulld m4, m9 ; -((a3 + 8) >> 4) * 9
+ pmulld m5, m9
+ psrlw m3, m17, 1
+ pavgw m3, m6 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m6
+ vpdpwssd m4, m2, m2 ; -p3
+ punpckhwd m3, m6
+ vpdpwssd m5, m3, m3
+ punpcklwd m16, m6, m17 ; b3
+ punpckhwd m17, m6, m17
+ pminsd m4, m6
+ pminsd m5, m6
+ pmulld m4, m12 ; p3 * s1
+ pmulld m5, m12
+ pmaddwd m16, m13 ; b3 * 455
+ pmaddwd m17, m13
+ vpalignr m5{k2}, m4, m4, 2
+ mova m4, m20
+ paddusw m5, m14
+ psraw m5, 4 ; min(z3, 255) - 256
+ vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m5
+ vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m5{k3}, m4 ; x3
+ pandn m4, m15, m5
+ psrld m5, 16
+ pmulld m16, m4
+ pmulld m17, m5
+ packssdw m4, m5
+ mova [t4+r10*1+416*4+4], m4
+ psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova m0, [t3+r10*2+416*8+ 8]
+ mova m4, [t3+r10*2+416*0+ 8]
+ mova m5, [t3+r10*2+416*0+72]
+ paddw m1, m0, [t2+r10+416*0]
+ paddd m2, m4, [t2+r10+416*2]
+ paddd m3, m5, [t2+r10+416*4]
+ paddw m1, [t1+r10+416*0]
+ paddd m2, [t1+r10+416*2]
+ paddd m3, [t1+r10+416*4]
+ mova [t2+r10+416*0], m0
+ mova [t2+r10+416*2], m4
+ mova [t2+r10+416*4], m5
+ mova [t3+r10*2+416*8+ 8], xm16
+ mova [t3+r10*2+416*8+ 24], xm17
+ vextracti128 [t3+r10*2+416*8+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*8+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*8+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*8+120], m17, 3
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ pmulld m2, m10 ; -((a5 + 8) >> 4) * 25
+ pmulld m3, m10
+ psrlw m5, m1, 1
+ pavgw m5, m6 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m6
+ vpdpwssd m2, m4, m4 ; -p5
+ punpckhwd m5, m6
+ vpdpwssd m3, m5, m5
+ punpcklwd m16, m1, m6 ; b5
+ punpckhwd m17, m1, m6
+ pmulld m2, m11 ; p5 * s0
+ pmulld m3, m11
+ pmaddwd m16, m13 ; b5 * 164
+ pmaddwd m17, m13
+ vpalignr m3{k2}, m2, m2, 2
+ mova m2, m20
+ pmaxsw m3, m6
+ paddusw m3, m14
+ psraw m3, 4 ; min(z5, 255) - 256
+ vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m3
+ vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m3{k3}, m2 ; x5
+ pandn m2, m15, m3
+ psrld m3, 16
+ pmulld m16, m2
+ pmulld m17, m3
+ packssdw m2, m3
+ mova [t4+r10*1+416*0+4], m2
+ psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ psubd m17, m15
+ psrld m16, 12
+ psrld m17, 12
+ mova [t3+r10*2+416*0+ 8], xm16
+ mova [t3+r10*2+416*0+ 24], xm17
+ vextracti128 [t3+r10*2+416*0+ 40], ym16, 1
+ vextracti128 [t3+r10*2+416*0+ 56], ym17, 1
+ vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2
+ vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2
+ vextracti32x4 [t3+r10*2+416*0+104], m16, 3
+ vextracti32x4 [t3+r10*2+416*0+120], m17, 3
+ add r10, 64
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu ym0, [t4+r10*1+416*0+2]
+ paddw ym2, ym0, [t4+r10*1+416*0+0]
+ paddw ym2, [t4+r10*1+416*0+4]
+ movu m1, [t3+r10*2+416*0+4]
+ paddd m3, m1, [t3+r10*2+416*0+0]
+ paddd m3, [t3+r10*2+416*0+8]
+ paddw ym0, ym2
+ paddd m1, m3
+ psllw ym2, 2
+ pslld m3, 2
+ paddw ym0, ym2 ; a5 565
+ paddd m1, m3 ; b5 565
+ mova [t4+r10*1+416* 6], ym0
+ mova [t3+r10*2+416*12], m1
+ mova ym0, [t4+r10*1+416*2+0]
+ paddw ym0, [t4+r10*1+416*2+4]
+ paddw ym2, ym0, [t4+r10*1+416*2+2]
+ mova m1, [t3+r10*2+416*4+0]
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m3, m1, [t3+r10*2+416*4+4]
+ psllw ym2, 2 ; a3[-1] 444
+ pslld m3, 2 ; b3[-1] 444
+ psubw ym2, ym0 ; a3[-1] 343
+ psubd m3, m1 ; b3[-1] 343
+ mova [t4+r10*1+416* 8], ym2
+ mova [t3+r10*2+416*16], m3
+ mova ym0, [t4+r10*1+416*4+0]
+ paddw ym0, [t4+r10*1+416*4+4]
+ paddw ym2, ym0, [t4+r10*1+416*4+2]
+ mova m1, [t3+r10*2+416*8+0]
+ paddd m1, [t3+r10*2+416*8+8]
+ paddd m3, m1, [t3+r10*2+416*8+4]
+ psllw ym2, 2 ; a3[ 0] 444
+ pslld m3, 2 ; b3[ 0] 444
+ mova [t4+r10*1+416*10], ym2
+ mova [t3+r10*2+416*20], m3
+ psubw ym2, ym0 ; a3[ 0] 343
+ psubd m3, m1 ; b3[ 0] 343
+ mova [t4+r10*1+416*12], ym2
+ mova [t3+r10*2+416*24], m3
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu ym2, [t4+r10*1+2]
+ paddw ym0, ym2, [t4+r10*1+0]
+ paddw ym0, [t4+r10*1+4]
+ paddw ym2, ym0
+ psllw ym0, 2
+ paddw ym0, ym2 ; a5
+ movu m1, [t3+r10*2+4]
+ paddd m4, m1, [t3+r10*2+0]
+ paddd m4, [t3+r10*2+8]
+ paddd m1, m4
+ pslld m4, 2
+ paddd m4, m1 ; b5
+ paddw ym2, ym0, [t4+r10*1+416* 6]
+ mova [t4+r10*1+416* 6], ym0
+ paddd m0, m4, [t3+r10*2+416*12]
+ mova [t3+r10*2+416*12], m4
+ mova ym3, [t4+r10*1+416*2+0]
+ paddw ym3, [t4+r10*1+416*2+4]
+ paddw ym5, ym3, [t4+r10*1+416*2+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416* 8]
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416* 8], ym4
+ mova [t4+r10*1+416*10], ym5
+ mova m1, [t3+r10*2+416*4+0]
+ paddd m1, [t3+r10*2+416*4+8]
+ paddd m5, m1, [t3+r10*2+416*4+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m1 ; b3[ 1] 343
+ paddd m1, m4, [t3+r10*2+416*16]
+ paddd m1, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*16], m4
+ mova [t3+r10*2+416*20], m5
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, ym2 ; a5
+ pmovzxwd m3, ym3 ; a3
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ vpblendmb m0{k2}, m1, m0
+ vpdpwssd m4, m0, m7
+ psrad m4, 7
+ pmaxsd m4, m6
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova ym3, [t4+r10*1+416*4+0]
+ paddw ym3, [t4+r10*1+416*4+4]
+ paddw ym5, ym3, [t4+r10*1+416*4+2]
+ psllw ym5, 2 ; a3[ 1] 444
+ psubw ym4, ym5, ym3 ; a3[ 1] 343
+ paddw ym3, ym4, [t4+r10*1+416*12]
+ paddw ym3, [t4+r10*1+416*10]
+ mova [t4+r10*1+416*10], ym5
+ mova [t4+r10*1+416*12], ym4
+ mova m0, [t3+r10*2+416*8+0]
+ paddd m0, [t3+r10*2+416*8+8]
+ paddd m5, m0, [t3+r10*2+416*8+4]
+ pslld m5, 2 ; b3[ 1] 444
+ psubd m4, m5, m0 ; b3[ 1] 343
+ paddd m0, m4, [t3+r10*2+416*24]
+ paddd m0, [t3+r10*2+416*20]
+ mova [t3+r10*2+416*20], m5
+ mova [t3+r10*2+416*24], m4
+ pmovzxwd m4, [dstq+r10]
+ pmovzxwd m2, [t4+r10*1+416* 6]
+ pmovzxwd m3, ym3
+ mova m1, [t3+r10*2+416*12]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ vpshldd m4, m22, 13
+ psubd m1, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m0, m3 ; b3 - a3 * src + (1 << 8)
+ pslld m0, 7
+ vpalignr m0{k2}, m1, m1, 1
+ vpdpwssd m4, m0, m7
+ psrad m4, 7
+ pmaxsd m4, m6
+ vpmovusdw ym16, m4 ; clip
+ psrlw ym16, 6
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration16_sse.asm b/third_party/dav1d/src/x86/looprestoration16_sse.asm
new file mode 100644
index 0000000000..872e502982
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration16_sse.asm
@@ -0,0 +1,3723 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
+wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
+wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
+wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
+wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+pb_m14_m13: times 8 db -14,-13
+pb_m10_m9: times 8 db -10, -9
+pb_m6_m5: times 8 db -6, -5
+pb_m2_m1: times 8 db -2, -1
+pb_2_3: times 8 db 2, 3
+pb_6_7: times 8 db 6, 7
+pw_256: times 8 dw 256
+pw_1023: times 8 dw 1023
+pd_8: times 4 dd 8
+pd_4096: times 4 dd 4096
+pd_34816: times 4 dd 34816
+pd_m262128: times 4 dd -262128
+pd_0xffff: times 4 dd 0xffff
+pd_0xf00800a4: times 4 dd 0xf00800a4
+pd_0xf00801c7: times 4 dd 0xf00801c7
+pd_0xfffffff0: times 4 dd 0xfffffff0
+
+wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
+wiener_round: dd 1049600, 1048832
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%macro movif64 2 ; dst, src
+ %if ARCH_X86_64
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movif32 2 ; dst, src
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 6
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 13*16
+ %else
+ %assign extra_stack 12*16
+ %endif
+cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \
+ dst, stride, left, lpf, w, flt
+ %if STACK_ALIGNMENT < 16
+ %define lpfm dword [esp+calloff+16*12+ 0]
+ %define wm dword [esp+calloff+16*12+ 4]
+ %define hd dword [esp+calloff+16*12+ 8]
+ %define edgeb byte [esp+calloff+16*12+12]
+ %define edged dword [esp+calloff+16*12+12]
+ %else
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %endif
+ %define PICmem dword [esp+calloff+4*0]
+ %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers
+ %define t1m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define t5m dword [esp+calloff+4*6]
+ %define t6m dword [esp+calloff+4*7]
+ %define t2 t2m
+ %define t3 t3m
+ %define t4 t4m
+ %define t5 t5m
+ %define t6 t6m
+ %define m8 [esp+calloff+16*2]
+ %define m9 [esp+calloff+16*3]
+ %define m10 [esp+calloff+16*4]
+ %define m11 [esp+calloff+16*5]
+ %define m12 [esp+calloff+16*6]
+ %define m13 [esp+calloff+16*7]
+ %define m14 [esp+calloff+16*8]
+ %define m15 [esp+calloff+16*9]
+ %define r10 r4
+ %define base t0-wiener_shifts
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov wd, [rstk+stack_offset+20]
+ mov wm, wd
+ mov r5, [rstk+stack_offset+24]
+ mov hd, r5
+ mov r5, [rstk+stack_offset+32]
+ mov edged, r5 ; edge
+ %endif
+%else
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers
+cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ %define base
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ movq m13, [fltq]
+ movq m15, [fltq+16]
+%else
+ %if STACK_ALIGNMENT < 16
+ mov t0, [rstk+stack_offset+28]
+ mov t1, [rstk+stack_offset+36] ; pixel_max
+ movq m1, [t0] ; fx
+ movq m3, [t0+16] ; fy
+ LEA t0, wiener_shifts
+ %else
+ mov fltq, r6m
+ movq m1, [fltq]
+ movq m3, [fltq+16]
+ LEA t0, wiener_shifts
+ mov t1, r8m ; pixel_max
+ %endif
+ mov PICmem, t0
+%endif
+ mova m6, [base+wiener_shufA]
+ mova m7, [base+wiener_shufB]
+%if ARCH_X86_64
+ lea t4, [wiener_shifts]
+ add wd, wd
+ pshufd m12, m13, q0000 ; x0 x1
+ pshufd m13, m13, q1111 ; x2 x3
+ pshufd m14, m15, q0000 ; y0 y1
+ pshufd m15, m15, q1111 ; y2 y3
+ mova m8, [wiener_shufC]
+ mova m9, [wiener_shufD]
+ add lpfq, wq
+ lea t1, [rsp+wq+16]
+ add dstq, wq
+ neg wq
+ shr t3d, 11
+ %define base t4-wiener_shifts
+ movd m10, [base+wiener_round+t3*4]
+ movq m11, [base+wiener_shifts+t3*8]
+ pshufd m10, m10, q0000
+ pshufd m0, m11, q0000
+ pshufd m11, m11, q1111
+ pmullw m12, m0 ; upshift filter coefs to make the
+ pmullw m13, m0 ; horizontal downshift constant
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+ %define base
+ %define wiener_lshuf7_mem [wiener_lshuf7]
+ %define pd_m262128_mem [pd_m262128]
+%else
+ add wd, wd
+ mova m4, [base+wiener_shufC]
+ mova m5, [base+wiener_shufD]
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ pshufd m2, m3, q0000
+ pshufd m3, m3, q1111
+ mova m8, m4
+ mova m9, m5
+ mova m14, m2
+ mova m15, m3
+ shr t1, 11
+ add lpfq, wq
+ mova m3, [base+pd_m262128]
+ movd m4, [base+wiener_round+t1*4]
+ movq m5, [base+wiener_shifts+t1*8]
+ lea t1, [esp+extra_stack+wq+16]
+ add dstq, wq
+ neg wq
+ pshufd m4, m4, q0000
+ pshufd m2, m5, q0000
+ pshufd m5, m5, q1111
+ mov wm, wq
+ pmullw m0, m2
+ pmullw m1, m2
+ mova m2, [base+wiener_lshuf7]
+ %define pd_m262128_mem [esp+calloff+16*10]
+ mova pd_m262128_mem, m3
+ mova m10, m4
+ mova m11, m5
+ mova m12, m0
+ mova m13, m1
+ %define wiener_lshuf7_mem [esp+calloff+16*11]
+ mova wiener_lshuf7_mem, m2
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov lpfm, r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, lpfm
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+ movif32 wq, wm
+.v2:
+ call .v
+ movif32 wq, wm
+ jmp .v1
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movif32 t0, PICmem
+ pxor m0, m0
+ movd m1, wd
+ mova m2, [base+pb_0to15]
+ pshufb m1, m0
+ mova m0, [base+pb_6_7]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m3, m0
+ mova m0, [base+pb_m2_m1]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m4, m0
+ mova m0, [base+pb_m10_m9]
+ psubb m0, m1
+ pminub m0, m2
+ pshufb m5, m0
+ movif32 t0, t0m
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movq m3, [leftq]
+ movhps m3, [lpfq+wq]
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m3, [lpfq+wq] ; avoid accessing memory located
+ pshufb m3, wiener_lshuf7_mem ; before the start of the buffer
+ jmp .h_main
+.h_top:
+ movif64 wq, r4
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+wq-8]
+.h_main:
+ mova m4, [lpfq+wq+0]
+ movu m5, [lpfq+wq+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18)
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+wq], m0
+ add wq, 16
+ jl .h_loop
+ movif32 wq, wm
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movq m3, [leftq]
+ movhps m3, [lpfq+wq]
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m3, [lpfq+wq]
+ pshufb m3, wiener_lshuf7_mem
+ jmp .hv_main
+.hv_bottom:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+wq-8]
+.hv_main:
+ mova m4, [lpfq+wq+0]
+ movu m5, [lpfq+wq+8]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -20
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t1, t4m
+ movif32 t0, t2m
+ pshufb m0, m3, m6
+ pshufb m1, m4, m7
+ paddw m0, m1
+ pshufb m3, m8
+ pmaddwd m0, m12
+ pshufb m1, m4, m9
+ paddw m3, m1
+ pshufb m1, m4, m6
+ pmaddwd m3, m13
+ pshufb m2, m5, m7
+ paddw m1, m2
+ mova m2, pd_m262128_mem
+ pshufb m4, m8
+ pmaddwd m1, m12
+ pshufb m5, m9
+ paddw m4, m5
+ pmaddwd m4, m13
+ paddd m0, m2
+ paddd m1, m2
+%if ARCH_X86_64
+ mova m2, [t4+wq]
+ paddw m2, [t2+wq]
+ mova m5, [t3+wq]
+%else
+ mova m2, [t1+wq]
+ paddw m2, [t0+wq]
+ mov t1, t3m
+ mov t0, t5m
+ mova m5, [t1+wq]
+ mov t1, t1m
+%endif
+ paddd m0, m3
+ paddd m1, m4
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+%if ARCH_X86_64
+ mova m4, [t5+wq]
+ paddw m4, [t1+wq]
+ psraw m0, 1
+ paddw m3, m0, [t6+wq]
+%else
+ mova m4, [t0+wq]
+ paddw m4, [t1+wq]
+ mov t0, t0m
+ mov t1, t6m
+ psraw m0, 1
+ paddw m3, m0, [t1+wq]
+%endif
+ mova [t0+wq], m0
+ punpcklwd m0, m2, m5
+ pmaddwd m0, m15
+ punpckhwd m2, m5
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m4
+ pmaddwd m1, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m2, m10
+ paddd m0, m1
+ paddd m2, m3
+ psrad m0, 6
+ psrad m2, 6
+ packssdw m0, m2
+ pmulhw m0, m11
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .hv_loop
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov r4, t5m
+ mov t1, t4m
+ mov t6m, r4
+ mov t5m, t1
+ mov r4, t3m
+ mov t1, t2m
+ mov t4m, r4
+ mov t3m, t1
+ mov r4, t1m
+ mov t1, t0
+ mov t2m, r4
+ mov t0, t6m
+ mov wq, wm
+%endif
+ add dstq, strideq
+ ret
+.v:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+wq]
+ paddw m1, [t2+wq]
+ mova m2, [t3+wq]
+ mova m4, [t1+wq]
+ paddw m3, m4, [t6+wq]
+ paddw m4, [t5+wq]
+%else
+ mov t0, t4m
+ mov t1, t2m
+ mova m1, [t0+wq]
+ paddw m1, [t1+wq]
+ mov t0, t3m
+ mov t1, t1m
+ mova m2, [t0+wq]
+ mova m4, [t1+wq]
+ mov t0, t6m
+ mov t1, t5m
+ paddw m3, m4, [t0+wq]
+ paddw m4, [t1+wq]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m15
+ punpckhwd m1, m2
+ pmaddwd m1, m15
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m14
+ punpckhwd m3, m4
+ pmaddwd m3, m14
+ paddd m0, m10
+ paddd m1, m10
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+ pmulhw m0, m11
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .v_loop
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+%else
+ mov t0, t5m
+ mov t1, t4m
+ mov r4, t3m
+ mov t6m, t0
+ mov t5m, t1
+ mov t4m, r4
+ mov r4, t2m
+ mov t1, t1m
+ mov t0, t0m
+ mov t3m, r4
+ mov t2m, t1
+%endif
+ add dstq, strideq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign stack_size 12*16+384*8
+ %else
+ %assign stack_size 11*16+384*8
+ %endif
+cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \
+ lpf, w, flt
+ %if STACK_ALIGNMENT < 16
+ %define lpfm dword [esp+calloff+4*6]
+ %define wm dword [esp+calloff+4*7]
+ %define hd dword [esp+calloff+16*10+0]
+ %define edgeb byte [esp+calloff+16*10+4]
+ %define edged dword [esp+calloff+16*10+4]
+ %else
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %endif
+ %define PICmem dword [esp+calloff+4*0]
+ %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers
+ %define t1m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define t2 t2m
+ %define t3 t3m
+ %define t4 t4m
+ %define m8 [esp+calloff+16*2]
+ %define m9 [esp+calloff+16*3]
+ %define m10 [esp+calloff+16*4]
+ %define m11 [esp+calloff+16*5]
+ %define m12 [esp+calloff+16*6]
+ %define m13 [esp+calloff+16*7]
+ %define m14 [esp+calloff+16*8]
+ %define m15 [esp+calloff+16*9]
+ %define base t0-wiener_shifts
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov wd, [rstk+stack_offset+20]
+ mov wm, wd
+ mov r5, [rstk+stack_offset+24]
+ mov hd, r5
+ mov r5, [rstk+stack_offset+32]
+ mov edged, r5 ; edge
+ %endif
+%else
+cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ %define base
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov t3d, r8m ; pixel_max
+ movq m12, [fltq]
+ movq m14, [fltq+16]
+%else
+ %if STACK_ALIGNMENT < 16
+ mov t0, [rstk+stack_offset+28]
+ mov t1, [rstk+stack_offset+36] ; pixel_max
+ movq m1, [t0] ; fx
+ movq m3, [t0+16] ; fy
+ LEA t0, wiener_shifts
+ %else
+ mov fltq, r6m
+ movq m1, [fltq]
+ movq m3, [fltq+16]
+ LEA t0, wiener_shifts
+ mov t1, r8m ; pixel_max
+ %endif
+ mov PICmem, t0
+%endif
+ mova m5, [base+wiener_shufE]
+ mova m6, [base+wiener_shufB]
+ mova m7, [base+wiener_shufD]
+%if ARCH_X86_64
+ lea t4, [wiener_shifts]
+ add wd, wd
+ punpcklwd m11, m12, m12
+ pshufd m11, m11, q1111 ; x1
+ pshufd m12, m12, q1111 ; x2 x3
+ punpcklwd m13, m14, m14
+ pshufd m13, m13, q1111 ; y1
+ pshufd m14, m14, q1111 ; y2 y3
+ shr t3d, 11
+ mova m8, [pd_m262128] ; (1 << 4) - (1 << 18)
+ add lpfq, wq
+ lea t1, [rsp+wq+16]
+ add dstq, wq
+ neg wq
+ %define base t4-wiener_shifts
+ movd m9, [base+wiener_round+t3*4]
+ movq m10, [base+wiener_shifts+t3*8]
+ pshufd m9, m9, q0000
+ pshufd m0, m10, q0000
+ pshufd m10, m10, q1111
+ mova m15, [wiener_lshuf5]
+ pmullw m11, m0
+ pmullw m12, m0
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+ %define base
+%else
+ add wd, wd
+ punpcklwd m0, m1, m1
+ pshufd m0, m0, q1111 ; x1
+ pshufd m1, m1, q1111 ; x2 x3
+ punpcklwd m2, m3, m3
+ pshufd m2, m2, q1111 ; y1
+ pshufd m3, m3, q1111 ; y2 y3
+ mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18)
+ mova m13, m2
+ mova m14, m3
+ mova m8, m4
+ shr t1, 11
+ add lpfq, wq
+ movd m2, [base+wiener_round+t1*4]
+ movq m3, [base+wiener_shifts+t1*8]
+ %if STACK_ALIGNMENT < 16
+ lea t1, [esp+16*11+wq+16]
+ %else
+ lea t1, [esp+16*10+wq+16]
+ %endif
+ add dstq, wq
+ neg wq
+ pshufd m2, m2, q0000
+ pshufd m4, m3, q0000
+ pshufd m3, m3, q1111
+ mov wm, wq
+ pmullw m0, m4
+ pmullw m1, m4
+ mova m4, [base+wiener_lshuf5]
+ mova m9, m2
+ mova m10, m3
+ mova m11, m0
+ mova m12, m1
+ mova m15, m4
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov lpfm, r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, lpfm
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+%if ARCH_X86_64
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+%else
+ mov t0, t3m
+ mov r4, t2m
+ mov t1, t1m
+ mov t4m, t0
+ mov t3m, r4
+ mov t2m, t1
+ mov wq, wm
+%endif
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movif32 t0, PICmem
+ pxor m1, m1
+ movd m2, wd
+ mova m0, [base+pb_2_3]
+ pshufb m2, m1
+ mova m1, [base+pb_m6_m5]
+ psubb m0, m2
+ psubb m1, m2
+ mova m2, [base+pb_0to15]
+ pminub m0, m2
+ pminub m1, m2
+ pshufb m3, m0
+ pshufb m4, m1
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ mova m4, [lpfq+wq]
+ movd m3, [leftq+4]
+ pslldq m4, 4
+ por m3, m4
+ add leftq, 8
+ jmp .h_main
+.h_extend_left:
+ mova m3, [lpfq+wq] ; avoid accessing memory located
+ pshufb m3, m15 ; before the start of the buffer
+ jmp .h_main
+.h_top:
+ movif64 wq, r4
+ movif32 wq, wm
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m3, [lpfq+wq-4]
+.h_main:
+ movu m4, [lpfq+wq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+ paddd m1, m3
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ psraw m0, 1
+ mova [t1+wq], m0
+ add wq, 16
+ jl .h_loop
+ movif32 wq, wm
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ mova m4, [lpfq+wq]
+ movd m3, [leftq+4]
+ pslldq m4, 4
+ por m3, m4
+ add leftq, 8
+ jmp .hv_main
+.hv_extend_left:
+ mova m3, [lpfq+wq]
+ pshufb m3, m15
+ jmp .hv_main
+.hv_bottom:
+ movif64 wq, r4
+ movif32 t0m, t0
+ movif32 t1m, t1
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m3, [lpfq+wq-4]
+.hv_main:
+ movu m4, [lpfq+wq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t1, t1m
+ movif32 t0, t3m
+ pshufb m0, m3, m5
+ pmaddwd m0, m11
+ pshufb m1, m4, m5
+ pmaddwd m1, m11
+ pshufb m2, m3, m6
+ pshufb m3, m7
+ paddw m2, m3
+ pshufb m3, m4, m6
+ pmaddwd m2, m12
+ pshufb m4, m7
+ paddw m3, m4
+ pmaddwd m3, m12
+ paddd m0, m8
+ paddd m1, m8
+ paddd m0, m2
+%if ARCH_X86_64
+ mova m2, [t3+wq]
+ paddw m2, [t1+wq]
+ paddd m1, m3
+ mova m4, [t2+wq]
+%else
+ mova m2, [t0+wq]
+ mov t0, t2m
+ paddw m2, [t1+wq]
+ mov t1, t4m
+ paddd m1, m3
+ mova m4, [t0+wq]
+ mov t0, t0m
+%endif
+ punpckhwd m3, m2, m4
+ pmaddwd m3, m14
+ punpcklwd m2, m4
+%if ARCH_X86_64
+ mova m4, [t4+wq]
+%else
+ mova m4, [t1+wq]
+%endif
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ pmaddwd m2, m14
+ psraw m0, 1
+ mova [t0+wq], m0
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 6
+ psrad m0, 6
+ packssdw m0, m1
+ pmulhw m0, m10
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .hv_loop
+%if ARCH_X86_64
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+%else
+ mov r4, t3m
+ mov t1, t2m
+ mov t4m, r4
+ mov t3m, t1
+ mov r4, t1m
+ mov t1, t0
+ mov t2m, r4
+ mov t0, t4m
+ mov wq, wm
+%endif
+ add dstq, strideq
+ ret
+.v:
+ movif64 wq, r4
+ movif32 t1m, t1
+.v_loop:
+%if ARCH_X86_64
+ mova m0, [t1+wq]
+ paddw m2, m0, [t3+wq]
+ mova m1, [t2+wq]
+ mova m4, [t4+wq]
+%else
+ mov t0, t3m
+ mova m0, [t1+wq]
+ mov t1, t2m
+ paddw m2, m0, [t0+wq]
+ mov t0, t4m
+ mova m1, [t1+wq]
+ mova m4, [t0+wq]
+%endif
+ punpckhwd m3, m2, m1
+ pmaddwd m3, m14
+ punpcklwd m2, m1
+ pmaddwd m2, m14
+ punpckhwd m1, m0, m4
+ pmaddwd m1, m13
+ punpcklwd m0, m4
+ pmaddwd m0, m13
+ paddd m3, m9
+ paddd m2, m9
+ paddd m1, m3
+ paddd m0, m2
+ psrad m1, 6
+ psrad m0, 6
+ packssdw m0, m1
+ pmulhw m0, m10
+ pxor m1, m1
+ pmaxsw m0, m1
+ mova [dstq+wq], m0
+ add wq, 16
+%if ARCH_X86_64
+ jl .v_loop
+%else
+ jge .v_end
+ mov t1, t1m
+ jmp .v_loop
+.v_end:
+%endif
+ ret
+
+%macro GATHERDD 3 ; dst, src, tmp
+ movd %3d, %2
+ %if ARCH_X86_64
+ movd %1, [r13+%3]
+ pextrw %3d, %2, 2
+ pinsrw %1, [r13+%3+2], 3
+ pextrw %3d, %2, 4
+ pinsrw %1, [r13+%3+2], 5
+ pextrw %3d, %2, 6
+ pinsrw %1, [r13+%3+2], 7
+ %else
+ movd %1, [base+sgr_x_by_x-0xf03+%3]
+ pextrw %3, %2, 2
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
+ pextrw %3, %2, 4
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
+ pextrw %3, %2, 6
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
+ %endif
+%endmacro
+
+%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
+ %if ARCH_X86_64
+ %define tmp r14
+ %else
+ %define tmp %4
+ %endif
+ GATHERDD %1, %2, tmp
+ GATHERDD %2, %3, tmp
+ movif32 %4, %5
+ psrld %1, 24
+ psrld %2, 24
+ packssdw %1, %2
+%endmacro
+
+%macro MAXSD 3-4 0 ; dst, src, restore_tmp
+ pcmpgtd %3, %1, %2
+ pand %1, %3
+ pandn %3, %2
+ por %1, %3
+ %if %4 == 1
+ pxor %3, %3
+ %endif
+%endmacro
+
+%macro MULLD 3 ; dst, src, tmp
+ pmulhuw %3, %1, %2
+ pmullw %1, %2
+ pslld %3, 16
+ paddd %1, %3
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 0, 1, 2, 3, 5
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 5*16
+ %else
+ %assign extra_stack 3*16
+ %endif
+cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*0+4*6]
+ %define stridemp dword [esp+calloff+16*0+4*7]
+ %define leftm dword [esp+calloff+16*3+4*0]
+ %define lpfm dword [esp+calloff+16*3+4*1]
+ %define w0m dword [esp+calloff+16*3+4*2]
+ %define hd dword [esp+calloff+16*3+4*3]
+ %define edgeb byte [esp+calloff+16*3+4*4]
+ %define edged dword [esp+calloff+16*3+4*4]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t0m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define m8 [base+pd_8]
+ %define m9 [base+pd_0xfffffff0]
+ %define m10 [esp+calloff+16*2]
+ %define m11 [base+pd_0xf00800a4]
+ %define m12 [base+sgr_lshuf5]
+ %define m13 [base+pd_34816]
+ %define m14 [base+pw_1023]
+ %define r10 r4
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ movu m10, [paramsq]
+ mova m12, [sgr_lshuf5]
+ add lpfq, wq
+ mova m8, [pd_8]
+ lea t1, [rsp+wq+20]
+ mova m9, [pd_0xfffffff0]
+ add dstq, wq
+ lea t3, [rsp+wq*2+400*12+16]
+ mova m11, [pd_0xf00800a4]
+ lea t4, [rsp+wq+400*20+16]
+ pshufhw m7, m10, q0000
+ pshufb m10, [pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ neg wq
+ mova m13, [pd_34816] ; (1 << 11) + (1 << 15)
+ pxor m6, m6
+ mova m14, [pw_1023]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ movu m1, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*20+16]
+ mov t3m, t3
+ pshufhw m7, m1, q0000
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ psllw m7, 4
+ neg wq
+ mova m10, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 4
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ movif32 t2m, t1
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t0m, t2
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, stridemp
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ call .h
+ add lpfq, stridemp
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+ sub hd, 2
+ movif32 t0, t0m
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .h_top
+ add lpfq, stridemp
+ call .hv_bottom
+.end:
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ movif32 dstq, dstm
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ lea t2, [t1+400*6]
+ movif32 t2m, t2
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ movif32 t0m, t0
+ jmp .main
+.no_top_height1:
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+ movd m0, wd
+ movd m1, [lpfq-2]
+ mova m2, [base+pw_256]
+ mova m3, [base+pb_m14_m13]
+ pshufb m0, m6
+ pshufb m1, m2
+ psubb m2, m0
+ psubb m3, m0
+ mova m0, [base+pb_0to15]
+ pcmpgtb m2, m0
+ pcmpgtb m3, m0
+ pand m4, m2
+ pand m5, m3
+ pandn m2, m1
+ pandn m3, m1
+ por m4, m2
+ por m5, m3
+ ret
+%assign stack_offset stack_offset+4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq- 2]
+.h_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+wq+400*0]
+ paddd m1, [t1+wq+400*2]
+ paddd m2, [t1+wq+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq+400*0], m0
+ mova [t1+wq+400*2], m1
+ mova [t1+wq+400*4], m2
+ add wq, 16
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq+400*0]
+ mova m1, [t1+wq+400*2]
+ mova m2, [t1+wq+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m1
+ mova [t2+wq+400*4], m2
+ add wq, 16
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -20
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq+400*0]
+ paddd m4, m2, [t1+wq+400*2]
+ paddd m5, m3, [t1+wq+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq+400*0] ; hv sum
+ paddd m4, [t2+wq+400*2] ; hv sumsq
+ paddd m5, [t2+wq+400*4]
+ mova [t0+wq+400*0], m0
+ mova [t0+wq+400*2], m2
+ mova [t0+wq+400*4], m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ pand m4, m9 ; ((a + 8) >> 4) << 4
+ pand m5, m9
+ psrld m2, m4, 4
+ psrld m0, m5, 4
+ paddd m2, m4
+ psrld m4, 1
+ paddd m0, m5
+ psrld m5, 1
+ paddd m4, m2 ; a * 25
+ paddd m5, m0
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m6
+ MAXSD m5, m3, m6, 1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m10, m2 ; p * s
+ MULLD m5, m10, m2
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+wq+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq+400*0], m1
+ paddw m1, m0
+ mova [t1+wq+400*2], m4
+ paddd m4, m2
+ mova [t1+wq+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq+400*0]
+ mova m2, [t1+wq+400*2]
+ mova m3, [t1+wq+400*4]
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m4, m2, [t2+wq+400*2]
+ paddd m5, m3, [t2+wq+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ psrlw m3, m1, 1
+ paddd m4, m8
+ pavgw m3, m6 ; (b + 2) >> 2
+ paddd m5, m8
+ pand m4, m9 ; ((a + 8) >> 4) << 4
+ pand m5, m9
+ psrld m2, m4, 4
+ psrld m0, m5, 4
+ paddd m2, m4
+ psrld m4, 1
+ paddd m0, m5
+ psrld m5, 1
+ paddd m4, m2 ; a * 25
+ paddd m5, m0
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ pmaddwd m2, m2 ; b * b
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m6
+ MAXSD m5, m3, m6, 1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m10, m2 ; p * s
+ MULLD m5, m10, m2
+ pmaddwd m0, m11 ; b * 164
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ mova [t4+wq+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+ 2]
+ movu m3, [t4+wq*1+ 4]
+ movu m1, [t3+wq*2+ 4]
+ movu m4, [t3+wq*2+ 8]
+ movu m2, [t3+wq*2+20]
+ movu m5, [t3+wq*2+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*1+ 0]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+wq*1+400*2+ 0], m0
+ mova [t3+wq*2+400*4+ 0], m1
+ mova [t3+wq*2+400*4+16], m2
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*1+ 2]
+ movu m3, [t4+wq*1+ 4]
+ movu m1, [t3+wq*2+ 4]
+ movu m4, [t3+wq*2+ 8]
+ movu m2, [t3+wq*2+20]
+ movu m5, [t3+wq*2+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*1+ 0]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+wq*1+400*2+ 0]
+ paddd m4, m1, [t3+wq*2+400*4+ 0]
+ paddd m5, m2, [t3+wq*2+400*4+16]
+ mova [t4+wq*1+400*2+ 0], m0
+ mova [t3+wq*2+400*4+ 0], m1
+ mova [t3+wq*2+400*4+16], m2
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ mova m0, [dstq+wq]
+ mova m3, [t4+wq*1+400*2+ 0]
+ mova m4, [t3+wq*2+400*4+ 0]
+ mova m5, [t3+wq*2+400*4+16]
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 7)
+ psubd m5, m3
+ psrad m4, 8
+ psrad m5, 8
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m14
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 4*16
+ %else
+ %assign extra_stack 2*16
+ %endif
+cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*2+4*0]
+ %define stridemp dword [esp+calloff+16*2+4*1]
+ %define leftm dword [esp+calloff+16*2+4*2]
+ %define lpfm dword [esp+calloff+16*2+4*3]
+ %define w0m dword [esp+calloff+16*2+4*4]
+ %define hd dword [esp+calloff+16*2+4*5]
+ %define edgeb byte [esp+calloff+16*2+4*6]
+ %define edged dword [esp+calloff+16*2+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %define m8 [base+pd_8]
+ %define m9 [esp+calloff+16*1]
+ %define m10 [base+pd_0xf00801c7]
+ %define m11 [base+pd_34816]
+ %define m12 [base+sgr_lshuf3]
+ %define m13 [base+pw_1023]
+ %define m14 m6
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ movq m9, [paramsq+4]
+ add lpfq, wq
+ lea t1, [rsp+wq+12]
+ mova m8, [pd_8]
+ add dstq, wq
+ lea t3, [rsp+wq*2+400*12+8]
+ mova m10, [pd_0xf00801c7]
+ lea t4, [rsp+wq+400*32+8]
+ mova m11, [pd_34816]
+ pshuflw m7, m9, q3333
+ pshufb m9, [pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ neg wq
+ pxor m6, m6
+ mova m13, [pw_1023]
+ psllw m7, 4
+ mova m12, [sgr_lshuf3]
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ movq m1, [r1+4]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*32+16]
+ mov t3m, t3
+ pshuflw m7, m1, q3333
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 4
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+wq+400*0]
+ mova m1, [t1+wq+400*2]
+ mova m2, [t1+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m1
+ mova [t2+wq+400*4], m2
+ add wq, 16
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+ movd m1, wd
+ movd m5, [lpfq-2]
+ mova m2, [base+pw_256]
+ mova m3, [base+pb_0to15]
+ pshufb m1, m6
+ pshufb m5, m2
+ psubb m2, m1
+ pcmpgtb m2, m3
+ pand m4, m2
+ pandn m2, m5
+ por m4, m2
+ ret
+%assign stack_offset stack_offset+4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq+ 0]
+.h_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+wq+400*0], m1
+ mova [t1+wq+400*2], m2
+ mova [t1+wq+400*4], m3
+ add wq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m4, [lpfq+wq+ 0]
+.hv0_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -18
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+wq+400*0]
+ paddd m4, m2, [t1+wq+400*2]
+ paddd m5, m3, [t1+wq+400*4]
+ mova [t1+wq+400*0], m1
+ mova [t1+wq+400*2], m2
+ mova [t1+wq+400*4], m3
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+ 8], m0
+ mova [t3+wq*2+24], m1
+ add wq, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 12
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, m12
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m4, [lpfq+wq+ 0]
+.hv1_main:
+ movu m5, [lpfq+wq+16]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -18
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m4, m2, [t2+wq+400*2]
+ paddd m5, m3, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m2
+ mova [t2+wq+400*4], m3
+ paddd m4, m8
+ paddd m5, m8
+ psrld m4, 4 ; (a + 8) >> 4
+ psrld m5, 4
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*2 +4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*0+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m1
+ add wq, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m0, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ paddw m1, m0, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m5, [t2+wq+400*4]
+ mova [t2+wq+400*0], m0
+ mova [t2+wq+400*2], m4
+ mova [t2+wq+400*4], m5
+ paddd m2, m8
+ paddd m3, m8
+ psrld m2, 4 ; (a + 8) >> 4
+ psrld m3, 4
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m6 ; (b + 2) >> 2
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m2
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ MAXSD m4, m2, m14
+ MAXSD m5, m3, m14
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m14 ; p * s
+ MULLD m5, m9, m14
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m14
+ MULLD m1, m5, m14
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*1+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+400*0+ 4]
+ movu m1, [t3+wq*2+400*0+ 8]
+ movu m2, [t3+wq*2+400*0+24]
+ movu m3, [t4+wq*1+400*0+ 2]
+ movu m4, [t3+wq*2+400*0+ 4]
+ movu m5, [t3+wq*2+400*0+20]
+ paddw m0, [t4+wq*1+400*0+ 0]
+ paddd m1, [t3+wq*2+400*0+ 0]
+ paddd m2, [t3+wq*2+400*0+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[-1] 444
+ pslld m4, 2 ; b[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a[-1] 343
+ psubd m4, m1 ; b[-1] 343
+ psubd m5, m2
+ mova [t4+wq*1+400*4], m3
+ mova [t3+wq*2+400*8+ 0], m4
+ mova [t3+wq*2+400*8+16], m5
+ movu m0, [t4+wq*1+400*2+ 4]
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m2, [t3+wq*2+400*4+24]
+ movu m3, [t4+wq*1+400*2+ 2]
+ movu m4, [t3+wq*2+400*4+ 4]
+ movu m5, [t3+wq*2+400*4+20]
+ paddw m0, [t4+wq*1+400*2+ 0]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m2, [t3+wq*2+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[ 0] 444
+ pslld m4, 2 ; b[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*1+400* 6], m3
+ mova [t3+wq*2+400*12+ 0], m4
+ mova [t3+wq*2+400*12+16], m5
+ psubw m3, m0 ; a[ 0] 343
+ psubd m4, m1 ; b[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*1+400* 8], m3
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m5
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m3, [t4+wq*1+400*0+4]
+ movu m1, [t4+wq*1+400*0+2]
+ paddw m3, [t4+wq*1+400*0+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*1+400*4]
+ paddw m3, [t4+wq*1+400*6]
+ mova [t4+wq*1+400*4], m2
+ mova [t4+wq*1+400*6], m1
+ movu m4, [t3+wq*2+400*0+8]
+ movu m1, [t3+wq*2+400*0+4]
+ paddd m4, [t3+wq*2+400*0+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*2+400* 8+ 0]
+ paddd m4, [t3+wq*2+400*12+ 0]
+ mova [t3+wq*2+400* 8+ 0], m2
+ mova [t3+wq*2+400*12+ 0], m1
+ movu m5, [t3+wq*2+400*0+24]
+ movu m1, [t3+wq*2+400*0+20]
+ paddd m5, [t3+wq*2+400*0+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*2+400* 8+16]
+ paddd m5, [t3+wq*2+400*12+16]
+ mova [t3+wq*2+400* 8+16], m2
+ mova [t3+wq*2+400*12+16], m1
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*1+400*2+4]
+ movu m1, [t4+wq*1+400*2+2]
+ paddw m3, [t4+wq*1+400*2+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*1+400*6]
+ paddw m3, [t4+wq*1+400*8]
+ mova [t4+wq*1+400*6], m1
+ mova [t4+wq*1+400*8], m2
+ movu m4, [t3+wq*2+400*4+8]
+ movu m1, [t3+wq*2+400*4+4]
+ paddd m4, [t3+wq*2+400*4+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*2+400*12+ 0]
+ paddd m4, [t3+wq*2+400*16+ 0]
+ mova [t3+wq*2+400*12+ 0], m1
+ mova [t3+wq*2+400*16+ 0], m2
+ movu m5, [t3+wq*2+400*4+24]
+ movu m1, [t3+wq*2+400*4+20]
+ paddd m5, [t3+wq*2+400*4+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*2+400*12+16]
+ paddd m5, [t3+wq*2+400*16+16]
+ mova [t3+wq*2+400*12+16], m1
+ mova [t3+wq*2+400*16+16], m2
+ mova m0, [dstq+wq]
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ pmaxsw m0, m6
+ pminsw m0, m13
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 10*16
+ %else
+ %assign extra_stack 8*16
+ %endif
+cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*8+4*0]
+ %define stridemp dword [esp+calloff+16*8+4*1]
+ %define leftm dword [esp+calloff+16*8+4*2]
+ %define lpfm dword [esp+calloff+16*8+4*3]
+ %define w0m dword [esp+calloff+16*8+4*4]
+ %define hd dword [esp+calloff+16*8+4*5]
+ %define edgeb byte [esp+calloff+16*8+4*6]
+ %define edged dword [esp+calloff+16*8+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %xdefine m8 m6
+ %define m9 [base+pd_8]
+ %define m10 [base+pd_34816]
+ %define m11 [base+pd_0xf00801c7]
+ %define m12 [base+pd_0xf00800a4]
+ %define m13 [esp+calloff+16*4]
+ %define m14 [esp+calloff+16*5]
+ %define m15 [esp+calloff+16*6]
+ %define m6 [esp+calloff+16*7]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ movifnidn wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ add wd, wd
+ mov edged, r7m
+ mova m14, [paramsq]
+ add lpfq, wq
+ mova m9, [pd_8]
+ lea t1, [rsp+wq+44]
+ mova m10, [pd_34816]
+ add dstq, wq
+ mova m11, [pd_0xf00801c7]
+ lea t3, [rsp+wq*2+400*24+40]
+ mova m12, [pd_0xf00800a4]
+ lea t4, [rsp+wq+400*52+40]
+ neg wq
+ pshufd m15, m14, q2222 ; w0 w1
+ punpcklwd m14, m14
+ pshufd m13, m14, q0000 ; s0
+ pshufd m14, m14, q2222 ; s1
+ pxor m6, m6
+ psllw m15, 2
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ add wd, wd
+ mova m2, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq+52]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*2+400*24+48]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq+400*52+48]
+ mov t3m, t3
+ mov t4m, t4
+ neg wq
+ pshuflw m0, m2, q0000
+ pshuflw m1, m2, q2222
+ pshufhw m2, m2, q1010
+ punpcklqdq m0, m0 ; s0
+ punpcklqdq m1, m1 ; s1
+ punpckhqdq m2, m2 ; w0 w1
+ mov w1m, wd
+ pxor m3, m3
+ psllw m2, 2
+ mova m13, m0
+ mova m14, m1
+ sub wd, 4
+ mova m15, m2
+ mova m6, m3
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+%if ARCH_X86_64
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup
+%else
+ mov wq, w0m
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop
+%endif
+ add t1, 400*12
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hd, hd
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+wq+400* 0]
+ mova m1, [t1+wq+400* 2]
+ mova m2, [t1+wq+400* 4]
+ paddw m0, m0
+ mova m3, [t1+wq+400* 6]
+ paddd m1, m1
+ mova m4, [t1+wq+400* 8]
+ paddd m2, m2
+ mova m5, [t1+wq+400*10]
+ mova [t2+wq+400* 0], m0
+ mova [t2+wq+400* 2], m1
+ mova [t2+wq+400* 4], m2
+ mova [t2+wq+400* 6], m3
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ add wq, 16
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.h: ; horizontal boxsum
+%assign stack_offset stack_offset+4
+%assign calloff 4
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m4, [lpfq+wq- 2]
+.h_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -20
+ jl .h_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.h_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m7, m0, m6
+ pmaddwd m7, m7
+ punpckhwd m0, m6
+ pmaddwd m0, m0
+ paddd m2, m7 ; sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ mova [t1+wq+400* 6], m1
+ mova [t1+wq+400* 8], m2
+ mova [t1+wq+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m7, m2 ; sumsq5
+ paddd m5, m3
+ mova [t1+wq+400* 0], m8
+ mova [t1+wq+400* 2], m7
+ mova [t1+wq+400* 4], m5
+ add wq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv0_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -20
+ jl .hv0_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.hv0_have_right:
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ movif32 t3, t3m
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m7, m0, m6
+ pmaddwd m7, m7
+ punpckhwd m0, m6
+ pmaddwd m0, m0
+ paddd m2, m7 ; h sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m7, m2 ; h sumsq5
+ paddd m5, m3
+ mova [t3+wq*2+400*8+ 8], m8
+ mova [t3+wq*2+400*0+ 8], m7
+ mova [t3+wq*2+400*0+24], m5
+ paddw m8, [t1+wq+400* 0]
+ paddd m7, [t1+wq+400* 2]
+ paddd m5, [t1+wq+400* 4]
+ mova [t1+wq+400* 0], m8
+ mova [t1+wq+400* 2], m7
+ mova [t1+wq+400* 4], m5
+ paddw m0, m1, [t1+wq+400* 6]
+ paddd m4, m2, [t1+wq+400* 8]
+ paddd m5, m3, [t1+wq+400*10]
+ mova [t1+wq+400* 6], m1
+ mova [t1+wq+400* 8], m2
+ mova [t1+wq+400*10], m3
+ paddw m1, m0, [t2+wq+400* 6]
+ paddd m2, m4, [t2+wq+400* 8]
+ paddd m3, m5, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m0
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m5, [leftq]
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ add leftmp, 8
+ palignr m4, m5, 10
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m4, [lpfq+wq+4]
+ pshufb m4, [base+sgr_lshuf5]
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m4, [lpfq+wq- 2]
+.hv1_main:
+ movu m5, [lpfq+wq+14]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -20
+ jl .hv1_have_right
+%if ARCH_X86_32
+ pxor m8, m8
+%endif
+ call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right
+.hv1_have_right:
+ palignr m7, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m7, m3
+ punpcklwd m0, m7, m3
+ pmaddwd m0, m0
+ punpckhwd m7, m3
+ pmaddwd m7, m7
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m6
+ pmaddwd m1, m1
+ punpckhwd m3, m6
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ palignr m5, m4, 8
+ punpckhwd m1, m4, m5
+ paddw m8, m4, m5
+ pmaddwd m1, m1
+ punpcklwd m4, m5
+ pmaddwd m4, m4
+ paddd m7, m3
+ paddw m5, m2, [t2+wq+400* 6]
+ mova [t2+wq+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+wq+400* 8]
+ paddd m3, m7, [t2+wq+400*10]
+ mova [t2+wq+400* 8], m0
+ mova [t2+wq+400*10], m7
+ paddd m4, m0 ; h sumsq5
+ paddd m1, m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+ pslld m0, m2, 3
+ pslld m7, m3, 3
+ paddd m2, m0 ; ((a3 + 8) >> 4) * 9
+ paddd m3, m7
+ psrlw m7, m5, 1
+ pavgw m7, m6 ; (b3 + 2) >> 2
+ punpcklwd m0, m7, m6
+ pmaddwd m0, m0
+ punpckhwd m7, m6
+ pmaddwd m7, m7
+%if ARCH_X86_32
+ mova [esp+20], m8
+%else
+ SWAP m8, m6
+%endif
+ MAXSD m2, m0, m8
+ MAXSD m3, m7, m8
+ pxor m8, m8
+ psubd m2, m0 ; p3
+ psubd m3, m7
+ punpcklwd m0, m5, m8 ; b3
+ punpckhwd m5, m8
+ MULLD m2, m14, m8 ; p3 * s1
+ MULLD m3, m14, m8
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m5, m11
+ paddusw m2, m11
+ paddusw m3, m11
+ psrld m2, 20 ; min(z3, 255)
+ movif32 t3, t3m
+ psrld m3, 20
+ GATHER_X_BY_X m8, m2, m3, r0, dstm
+ punpcklwd m2, m8, m8
+ punpckhwd m3, m8, m8
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ psrld m0, 12
+ psrld m5, 12
+ mova [t4+wq*1+400*4+4], m8
+ mova [t3+wq*2+400*8+ 8], m0
+ mova [t3+wq*2+400*8+24], m5
+%if ARCH_X86_32
+ mova m8, [esp+20]
+%else
+ SWAP m6, m8
+ pxor m6, m6
+%endif
+ paddw m5, m8, [t2+wq+400*0]
+ paddd m2, m4, [t2+wq+400*2]
+ paddd m3, m1, [t2+wq+400*4]
+ paddw m5, [t1+wq+400*0]
+ paddd m2, [t1+wq+400*2]
+ paddd m3, [t1+wq+400*4]
+ mova [t2+wq+400*0], m8
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ mova [t2+wq+400*2], m4
+ pslld m8, m2, 4
+ mova [t2+wq+400*4], m1
+ pslld m4, m3, 4
+ paddd m8, m2
+ pslld m2, 3
+ paddd m4, m3
+ pslld m3, 3
+ paddd m2, m8 ; ((a5 + 8) >> 4) * 25
+ paddd m3, m4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ psrlw m1, m5, 1
+ pavgw m1, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m1, m7
+ pmaddwd m4, m4
+ punpckhwd m1, m7
+ pmaddwd m1, m1
+ punpcklwd m0, m5, m7 ; b5
+ punpckhwd m5, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m2, m4, m7
+ psubd m2, m4 ; p5
+ MAXSD m3, m1, m7
+ psubd m3, m1
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m5, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m1, m2, m3, r0, dstm
+ punpcklwd m2, m1, m1
+ punpckhwd m3, m1, m1
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ mova [t4+wq*1+400*0+ 4], m1
+ psrld m0, 12
+ psrld m5, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m5
+ add wq, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq+400* 6]
+ mova m4, [t1+wq+400* 8]
+ mova m5, [t1+wq+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq+400* 6]
+ paddd m2, m4, [t2+wq+400* 8]
+ paddd m3, m5, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m0
+ mova [t2+wq+400* 8], m4
+ mova [t2+wq+400*10], m5
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+wq+400*0]
+ mova m4, [t1+wq+400*2]
+ mova m5, [t1+wq+400*4]
+ mova [t3+wq*2+400*8+ 8], m3
+ mova [t3+wq*2+400*0+ 8], m4
+ mova [t3+wq*2+400*0+24], m5
+ paddw m3, m3 ; cc5
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+wq+400*0], m3
+ mova [t1+wq+400*2], m4
+ mova [t1+wq+400*4], m5
+ mova [t3+wq*2+400*4+ 8], m0
+ mova [t3+wq*2+400*4+24], m1
+ add wq, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-4]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m4, [t1+wq+400* 6]
+ mova m5, [t1+wq+400* 8]
+ mova m7, [t1+wq+400*10]
+ paddw m1, m4, [t2+wq+400* 6]
+ paddd m2, m5, [t2+wq+400* 8]
+ paddd m3, m7, [t2+wq+400*10]
+ mova [t2+wq+400* 6], m4
+ mova [t2+wq+400* 8], m5
+ mova [t2+wq+400*10], m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a3 + 8) >> 4
+ psrld m3, 4
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; ((a3 + 8) >> 4) * 9
+ paddd m5, m3
+ psrlw m3, m1, 1
+ pavgw m3, m7 ; (b3 + 2) >> 2
+ punpcklwd m2, m3, m7
+ pmaddwd m2, m2
+ punpckhwd m3, m7
+ pmaddwd m3, m3
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m4, m2, m7
+ MAXSD m5, m3, m7
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*4+4], m3
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova m4, [t3+wq*2+400*8+ 8]
+ mova m5, [t3+wq*2+400*0+ 8]
+ mova m7, [t3+wq*2+400*0+24]
+ paddw m1, m4, [t2+wq+400*0]
+ paddd m2, m5, [t2+wq+400*2]
+ paddd m3, m7, [t2+wq+400*4]
+ paddw m1, [t1+wq+400*0]
+ paddd m2, [t1+wq+400*2]
+ paddd m3, [t1+wq+400*4]
+ mova [t2+wq+400*0], m4
+ mova [t2+wq+400*2], m5
+ mova [t2+wq+400*4], m7
+ paddd m2, m9
+ paddd m3, m9
+ psrld m2, 4 ; (a5 + 8) >> 4
+ psrld m3, 4
+ mova [t3+wq*2+400*8+ 8], m0
+ pslld m4, m2, 4
+ mova [t3+wq*2+400*8+24], m8
+ pslld m5, m3, 4
+ paddd m4, m2
+ pslld m2, 3
+ paddd m5, m3
+ pslld m3, 3
+ paddd m2, m4
+ paddd m3, m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ psrlw m5, m1, 1
+ pavgw m5, m7 ; (b5 + 2) >> 2
+ punpcklwd m4, m5, m7
+ pmaddwd m4, m4
+ punpckhwd m5, m7
+ pmaddwd m5, m5
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MAXSD m2, m4, m7
+ psubd m2, m4 ; p5
+ MAXSD m3, m5, m7
+ psubd m3, m5
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m4, m2, m3, r0, dstm
+ punpcklwd m2, m4, m4
+ punpckhwd m3, m4, m4
+ MULLD m0, m2, m7
+ MULLD m1, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*1+400*0+ 4], m4
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*2+400*0+ 8], m0
+ mova [t3+wq*2+400*0+24], m1
+ add wq, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*1+400*0+ 2]
+ movu m1, [t3+wq*2+400*0+ 4]
+ movu m2, [t3+wq*2+400*0+20]
+ movu m7, [t4+wq*1+400*0+ 4]
+ movu m8, [t3+wq*2+400*0+ 8]
+ paddw m3, m0, [t4+wq*1+400*0+ 0]
+ paddd m4, m1, [t3+wq*2+400*0+ 0]
+ paddd m5, m2, [t3+wq*2+400*0+16]
+ paddw m3, m7
+ paddd m4, m8
+ movu m7, [t3+wq*2+400*0+24]
+ paddw m0, m3
+ paddd m1, m4
+ psllw m3, 2
+ pslld m4, 2
+ paddd m5, m7
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a5 565
+ paddd m1, m4 ; b5 565
+ paddd m2, m5
+ mova [t4+wq*1+400* 6+ 0], m0
+ mova [t3+wq*2+400*12+ 0], m1
+ mova [t3+wq*2+400*12+16], m2
+ movu m0, [t4+wq*1+400*2+ 4]
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m2, [t3+wq*2+400*4+24]
+ movu m3, [t4+wq*1+400*2+ 2]
+ movu m4, [t3+wq*2+400*4+ 4]
+ movu m5, [t3+wq*2+400*4+20]
+ paddw m0, [t4+wq*1+400*2+ 0]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m2, [t3+wq*2+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[-1] 444
+ pslld m4, 2 ; b3[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a3[-1] 343
+ psubd m4, m1 ; b3[-1] 343
+ psubd m5, m2
+ mova [t4+wq*1+400* 8+ 0], m3
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m5
+ movu m0, [t4+wq*1+400*4+ 4]
+ movu m1, [t3+wq*2+400*8+ 8]
+ movu m2, [t3+wq*2+400*8+24]
+ movu m3, [t4+wq*1+400*4+ 2]
+ movu m4, [t3+wq*2+400*8+ 4]
+ movu m5, [t3+wq*2+400*8+20]
+ paddw m0, [t4+wq*1+400*4+ 0]
+ paddd m1, [t3+wq*2+400*8+ 0]
+ paddd m2, [t3+wq*2+400*8+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[ 0] 444
+ pslld m4, 2 ; b3[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*1+400*10+ 0], m3
+ mova [t3+wq*2+400*20+ 0], m4
+ mova [t3+wq*2+400*20+16], m5
+ psubw m3, m0 ; a3[ 0] 343
+ psubd m4, m1 ; b3[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*1+400*12+ 0], m3
+ mova [t3+wq*2+400*24+ 0], m4
+ mova [t3+wq*2+400*24+16], m5
+ add wq, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*1+ 4]
+ movu m2, [t4+wq*1+ 2]
+ paddw m0, [t4+wq*1+ 0]
+ paddw m0, m2
+ paddw m2, m0
+ psllw m0, 2
+ paddw m0, m2 ; a5
+ movu m4, [t3+wq*2+ 8]
+ movu m5, [t3+wq*2+24]
+ movu m1, [t3+wq*2+ 4]
+ movu m3, [t3+wq*2+20]
+ paddd m4, [t3+wq*2+ 0]
+ paddd m5, [t3+wq*2+16]
+ paddd m4, m1
+ paddd m5, m3
+ paddd m1, m4
+ paddd m3, m5
+ pslld m4, 2
+ pslld m5, 2
+ paddd m4, m1 ; b5
+ paddd m5, m3
+ movu m2, [t4+wq*1+400* 6]
+ paddw m2, m0
+ mova [t4+wq*1+400* 6], m0
+ paddd m0, m4, [t3+wq*2+400*12+ 0]
+ paddd m1, m5, [t3+wq*2+400*12+16]
+ mova [t3+wq*2+400*12+ 0], m4
+ mova [t3+wq*2+400*12+16], m5
+ mova [rsp+16+ARCH_X86_32*4], m1
+ movu m3, [t4+wq*1+400*2+4]
+ movu m5, [t4+wq*1+400*2+2]
+ paddw m3, [t4+wq*1+400*2+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ movu m3, [t4+wq*1+400* 8]
+ paddw m3, [t4+wq*1+400*10]
+ paddw m3, m4
+ mova [t4+wq*1+400* 8], m4
+ mova [t4+wq*1+400*10], m5
+ movu m1, [t3+wq*2+400*4+ 8]
+ movu m5, [t3+wq*2+400*4+ 4]
+ movu m7, [t3+wq*2+400*4+24]
+ movu m8, [t3+wq*2+400*4+20]
+ paddd m1, [t3+wq*2+400*4+ 0]
+ paddd m7, [t3+wq*2+400*4+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+%if ARCH_X86_32
+ mova [esp+52], m8
+ psubd m8, m7
+%else
+ psubd m6, m8, m7
+ SWAP m8, m6
+%endif
+ paddd m1, m4, [t3+wq*2+400*16+ 0]
+ paddd m7, m8, [t3+wq*2+400*16+16]
+ paddd m1, [t3+wq*2+400*20+ 0]
+ paddd m7, [t3+wq*2+400*20+16]
+ mova [t3+wq*2+400*16+ 0], m4
+ mova [t3+wq*2+400*16+16], m8
+ mova [t3+wq*2+400*20+ 0], m5
+%if ARCH_X86_32
+ mova m8, [esp+52]
+%else
+ SWAP m8, m6
+ pxor m6, m6
+%endif
+ mova [t3+wq*2+400*20+16], m8
+ mova [rsp+32+ARCH_X86_32*4], m7
+ movu m5, [dstq+wq]
+ punpcklwd m4, m5, m6
+ punpcklwd m7, m2, m6
+ pmaddwd m7, m4 ; a5 * src
+ punpcklwd m8, m3, m6
+ pmaddwd m8, m4 ; a3 * src
+ punpckhwd m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ pslld m4, 13
+ pslld m5, 13
+ psubd m0, m7 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m8 ; b3 - a3 * src + (1 << 8)
+ mova m7, [base+pd_0xffff]
+ psrld m0, 9
+ pslld m1, 7
+ pand m0, m7
+ pandn m8, m7, m1
+ por m0, m8
+ mova m1, [rsp+16+ARCH_X86_32*4]
+ mova m8, [rsp+32+ARCH_X86_32*4]
+ psubd m1, m2
+ psubd m8, m3
+ mova m2, [base+pd_4096]
+ psrld m1, 9
+ pslld m8, 7
+ pand m1, m7
+ pandn m7, m8
+ por m1, m7
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ paddd m4, m2
+ paddd m5, m2
+ paddd m0, m4
+ paddd m1, m5
+ psrad m0, 8
+ psrad m1, 8
+ packssdw m0, m1 ; clip
+ pmaxsw m0, m7
+ psrlw m0, 5
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+%if ARCH_X86_64
+ SWAP m6, m7
+%endif
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*1+400*4+4]
+ movu m5, [t4+wq*1+400*4+2]
+ paddw m3, [t4+wq*1+400*4+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ paddw m3, m4, [t4+wq*1+400*12]
+ paddw m3, [t4+wq*1+400*10]
+ mova [t4+wq*1+400*10], m5
+ mova [t4+wq*1+400*12], m4
+ movu m1, [t3+wq*2+400*8+ 8]
+ movu m5, [t3+wq*2+400*8+ 4]
+ movu m7, [t3+wq*2+400*8+24]
+ movu m8, [t3+wq*2+400*8+20]
+ paddd m1, [t3+wq*2+400*8+ 0]
+ paddd m7, [t3+wq*2+400*8+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+ psubd m0, m8, m7
+ paddd m1, m4, [t3+wq*2+400*24+ 0]
+ paddd m7, m0, [t3+wq*2+400*24+16]
+ paddd m1, [t3+wq*2+400*20+ 0]
+ paddd m7, [t3+wq*2+400*20+16]
+ mova [t3+wq*2+400*20+ 0], m5
+ mova [t3+wq*2+400*20+16], m8
+ mova [t3+wq*2+400*24+ 0], m4
+ mova [t3+wq*2+400*24+16], m0
+ mova m5, [dstq+wq]
+ mova m2, [t4+wq*1+400* 6]
+ punpcklwd m4, m5, m6
+ punpcklwd m8, m2, m6
+ pmaddwd m8, m4 ; a5 * src
+ punpcklwd m0, m3, m6
+ pmaddwd m0, m4 ; a3 * src
+ punpckhwd m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ psubd m1, m0 ; b3 - a3 * src + (1 << 8)
+ pslld m4, 13
+ pslld m5, 13
+ mova m0, [t3+wq*2+400*12+ 0]
+ psubd m0, m8 ; b5 - a5 * src + (1 << 8)
+ mova m8, [t3+wq*2+400*12+16]
+ psubd m8, m2
+ psubd m7, m3
+ mova m2, [base+pd_0xffff]
+ pslld m1, 7
+ psrld m0, 8
+ psrld m8, 8
+ pslld m7, 7
+ pand m0, m2
+ pandn m3, m2, m1
+ por m0, m3
+ pand m8, m2
+ pandn m2, m7
+ por m2, m8
+ mova m1, [base+pd_4096]
+ pmaddwd m0, m15
+ pmaddwd m2, m15
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ pxor m7, m7
+ paddd m4, m1
+ paddd m5, m1
+ paddd m0, m4
+ paddd m2, m5
+ psrad m0, 8
+ psrad m2, 8
+ packssdw m0, m2 ; clip
+ pmaxsw m0, m7
+ psrlw m0, 5
+ mova [dstq+wq], m0
+ add wq, 16
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
diff --git a/third_party/dav1d/src/x86/looprestoration_avx2.asm b/third_party/dav1d/src/x86/looprestoration_avx2.asm
new file mode 100644
index 0000000000..a73cb21882
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_avx2.asm
@@ -0,0 +1,2237 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+sgr_r_ext: times 16 db 1
+ times 16 db 9
+
+; dword version of dav1d_sgr_x_by_x[] for use with gathers; it wastes a bit of
+; cache but eliminates some shifts in the inner sgr loop, which is overall a win
+const sgr_x_by_x_avx2
+ dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
+ dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8
+ dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5
+ dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
+ dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3
+ dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+ dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
+
+ times 4 db -1 ; needed for 16-bit sgr
+pb_m5: times 4 db -5
+pb_3: times 4 db 3
+pw_5_6: dw 5, 6
+
+sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
+ db 9, -1, 10, -1, 11, -1, 12, -1
+
+pw_256: times 2 dw 256
+pw_2056: times 2 dw 2056
+pw_m16380: times 2 dw -16380
+pd_25: dd 25
+pd_34816: dd 34816
+pd_m4096: dd -4096
+pd_0xf00801c7: dd 0xf00801c7
+pd_0xf00800a4: dd 0xf00800a4
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
+
+INIT_YMM avx2
+cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov wd, wm
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastb m11, [fltq+ 0] ; x0 x0
+ vbroadcasti128 m7, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m8, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m9, [sgr_shuf+6]
+ add lpfq, wq
+ vpbroadcastd m10, [pw_m16380]
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq*2+16]
+ psllw m14, 5
+ neg wq
+ psllw m15, 5
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_3]
+ vpbroadcastd m1, [pb_m5]
+ vpbroadcastb m2, xm2
+ movu m3, [pb_0to31]
+ psubb m0, m2
+ psubb m1, m2
+ pminub m0, m3
+ pminub m1, m3
+ pshufb m4, m0
+ pshufb m5, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, [wiener_l_shuf]
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ vpbroadcastd m2, [pw_2056]
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m2
+ paddw m1, m2
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, [wiener_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t4+r10*2]
+ paddw m2, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ paddsw m0, m4
+ vpbroadcastd m4, [pw_2056]
+ paddsw m1, m5
+ mova m5, [t5+r10*2]
+ paddw m5, [t1+r10*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m4, m0, [t6+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t4+r10*2+32]
+ paddw m2, [t2+r10*2+32]
+ mova m3, [t3+r10*2+32]
+ mova m5, [t5+r10*2+32]
+ paddw m5, [t1+r10*2+32]
+ packuswb m0, m4
+ paddw m4, m1, [t6+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10*2+ 0]
+ paddw m2, [t2+r10*2+ 0]
+ mova m4, [t3+r10*2+ 0]
+ mova m6, [t1+r10*2+ 0]
+ paddw m8, m6, [t6+r10*2+ 0]
+ paddw m6, [t5+r10*2+ 0]
+ mova m3, [t4+r10*2+32]
+ paddw m3, [t2+r10*2+32]
+ mova m5, [t3+r10*2+32]
+ mova m7, [t1+r10*2+32]
+ paddw m9, m7, [t6+r10*2+32]
+ paddw m7, [t5+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m4, m8, m6
+ pmaddwd m4, m14
+ punpckhwd m6, m8, m6
+ pmaddwd m6, m14
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m9, m7
+ pmaddwd m5, m14
+ punpckhwd m7, m9, m7
+ pmaddwd m7, m14
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ packuswb m0, m2
+ packuswb m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
+cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ movifnidn hd, hm
+ mov edged, r7m
+ mov wd, wm
+ vbroadcasti128 m6, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m7, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m8, [sgr_shuf+6]
+ add lpfq, wq
+ vpbroadcastd m9, [pw_m16380]
+ vpbroadcastd m10, [pw_2056]
+ mova m11, [wiener_l_shuf]
+ vpbroadcastd m14, [fltq+16] ; __ y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ lea t1, [rsp+wq*2+16]
+ psllw m14, 5
+ neg wq
+ psllw m15, 5
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, m11
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, m11
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t3+r10*2]
+ paddw m2, [t1+r10*2]
+ mova m3, [t2+r10*2]
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ paddw m4, m0, [t4+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+r10*2+32]
+ paddw m2, [t1+r10*2+32]
+ mova m3, [t2+r10*2+32]
+ packuswb m0, m4
+ paddw m4, m1, [t4+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, strideq
+ ret
+.v:
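+; vertical-only pass: produce an output row from the rows already buffered
+; in t1-t4, without reading a new input row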
+ mov r10, wq
+ psrld m13, m14, 16 ; y1 __
+.v_loop:
+ mova m6, [t1+r10*2+ 0]
+ paddw m2, m6, [t3+r10*2+ 0]
+ mova m4, [t2+r10*2+ 0]
+ mova m7, [t1+r10*2+32]
+ paddw m3, m7, [t3+r10*2+32]
+ mova m5, [t2+r10*2+32]
+ paddw m6, [t4+r10*2+ 0]
+ paddw m7, [t4+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m7, m6
+ pmaddwd m4, m5, m14
+ punpckhwd m7, m6
+ pmaddwd m6, m7, m14
+ pmaddwd m5, m13
+ pmaddwd m7, m13
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ packuswb m0, m2
+ packuswb m1, m3
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
+
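+; Self-guided restoration, 5x5 box filter. Roughly, as the inline comments
+; below spell out: p = sumsq*25 - sum^2, z = p*s0, x = sgr_x_by_x[min(z, 255)],
+; b = x*sum*164 (rounded); x and b are packed into one dword (a | (b << 12))
+; and the neighbor passes (.prep_n/.n0/.n1) apply the 565 weighting.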
+cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r12-sgr_x_by_x_avx2-256*4
+ lea r12, [sgr_x_by_x_avx2+256*4]
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m8, [base+sgr_shuf+0]
+ vbroadcasti128 m9, [base+sgr_shuf+8]
+ add lpfq, wq
+ vbroadcasti128 m10, [base+sgr_shuf+2]
+ add dstq, wq
+ vbroadcasti128 m11, [base+sgr_shuf+6]
+ lea t3, [rsp+wq*4+16+400*12]
+ vpbroadcastd m12, [paramsq+0] ; s0
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+8] ; w0
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m13, [base+pd_0xf00800a4]
+ neg wq
+ vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15)
+ psllw m7, 4
+ vpbroadcastd m15, [base+pd_m4096]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
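+; two input rows per iteration: .h for the first row of the pair, .hv for
+; the second, then .n0/.n1 emit the two corresponding output rows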
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
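+; presumably clamps the shuffle indices so that lanes past the end of the
+; row replicate the last valid pixel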
+ movd xm2, r10d
+ mova m0, [sgr_r_ext]
+ vpbroadcastb m2, xm2
+ psubb m0, m2
+ pminub m0, [pb_0to31]
+ pshufb m5, m0
+ ret
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m3, m5, m8
+ pmullw m4, m3, m3
+ pshufb m2, m5, m9
+ paddw m0, m3, m2
+ shufps m3, m2, q2121
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ punpcklwd m3, m4, m6
+ paddd m1, m3
+ punpckhwd m4, m6
+ paddd m2, m4
+ pshufb m4, m5, m10
+ paddw m0, m4
+ pshufb m5, m11
+ paddw m0, m5 ; sum
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ punpckhwd m4, m5
+ pmaddwd m4, m4
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10*2+400*0]
+ paddd m1, [t1+r10*2+400*2]
+ paddd m2, [t1+r10*2+400*4]
+.h_loop_end:
+ paddd m1, m3 ; sumsq
+ paddd m2, m4
+ mova [t1+r10*2+400*0], m0
+ mova [t1+r10*2+400*2], m1
+ mova [t1+r10*2+400*4], m2
+ add r10, 16
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-2]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10*2+400*0]
+ mova m1, [t1+r10*2+400*2]
+ mova m2, [t1+r10*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10*2+400*0], m0
+ mova [t2+r10*2+400*2], m1
+ mova [t2+r10*2+400*4], m2
+ add r10, 16
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu xm5, [lpfq+r10-2]
+.hv_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m1, m5, m8
+ pmullw m4, m1, m1
+ pshufb m3, m5, m9
+ paddw m0, m1, m3
+ shufps m1, m3, q2121
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ punpcklwd m1, m4, m6
+ paddd m2, m1
+ punpckhwd m4, m6
+ paddd m3, m4
+ pshufb m1, m5, m10
+ paddw m0, m1
+ pshufb m5, m11
+ paddw m0, m5 ; h sum
+ punpcklwd m4, m5, m1
+ pmaddwd m4, m4
+ punpckhwd m5, m1
+ pmaddwd m5, m5
+ paddw m1, m0, [t1+r10*2+400*0]
+ paddd m2, m4 ; h sumsq
+ paddd m3, m5
+ paddd m4, m2, [t1+r10*2+400*2]
+ paddd m5, m3, [t1+r10*2+400*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+r10*2+400*0] ; hv sum
+ paddd m4, [t2+r10*2+400*2] ; hv sumsq
+ paddd m5, [t2+r10*2+400*4]
+ mova [t0+r10*2+400*0], m0
+ mova [t0+r10*2+400*2], m2
+ mova [t0+r10*2+400*4], m3
+ vpbroadcastd m2, [pd_25]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m4, m2 ; a * 25
+ pmulld m5, m2
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m12 ; p * s
+ pmulld m5, m12
+ pmaddwd m0, m13 ; b * 164
+ pmaddwd m1, m13
+ paddusw m4, m13
+ paddusw m5, m13
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ pand m0, m15
+ pand m1, m15
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0 ; The neighbor calculations require
+ vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b.
+ mova [t3+r10*4+24], xm1 ; Packing them allows for only 12+20,
+ vextracti128 [t3+r10*4+56], m1, 1 ; but that gets us most of the way.
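+; pd_m4096 (0xfffff000) masks off the low 12 bits so the rounded b term
+; occupies bits 12-31, and the gathered x value is OR'd into the low 12 bits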
+ add r10, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10*2+400*0], m1
+ paddw m1, m0
+ mova [t1+r10*2+400*2], m4
+ paddd m4, m2
+ mova [t1+r10*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m0, [t1+r10*2+400*0]
+ mova m2, [t1+r10*2+400*2]
+ mova m3, [t1+r10*2+400*4]
+ paddw m1, m0, [t2+r10*2+400*0]
+ paddd m4, m2, [t2+r10*2+400*2]
+ paddd m5, m3, [t2+r10*2+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ paddd m5, m3
+ vpbroadcastd m2, [pd_25]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmulld m4, m2 ; a * 25
+ pmulld m5, m2
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m12 ; p * s
+ pmulld m5, m12
+ pmaddwd m0, m13 ; b * 164
+ pmaddwd m1, m13
+ paddusw m4, m13
+ paddusw m5, m13
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ pand m0, m15
+ pand m1, m15
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+36]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+32]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+40]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0 ; ab 565
+ paddd m3, m1
+ pandn m0, m15, m2 ; a
+ psrld m2, 12 ; b
+ pandn m1, m15, m3
+ psrld m3, 12
+ mova [t3+r10*4+400*4+ 0], m0
+ mova [t3+r10*4+400*8+ 0], m2
+ mova [t3+r10*4+400*4+32], m1
+ mova [t3+r10*4+400*8+32], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+36]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+32]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+40]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0
+ paddd m3, m1
+ pandn m0, m15, m2
+ psrld m2, 12
+ pandn m1, m15, m3
+ psrld m3, 12
+ paddd m4, m0, [t3+r10*4+400*4+ 0] ; a
+ paddd m5, m1, [t3+r10*4+400*4+32]
+ mova [t3+r10*4+400*4+ 0], m0
+ mova [t3+r10*4+400*4+32], m1
+ paddd m0, m2, [t3+r10*4+400*8+ 0] ; b
+ paddd m1, m3, [t3+r10*4+400*8+32]
+ mova [t3+r10*4+400*8+ 0], m2
+ mova [t3+r10*4+400*8+32], m3
+ pmovzxbd m2, [dstq+r10+0]
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2 ; a * src
+ pmaddwd m5, m3
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 8)
+ psubd m1, m5
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ pmovzxbd m2, [dstq+r10+0]
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src
+ pmaddwd m5, m3, [t3+r10*4+400*4+32]
+ mova m0, [t3+r10*4+400*8+ 0] ; b
+ mova m1, [t3+r10*4+400*8+32]
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 7)
+ psubd m1, m5
+ psrad m0, 8
+ psrad m1, 8
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
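+; Self-guided restoration, 3x3 box filter: same scheme as the 5x5 variant
+; above, but with p = sumsq*9 - sum^2, b = x*sum*455, and a per-row
+; 343/222 neighbor weighting instead of the two-row 565 pattern.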
+cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r14-sgr_x_by_x_avx2-256*4
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ lea r14, [sgr_x_by_x_avx2+256*4]
+ vbroadcasti128 m8, [base+sgr_shuf+2]
+ add lpfq, wq
+ vbroadcasti128 m9, [base+sgr_shuf+4]
+ add dstq, wq
+ vbroadcasti128 m10, [base+sgr_shuf+6]
+ lea t3, [rsp+wq*4+16+400*12]
+ vpbroadcastd m11, [paramsq+ 4] ; s1
+ pxor m6, m6
+ vpbroadcastw m7, [paramsq+10] ; w1
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m12, [base+pd_0xf00801c7]
+ neg wq
+ vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15)
+ psllw m7, 4
+ vpbroadcastd m14, [base+pd_m4096]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add t4, strideq
+ mov [rsp], t4 ; below
+ mov t0, t2
+ call .hv
+.main:
+ mov t5, t3
+ add t3, 400*4
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ dec hd
+ jz .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv
+ call .n
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv_bottom
+ call .n
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n
+ RET
+.height1:
+ call .v
+ call .prep_n
+ mov t2, t1
+ call .v
+ jmp .end
+.extend_bottom:
+ call .v
+ call .n
+ mov t2, t1
+ call .v
+ jmp .end
+.no_top:
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea t4, [t4+strideq*2]
+ mov [rsp], t4
+ call .h
+ lea t0, [t1+400*6]
+ mov t2, t1
+ call .v
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m0, m5, m8
+ pmullw m2, m0, m0
+ pshufb m4, m5, m9
+ paddw m0, m4
+ pshufb m5, m10
+ paddw m0, m5 ; sum
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ punpckhwd m4, m5
+ pmaddwd m4, m4
+ punpcklwd m1, m2, m6
+ punpckhwd m2, m6
+ mova [t1+r10*2+400*0], m0
+ paddd m1, m3 ; sumsq
+ paddd m2, m4
+ mova [t1+r10*2+400*2], m1
+ mova [t1+r10*2+400*4], m2
+ add r10, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu xm5, [lpfq+r10-2]
+.hv_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m5, m8
+ pmullw m3, m0, m0
+ pshufb m1, m5, m9
+ paddw m0, m1
+ pshufb m5, m10
+ paddw m0, m5 ; h sum
+ punpcklwd m4, m5, m1
+ pmaddwd m4, m4
+ punpckhwd m5, m1
+ pmaddwd m5, m5
+ paddw m1, m0, [t2+r10*2+400*0]
+ paddw m1, [t1+r10*2+400*0] ; hv sum
+ punpcklwd m2, m3, m6
+ punpckhwd m3, m6
+ paddd m4, m2 ; h sumsq
+ paddd m5, m3
+ paddd m2, m4, [t2+r10*2+400*2]
+ paddd m3, m5, [t2+r10*2+400*4]
+ paddd m2, [t1+r10*2+400*2] ; hv sumsq
+ paddd m3, [t1+r10*2+400*4]
+ mova [t0+r10*2+400*0], m0
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ mova [t0+r10*2+400*2], m4
+ pslld m4, m2, 3
+ mova [t0+r10*2+400*4], m5
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ pmaddwd m2, m0, m0 ; b * b
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m11 ; p * s
+ pmulld m5, m11
+ pmaddwd m0, m12 ; b * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r14+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r14+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ pand m0, m14
+ pand m1, m14
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m1, [t1+r10*2+400*0]
+ paddw m1, m1
+ paddw m1, [t2+r10*2+400*0] ; hv sum
+ mova m2, [t1+r10*2+400*2]
+ mova m3, [t1+r10*2+400*4]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m2, [t2+r10*2+400*2] ; hv sumsq
+ paddd m3, [t2+r10*2+400*4]
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ pmaddwd m2, m0, m0 ; b * b
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ pmulld m4, m11 ; p * s
+ pmulld m5, m11
+ pmaddwd m0, m12 ; b * 455
+ pmaddwd m1, m12
+ paddusw m4, m12
+ paddusw m5, m12
+ psrad m3, m4, 20 ; min(z, 255) - 256
+ vpgatherdd m2, [r14+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r14+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m13
+ pand m0, m14
+ pand m1, m14
+ por m0, m2 ; a | (b << 12)
+ por m1, m3
+ mova [t3+r10*4+ 8], xm0
+ vextracti128 [t3+r10*4+40], m0, 1
+ mova [t3+r10*4+24], xm1
+ vextracti128 [t3+r10*4+56], m1, 1
+ add r10, 16
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+ mov t4, t3
+ add t3, 400*4
+.prep_n_loop:
+ mova m2, [t5+r10*4+0]
+ mova m3, [t4+r10*4+0]
+ paddd m2, [t5+r10*4+8]
+ paddd m3, [t4+r10*4+8]
+ paddd m0, m2, [t5+r10*4+4]
+ paddd m1, m3, [t4+r10*4+4]
+ pslld m0, 2
+ paddd m1, m1 ; ab[ 0] 222
+ psubd m0, m2 ; ab[-1] 343
+ mova [t3+r10*4+400*4], m1
+ paddd m1, m1
+ mova [t5+r10*4], m0
+ psubd m1, m3 ; ab[ 0] 343
+ mova [t4+r10*4], m1
+ add r10, 8
+ jl .prep_n_loop
+ ret
+; a and b are packed together in a single dword, but we can't do the
+; full neighbor calculations before splitting them since we don't
+; have sufficient precision. The solution is to do the calculations
+; in two equal halves and split a and b before doing the final sum.
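+; (pandn with pd_m4096 extracts a from the low 12 bits and psrld by 12
+; extracts b, as done for each half in .n below)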
+ALIGN function_align
+.n: ; neighbor + output
+ mov r10, wq
+.n_loop:
+ mova m4, [t3+r10*4+ 0]
+ paddd m4, [t3+r10*4+ 8]
+ paddd m5, m4, [t3+r10*4+ 4]
+ paddd m5, m5 ; ab[+1] 222
+ mova m2, [t3+r10*4+400*4+ 0]
+ paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
+ mova m3, [t3+r10*4+400*4+32]
+ paddd m1, m3, [t5+r10*4+32]
+ mova [t3+r10*4+400*4+ 0], m5
+ paddd m5, m5
+ psubd m5, m4 ; ab[+1] 343
+ mova [t5+r10*4+ 0], m5
+ paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343
+ mova m4, [t3+r10*4+32]
+ paddd m4, [t3+r10*4+40]
+ paddd m5, m4, [t3+r10*4+36]
+ paddd m5, m5
+ mova [t3+r10*4+400*4+32], m5
+ paddd m5, m5
+ psubd m5, m4
+ mova [t5+r10*4+32], m5
+ pandn m4, m14, m0
+ psrld m0, 12
+ paddd m3, m5
+ pandn m5, m14, m2
+ psrld m2, 12
+ paddd m4, m5 ; a
+ pandn m5, m14, m1
+ psrld m1, 12
+ paddd m0, m2 ; b + (1 << 8)
+ pandn m2, m14, m3
+ psrld m3, 12
+ paddd m5, m2
+ pmovzxbd m2, [dstq+r10+0]
+ paddd m1, m3
+ pmovzxbd m3, [dstq+r10+8]
+ pmaddwd m4, m2 ; a * src
+ pmaddwd m5, m3
+ packssdw m2, m3
+ psubd m0, m4 ; b - a * src + (1 << 8)
+ psubd m1, m5
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m7
+ paddw m0, m2
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ pshufd xm0, xm0, q3120
+ mova [dstq+r10], xm0
+ add r10, 16
+ jl .n_loop
+ mov r10, t5
+ mov t5, t4
+ mov t4, r10
+ add dstq, strideq
+ ret
+
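+; Combined (mix) self-guided filter: the 5x5 and 3x3 box passes are run on
+; the same rows and their outputs are blended with the w0/w1 weights held
+; in m15.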
+cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \
+ w, h, edge, params
+%define base r12-sgr_x_by_x_avx2-256*4
+ lea r12, [sgr_x_by_x_avx2+256*4]
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m9, [base+sgr_shuf+0]
+ vbroadcasti128 m10, [base+sgr_shuf+8]
+ add lpfq, wq
+ vbroadcasti128 m11, [base+sgr_shuf+2]
+ vbroadcasti128 m12, [base+sgr_shuf+6]
+ add dstq, wq
+ vpbroadcastd m15, [paramsq+8] ; w0 w1
+ lea t3, [rsp+wq*4+400*24+8]
+ vpbroadcastd m13, [paramsq+0] ; s0
+ pxor m7, m7
+ vpbroadcastd m14, [paramsq+4] ; s1
+ lea t1, [rsp+wq*2+12]
+ neg wq
+ psllw m15, 2 ; to reuse existing pd_m4096 register for rounding
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
+ add t1, 400*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+400*12]
+ lea r10, [wq-2]
+.top_fixup_loop:
+ mova m0, [t1+r10*2+400* 0]
+ mova m1, [t1+r10*2+400* 2]
+ mova m2, [t1+r10*2+400* 4]
+ paddw m0, m0
+ mova m3, [t1+r10*2+400* 6]
+ paddd m1, m1
+ mova m4, [t1+r10*2+400* 8]
+ paddd m2, m2
+ mova m5, [t1+r10*2+400*10]
+ mova [t2+r10*2+400* 0], m0
+ mova [t2+r10*2+400* 2], m1
+ mova [t2+r10*2+400* 4], m2
+ mova [t2+r10*2+400* 6], m3
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ add r10, 16
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsums
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu xm5, [lpfq+r10-2]
+.h_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -18
+ jl .h_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.h_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+ paddw m8, m6, m4
+ shufps m0, m6, m4, q2121
+ pmullw m3, m0, m0
+ pshufb m2, m5, m11
+ paddw m0, m2
+ pshufb m5, m12
+ paddw m0, m5 ; sum3
+ punpcklwd m1, m2, m5
+ pmaddwd m1, m1
+ punpckhwd m2, m5
+ pmaddwd m2, m2
+ punpcklwd m5, m6, m4
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ punpcklwd m4, m3, m7
+ paddd m1, m4 ; sumsq3
+ punpckhwd m3, m7
+ paddd m2, m3
+ mova [t1+r10*2+400* 6], m0
+ mova [t1+r10*2+400* 8], m1
+ mova [t1+r10*2+400*10], m2
+ paddw m8, m0 ; sum5
+ paddd m5, m1 ; sumsq5
+ paddd m6, m2
+ mova [t1+r10*2+400* 0], m8
+ mova [t1+r10*2+400* 2], m5
+ mova [t1+r10*2+400* 4], m6
+ add r10, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv0_main
+.hv0_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu xm5, [lpfq+r10-2]
+.hv0_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -18
+ jl .hv0_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv0_have_right:
+ pshufb m6, m5, m9
+ pshufb m4, m5, m10
+ paddw m8, m6, m4
+ shufps m1, m6, m4, q2121
+ pmullw m0, m1, m1
+ pshufb m3, m5, m11
+ paddw m1, m3
+ pshufb m5, m12
+ paddw m1, m5 ; sum3
+ punpcklwd m2, m3, m5
+ pmaddwd m2, m2
+ punpckhwd m3, m5
+ pmaddwd m3, m3
+ punpcklwd m5, m6, m4
+ pmaddwd m5, m5
+ punpckhwd m6, m4
+ pmaddwd m6, m6
+ punpcklwd m4, m0, m7
+ paddd m2, m4 ; sumsq3
+ punpckhwd m0, m7
+ paddd m3, m0
+ paddw m8, m1 ; sum5
+ paddd m5, m2 ; sumsq5
+ paddd m6, m3
+ mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
+ mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
+ mova [t3+r10*4+400*0+40], m6
+ paddw m8, [t1+r10*2+400* 0]
+ paddd m5, [t1+r10*2+400* 2]
+ paddd m6, [t1+r10*2+400* 4]
+ mova [t1+r10*2+400* 0], m8
+ mova [t1+r10*2+400* 2], m5
+ mova [t1+r10*2+400* 4], m6
+ paddw m0, m1, [t1+r10*2+400* 6]
+ paddd m4, m2, [t1+r10*2+400* 8]
+ paddd m5, m3, [t1+r10*2+400*10]
+ mova [t1+r10*2+400* 6], m1
+ mova [t1+r10*2+400* 8], m2
+ mova [t1+r10*2+400*10], m3
+ paddw m1, m0, [t2+r10*2+400* 6]
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m0
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ vpbroadcastd m4, [base+pd_34816]
+ pmulld m0, m2
+ vpbroadcastd m5, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m4
+ pand m0, m5
+ pand m1, m5
+ por m0, m2 ; a3 | (b3 << 12)
+ por m1, m3
+ mova [t3+r10*4+400*4+ 8], xm0
+ vextracti128 [t3+r10*4+400*4+40], m0, 1
+ mova [t3+r10*4+400*4+24], xm1
+ vextracti128 [t3+r10*4+400*4+56], m1, 1
+ add r10, 16
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ vpbroadcastd xm0, [leftq]
+ mova xm5, [lpfq+wq]
+ palignr xm5, xm0, 12
+ add leftq, 4
+ jmp .hv1_main
+.hv1_extend_left:
+ mova xm5, [lpfq+wq]
+ pshufb xm5, [base+sgr_l_shuf]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu xm5, [lpfq+r10-2]
+.hv1_main:
+ vinserti128 m5, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -18
+ jl .hv1_have_right
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
+.hv1_have_right:
+ pshufb m6, m5, m9
+ pshufb m3, m5, m10
+ paddw m8, m6, m3
+ shufps m2, m6, m3, q2121
+ pmullw m1, m2, m2
+ pshufb m0, m5, m11
+ paddw m2, m0
+ pshufb m5, m12
+ paddw m2, m5 ; sum3
+ punpcklwd m4, m5, m0
+ pmaddwd m4, m4
+ punpckhwd m5, m0
+ pmaddwd m5, m5
+ punpcklwd m0, m6, m3
+ pmaddwd m0, m0
+ punpckhwd m6, m3
+ pmaddwd m6, m6
+ punpcklwd m3, m1, m7
+ paddd m4, m3 ; sumsq3
+ punpckhwd m1, m7
+ paddd m5, m1
+ paddw m1, m2, [t2+r10*2+400* 6]
+ mova [t2+r10*2+400* 6], m2
+ paddw m8, m2 ; sum5
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ paddd m4, m0 ; sumsq5
+ paddd m5, m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m6, m2, 3
+ pslld m7, m3, 3
+ paddd m6, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m7, m3
+ pmaddwd m3, m1, m1
+ psubd m6, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m7, m3
+ pmulld m6, m14 ; p3 * s1
+ pmulld m7, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m6, m2
+ paddusw m7, m2
+ psrad m3, m6, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m6
+ psrad m6, m7, 20
+ vpgatherdd m3, [r12+m6*4], m7
+ vpbroadcastd m6, [base+pd_34816] ; x3
+ pmulld m0, m2
+ vpbroadcastd m7, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ pand m0, m7
+ pand m7, m1
+ por m0, m2 ; a3 | (b3 << 12)
+ por m7, m3
+ paddw m1, m8, [t2+r10*2+400*0]
+ paddd m2, m4, [t2+r10*2+400*2]
+ paddd m3, m5, [t2+r10*2+400*4]
+ paddw m1, [t1+r10*2+400*0]
+ paddd m2, [t1+r10*2+400*2]
+ paddd m3, [t1+r10*2+400*4]
+ mova [t2+r10*2+400*0], m8
+ mova [t2+r10*2+400*2], m4
+ mova [t2+r10*2+400*4], m5
+ mova [t3+r10*4+400*8+ 8], xm0
+ vextracti128 [t3+r10*4+400*8+40], m0, 1
+ mova [t3+r10*4+400*8+24], xm7
+ vextracti128 [t3+r10*4+400*8+56], m7, 1
+ vpbroadcastd m4, [base+pd_25]
+ pxor m7, m7
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmulld m2, m4 ; a5 * 25
+ pmulld m3, m4
+ pmaddwd m4, m0, m0 ; b5 * b5
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [base+pd_0xf00800a4]
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r12+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r12+m2*4], m3
+ pmulld m0, m4
+ pmulld m1, m5
+ paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ vpbroadcastd m6, [base+pd_m4096]
+ pand m0, m6
+ pand m1, m6
+ por m0, m4 ; a5 | (b5 << 12)
+ por m1, m5
+ mova [t3+r10*4+400*0+ 8], xm0
+ vextracti128 [t3+r10*4+400*0+40], m0, 1
+ mova [t3+r10*4+400*0+24], xm1
+ vextracti128 [t3+r10*4+400*0+56], m1, 1
+ add r10, 16
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-2]
+ vpbroadcastd m6, [base+pd_34816]
+ vpbroadcastd m8, [base+pd_m4096]
+.v0_loop:
+ mova m0, [t1+r10*2+400* 6]
+ mova m4, [t1+r10*2+400* 8]
+ mova m5, [t1+r10*2+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+r10*2+400* 6]
+ paddd m2, m4, [t2+r10*2+400* 8]
+ paddd m3, m5, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m0
+ mova [t2+r10*2+400* 8], m4
+ mova [t2+r10*2+400*10], m5
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ pmulld m0, m2
+ pmulld m1, m3
+ paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ pand m0, m8
+ pand m1, m8
+ por m0, m2 ; a3 | (b3 << 12)
+ por m1, m3
+ mova m2, [t1+r10*2+400*0]
+ mova m3, [t1+r10*2+400*2]
+ mova m4, [t1+r10*2+400*4]
+ mova [t3+r10*4+400*8+ 8], m2
+ mova [t3+r10*4+400*0+ 8], m3
+ mova [t3+r10*4+400*0+40], m4
+ paddw m2, m2 ; cc5
+ paddd m3, m3
+ paddd m4, m4
+ mova [t1+r10*2+400*0], m2
+ mova [t1+r10*2+400*2], m3
+ mova [t1+r10*2+400*4], m4
+ mova [t3+r10*4+400*4+ 8], xm0
+ vextracti128 [t3+r10*4+400*4+40], m0, 1
+ mova [t3+r10*4+400*4+24], xm1
+ vextracti128 [t3+r10*4+400*4+56], m1, 1
+ add r10, 16
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+.v1_loop:
+ mova m4, [t1+r10*2+400* 6]
+ mova m5, [t1+r10*2+400* 8]
+ mova m6, [t1+r10*2+400*10]
+ paddw m1, m4, [t2+r10*2+400* 6]
+ paddd m2, m5, [t2+r10*2+400* 8]
+ paddd m3, m6, [t2+r10*2+400*10]
+ mova [t2+r10*2+400* 6], m4
+ mova [t2+r10*2+400* 8], m5
+ mova [t2+r10*2+400*10], m6
+ punpcklwd m0, m1, m7 ; b3
+ punpckhwd m1, m7
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ pmaddwd m2, m0, m0 ; b3 * b3
+ paddd m5, m3
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ vpbroadcastd m2, [base+pd_0xf00801c7]
+ psubd m5, m3
+ pmulld m4, m14 ; p3 * s1
+ pmulld m5, m14
+ pmaddwd m0, m2 ; b3 * 455
+ pmaddwd m1, m2
+ paddusw m4, m2
+ paddusw m5, m2
+ psrad m3, m4, 20 ; min(z3, 255) - 256
+ vpgatherdd m2, [r12+m3*4], m4 ; x3
+ psrad m4, m5, 20
+ vpgatherdd m3, [r12+m4*4], m5
+ vpbroadcastd m4, [base+pd_34816]
+ pmulld m0, m2
+ vpbroadcastd m8, [base+pd_m4096]
+ pmulld m1, m3
+ paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m4
+ pand m0, m8
+ pand m8, m1
+ por m0, m2 ; a3 | (b3 << 12)
+ por m8, m3
+ mova m4, [t3+r10*4+400*8+ 8]
+ mova m5, [t3+r10*4+400*0+ 8]
+ mova m6, [t3+r10*4+400*0+40]
+ paddw m1, m4, [t2+r10*2+400*0]
+ paddd m2, m5, [t2+r10*2+400*2]
+ paddd m3, m6, [t2+r10*2+400*4]
+ paddw m1, [t1+r10*2+400*0]
+ paddd m2, [t1+r10*2+400*2]
+ paddd m3, [t1+r10*2+400*4]
+ mova [t2+r10*2+400*0], m4
+ mova [t2+r10*2+400*2], m5
+ mova [t2+r10*2+400*4], m6
+ vpbroadcastd m4, [base+pd_25]
+ mova [t3+r10*4+400*8+ 8], xm0
+ vextracti128 [t3+r10*4+400*8+40], m0, 1
+ mova [t3+r10*4+400*8+24], xm8
+ vextracti128 [t3+r10*4+400*8+56], m8, 1
+ punpcklwd m0, m1, m7 ; b5
+ punpckhwd m1, m7
+ pmulld m2, m4 ; a5 * 25
+ pmulld m3, m4
+ pmaddwd m4, m0, m0 ; b5 * b5
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ vpbroadcastd m4, [base+pd_0xf00800a4]
+ psubd m3, m5
+ pmulld m2, m13 ; p5 * s0
+ pmulld m3, m13
+ pmaddwd m0, m4 ; b5 * 164
+ pmaddwd m1, m4
+ paddusw m2, m4
+ paddusw m3, m4
+ psrad m5, m2, 20 ; min(z5, 255) - 256
+ vpgatherdd m4, [r12+m5*4], m2 ; x5
+ psrad m2, m3, 20
+ vpgatherdd m5, [r12+m2*4], m3
+ pmulld m0, m4
+ vpbroadcastd m6, [base+pd_34816]
+ pmulld m1, m5
+ paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m6
+ vpbroadcastd m6, [base+pd_m4096]
+ pand m0, m6
+ pand m1, m6
+ por m0, m4 ; a5 | (b5 << 12)
+ por m1, m5
+ mova [t3+r10*4+400*0+ 8], xm0
+ vextracti128 [t3+r10*4+400*0+40], m0, 1
+ mova [t3+r10*4+400*0+24], xm1
+ vextracti128 [t3+r10*4+400*0+56], m1, 1
+ add r10, 16
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+400*0+4]
+ paddd m1, m0, [t3+r10*4+400*0+0]
+ mova m4, [t3+r10*4+400*4+0]
+ paddd m1, [t3+r10*4+400*0+8]
+ mova m5, [t3+r10*4+400*8+0]
+ paddd m4, [t3+r10*4+400*4+8]
+ paddd m5, [t3+r10*4+400*8+8]
+ paddd m2, m4, [t3+r10*4+400*4+4]
+ paddd m3, m5, [t3+r10*4+400*8+4]
+ paddd m0, m1
+ pslld m1, 2
+ pslld m2, 2
+ paddd m1, m0 ; ab5 565
+ paddd m3, m3 ; ab3[ 0] 222
+ psubd m2, m4 ; ab3[-1] 343
+ mova [t3+r10*4+400*20], m3
+ pandn m0, m6, m1 ; a5 565
+ mova [t3+r10*4+400*24], m2
+ psrld m1, 12 ; b5 565
+ mova [t3+r10*4+400*12], m0
+ paddd m3, m3
+ mova [t3+r10*4+400*16], m1
+ psubd m3, m5 ; ab3[ 0] 343
+ mova [t3+r10*4+400*28], m3
+ add r10, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m0, [t3+r10*4+4]
+ paddd m4, m0, [t3+r10*4+0]
+ paddd m4, [t3+r10*4+8]
+ paddd m0, m4
+ pslld m4, 2
+ paddd m4, m0
+ pandn m0, m6, m4
+ psrld m4, 12
+ paddd m2, m0, [t3+r10*4+400*12] ; a5
+ mova [t3+r10*4+400*12], m0
+ paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
+ mova [t3+r10*4+400*16], m4
+ mova m3, [t3+r10*4+400*4+0]
+ paddd m3, [t3+r10*4+400*4+8]
+ paddd m5, m3, [t3+r10*4+400*4+4]
+ paddd m5, m5 ; ab3[ 1] 222
+ mova m4, [t3+r10*4+400*20]
+ paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+400*20], m5
+ paddd m5, m5
+ psubd m5, m3 ; ab3[ 1] 343
+ mova [t3+r10*4+400*24], m5
+ paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m3, m6, m1
+ psrld m1, 12
+ pandn m5, m6, m4
+ psrld m4, 12
+ paddd m3, m5 ; a3
+ paddd m1, m4 ; b3 + (1 << 8)
+ pmovzxbd m4, [dstq+r10]
+ pmaddwd m2, m4 ; a5 * src
+ pmaddwd m3, m4 ; a3 * src
+ psubd m0, m2 ; b5 - a5 * src + (1 << 8)
+ psubd m1, m3 ; b3 - a3 * src + (1 << 8)
+ psrld m0, 9
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ psubd m0, m6
+ psrad m0, 13
+ paddd m0, m4
+ vextracti128 xm1, m0, 1
+ packssdw xm0, xm1
+ packuswb xm0, xm0
+ movq [dstq+r10], xm0
+ add r10, 8
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m3, [t3+r10*4+400*8+0]
+ paddd m3, [t3+r10*4+400*8+8]
+ paddd m5, m3, [t3+r10*4+400*8+4]
+ paddd m5, m5 ; ab3[ 1] 222
+ mova m4, [t3+r10*4+400*20]
+ paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+400*20], m5
+ paddd m5, m5
+ psubd m5, m3 ; ab3[ 1] 343
+ mova [t3+r10*4+400*28], m5
+ paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m3, m6, m1
+ psrld m1, 12
+ pandn m5, m6, m4
+ psrld m4, 12
+ paddd m3, m5 ; -a3
+ paddd m1, m4 ; b3 + (1 << 8)
+ pmovzxbd m4, [dstq+r10]
+ pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src
+ mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7)
+ pmaddwd m3, m4 ; -a3 * src
+ psubd m0, m2 ; a5 * src + b5 + (1 << 7)
+ psubd m1, m3 ; a3 * src + b3 + (1 << 8)
+ psrld m0, 8
+ pslld m1, 7
+ pblendw m0, m1, 0xaa
+ pmaddwd m0, m15
+ psubd m0, m6
+ psrad m0, 13
+ paddd m0, m4
+ vextracti128 xm1, m0, 1
+ packssdw xm0, xm1
+ packuswb xm0, xm0
+ movq [dstq+r10], xm0
+ add r10, 8
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration_avx512.asm b/third_party/dav1d/src/x86/looprestoration_avx512.asm
new file mode 100644
index 0000000000..1e571774ca
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_avx512.asm
@@ -0,0 +1,2122 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12
+wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13
+wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
+wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47
+ db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63
+sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8
+ db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128
+sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
+r_ext_mask: times 68 db -1
+ times 4 db 0
+wiener_x_shuf: db 0, 2, -1, 0
+wiener_x_add: db 0, 1,127, 0
+
+pw_61448: times 2 dw 61448
+pw_164_455: dw 164, 455
+pd_m16380: dd -16380
+pd_m4096: dd -4096
+pd_m25: dd -25
+pd_m9: dd -9
+pd_34816: dd 34816
+pd_8421376: dd 8421376
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
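+; t0-t6 cycle through the intermediate row buffers; after each output row
+; the oldest pointer is reused for the next input row (see the end of .hv)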
+
+INIT_ZMM avx512icl
+cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m6, [wiener_shufA]
+ vbroadcasti32x4 m7, [wiener_shufB]
+ mov r10d, 0xfffe
+ vbroadcasti32x4 m8, [wiener_shufC]
+ vbroadcasti32x4 m9, [wiener_shufD]
+ kmovw k1, r10d
+ vpbroadcastd m0, [wiener_x_shuf]
+ vpbroadcastd m1, [wiener_x_add]
+ mov r10, 0xaaaaaaaaaaaaaaaa
+ vpbroadcastd m11, [fltq+ 0]
+ vpbroadcastd m12, [fltq+ 4]
+ kmovq k2, r10
+ vpbroadcastd m10, [pd_m16380]
+ packsswb m11, m11 ; x0 x1 x0 x1
+ vpbroadcastd m14, [fltq+16]
+ pshufb m12, m0
+ vpbroadcastd m15, [fltq+20]
+ paddb m12, m1 ; x2 x3+1 x2 127
+ vpbroadcastd m13, [pd_8421376]
+ psllw m14, 5 ; y0 y1
+ psllw m15, 5 ; y2 y3
+ cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32
+ jle .w32 ; pixels, so we need a special case for small widths
+ lea t1, [rsp+wq*2+16]
+ add lpfq, wq
+ add dstq, wq
+ neg wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception
+ vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory
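+; (k1 = 0xfffe, so dword 0 keeps the broadcast/left pixels while dwords
+; 1-15 are loaded from lpf)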
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m16, [lpfq+r10-4]
+.h_main:
+ movu m17, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -66
+ jl .h_have_right
+ push r0
+ lea r0, [r_ext_mask+65]
+ vpbroadcastb m0, [lpfq-1]
+ vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r0+r10+8], 0xe4
+ pop r0
+.h_have_right:
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m2, m10
+ vpdpbusd m2, m4, m11
+ pshufb m4, m17, m6
+ mova m1, m10
+ vpdpbusd m1, m4, m11
+ pshufb m4, m17, m7
+ mova m3, m10
+ vpdpbusd m3, m4, m11
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m2, m16, m12
+ pshufb m4, m17, m8
+ vpdpbusd m1, m4, m12
+ pshufb m17, m9
+ vpdpbusd m3, m17, m12
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 3
+ psraw m1, 3
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+64], m1
+ add r10, 64
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm16, [lpfq+r10]
+ vmovdqu32 m16{k1}, [lpfq+r10-4]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m16, [lpfq+r10-4]
+.hv_main:
+ movu m17, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -66
+ jl .hv_have_right
+ push r0
+ lea r0, [r_ext_mask+65]
+ vpbroadcastb m0, [lpfq-1]
+ vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b
+ vpternlogd m17, m0, [r0+r10+8], 0xe4
+ pop r0
+.hv_have_right:
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m2, m10
+ vpdpbusd m2, m4, m11
+ pshufb m4, m17, m6
+ mova m1, m10
+ vpdpbusd m1, m4, m11
+ pshufb m4, m17, m7
+ mova m3, m10
+ vpdpbusd m3, m4, m11
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m2, m16, m12
+ pshufb m4, m17, m8
+ vpdpbusd m1, m4, m12
+ pshufb m17, m9
+ vpdpbusd m3, m17, m12
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 3
+ psraw m1, 3
+ mova m16, [t4+r10*2]
+ paddw m16, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ mova m17, [t4+r10*2+64]
+ paddw m17, [t2+r10*2+64]
+ mova m5, [t3+r10*2+64]
+ punpcklwd m4, m16, m3
+ mova m2, m13
+ vpdpwssd m2, m4, m15
+ punpcklwd m18, m17, m5
+ mova m4, m13
+ vpdpwssd m4, m18, m15
+ punpckhwd m16, m3
+ mova m3, m13
+ vpdpwssd m3, m16, m15
+ punpckhwd m17, m5
+ mova m5, m13
+ vpdpwssd m5, m17, m15
+ mova m17, [t5+r10*2]
+ paddw m17, [t1+r10*2]
+ paddw m16, m0, [t6+r10*2]
+ mova m19, [t5+r10*2+64]
+ paddw m19, [t1+r10*2+64]
+ paddw m18, m1, [t6+r10*2+64]
+ mova [t0+r10*2+ 0], m0
+ mova [t0+r10*2+64], m1
+ punpcklwd m0, m16, m17
+ vpdpwssd m2, m0, m14
+ punpcklwd m1, m18, m19
+ vpdpwssd m4, m1, m14
+ punpckhwd m16, m17
+ vpdpwssd m3, m16, m14
+ punpckhwd m18, m19
+ vpdpwssd m5, m18, m14
+ packuswb m2, m4
+ psrlw m2, 8
+ vpackuswb m2{k2}, m3, m5
+ movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
+ add r10, 64 ; function is used for chroma as well, and in some
+ jl .hv_loop ; esoteric edge cases chroma dst pointers may only
+ mov t6, t5 ; have a 32-byte alignment despite having a width
+ mov t5, t4 ; larger than 32, so use an unaligned store here.
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m4, [t4+r10*2+ 0]
+ paddw m4, [t2+r10*2+ 0]
+ mova m1, [t3+r10*2+ 0]
+ mova m5, [t4+r10*2+64]
+ paddw m5, [t2+r10*2+64]
+ mova m3, [t3+r10*2+64]
+ punpcklwd m6, m4, m1
+ mova m0, m13
+ vpdpwssd m0, m6, m15
+ punpcklwd m6, m5, m3
+ mova m2, m13
+ vpdpwssd m2, m6, m15
+ punpckhwd m4, m1
+ mova m1, m13
+ vpdpwssd m1, m4, m15
+ punpckhwd m5, m3
+ mova m3, m13
+ vpdpwssd m3, m5, m15
+ mova m5, [t1+r10*2+ 0]
+ paddw m4, m5, [t6+r10*2+ 0]
+ paddw m5, [t5+r10*2+ 0]
+ mova m7, [t1+r10*2+64]
+ paddw m6, m7, [t6+r10*2+64]
+ paddw m7, [t5+r10*2+64]
+ punpcklwd m8, m4, m5
+ vpdpwssd m0, m8, m14
+ punpcklwd m8, m6, m7
+ vpdpwssd m2, m8, m14
+ punpckhwd m4, m5
+ vpdpwssd m1, m4, m14
+ punpckhwd m6, m7
+ vpdpwssd m3, m6, m14
+ packuswb m0, m2
+ psrlw m0, 8
+ vpackuswb m0{k2}, m1, m3
+ movu [dstq+r10], m0
+ add r10, 64
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+.w32:
+ lea r10, [r_ext_mask+73]
+ mova ym18, [wiener_perm32]
+ lea t1, [rsp+16]
+ sub r10, wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .w32_no_top
+ call .w32_h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 32*2
+ call .w32_h_top
+ lea r9, [lpfq+strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 32*2
+ add r9, strideq
+ mov [rsp], r9 ; below
+ call .w32_h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .w32_v1
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ mov t2, t1
+ dec hd
+ jz .w32_v2
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ dec hd
+ jz .w32_v3
+.w32_main:
+ lea t0, [t1+32*2]
+.w32_main_loop:
+ call .w32_hv
+ dec hd
+ jnz .w32_main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .w32_v3
+ mov lpfq, [rsp]
+ call .w32_hv_bottom
+ add lpfq, strideq
+ call .w32_hv_bottom
+.w32_v1:
+ call .w32_v
+ RET
+.w32_no_top:
+ lea r9, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r9, [r9+strideq*2]
+ mov [rsp], r9
+ call .w32_h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .w32_v1
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ mov t2, t1
+ dec hd
+ jz .w32_v2
+ add lpfq, strideq
+ add t1, 32*2
+ call .w32_h
+ dec hd
+ jz .w32_v3
+ lea t0, [t1+32*2]
+ call .w32_hv
+ dec hd
+ jz .w32_v3
+ add t0, 32*8
+ call .w32_hv
+ dec hd
+ jnz .w32_main
+.w32_v3:
+ call .w32_v
+.w32_v2:
+ call .w32_v
+ jmp .w32_v1
+.w32_h:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_h_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ add leftq, 4
+ jmp .w32_h_main
+.w32_h_extend_left:
+ vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception
+ vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory
+ jmp .w32_h_main
+.w32_h_top:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_h_extend_left
+ movu ym16, [lpfq-4]
+.w32_h_main:
+ vinserti32x8 m16, [lpfq+4], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .w32_h_have_right
+ vpbroadcastb m0, [lpfq+wq-1]
+ movu ym17, [r10-8]
+ vinserti32x8 m17, [r10+0], 1
+ vpternlogd m16, m0, m17, 0xe4 ; c ? a : b
+.w32_h_have_right:
+ pshufb m2, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m2, m11
+ pshufb m2, m16, m7
+ mova m1, m10
+ vpdpbusd m1, m2, m11
+ pshufb m2, m16, m8
+ vpdpbusd m0, m2, m12
+ pshufb m16, m9
+ vpdpbusd m1, m16, m12
+ packssdw m0, m1
+ psraw m0, 3
+ mova [t1], m0
+ ret
+.w32_hv:
+ add lpfq, strideq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_hv_extend_left
+ movd xm16, [leftq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ add leftq, 4
+ jmp .w32_hv_main
+.w32_hv_extend_left:
+ vpbroadcastb xm16, [lpfq]
+ vmovdqu32 ym16{k1}, [lpfq-4]
+ jmp .w32_hv_main
+.w32_hv_bottom:
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .w32_hv_extend_left
+ movu ym16, [lpfq-4]
+.w32_hv_main:
+ vinserti32x8 m16, [lpfq+4], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .w32_hv_have_right
+ vpbroadcastb m0, [lpfq+wq-1]
+ movu ym17, [r10-8]
+ vinserti32x8 m17, [r10+0], 1
+ vpternlogd m16, m0, m17, 0xe4
+.w32_hv_have_right:
+ mova m3, [t4]
+ paddw m3, [t2]
+ mova m2, [t3]
+ pshufb m4, m16, m6
+ mova m0, m10
+ vpdpbusd m0, m4, m11
+ pshufb m4, m16, m7
+ mova m5, m10
+ vpdpbusd m5, m4, m11
+ punpcklwd m4, m3, m2
+ mova m1, m13
+ vpdpwssd m1, m4, m15
+ punpckhwd m3, m2
+ mova m2, m13
+ vpdpwssd m2, m3, m15
+ pshufb m4, m16, m8
+ vpdpbusd m0, m4, m12
+ pshufb m16, m9
+ vpdpbusd m5, m16, m12
+ packssdw m0, m5
+ psraw m0, 3
+ mova m4, [t5]
+ paddw m4, [t1]
+ paddw m3, m0, [t6]
+ mova [t0], m0
+ punpcklwd m0, m3, m4
+ vpdpwssd m1, m0, m14
+ punpckhwd m3, m4
+ vpdpwssd m2, m3, m14
+ packuswb m1, m2
+ vpermb m16, m18, m1
+ mova [dstq], ym16
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, strideq
+ ret
+.w32_v:
+ mova m2, [t4]
+ paddw m2, [t2]
+ mova m1, [t3]
+ mova m4, [t1]
+ paddw m3, m4, [t6]
+ paddw m4, [t5]
+ punpcklwd m5, m2, m1
+ mova m0, m13
+ vpdpwssd m0, m5, m15
+ punpckhwd m2, m1
+ mova m1, m13
+ vpdpwssd m1, m2, m15
+ punpcklwd m2, m3, m4
+ vpdpwssd m0, m2, m14
+ punpckhwd m3, m4
+ vpdpwssd m1, m3, m14
+ packuswb m0, m1
+ vpermb m16, m18, m0
+ mova [dstq], ym16
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, strideq
+ ret
+
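+; Essentially the same algorithm as the AVX2 5x5 version above, but the
+; sgr_x_by_x table lookup is done in-register with vpermi2b/vpermt2b
+; instead of vpgatherdd.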
+cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ mov hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m5, [sgr_shuf+1]
+ add lpfq, wq
+ vbroadcasti32x4 m6, [sgr_shuf+9]
+ add dstq, wq
+ vbroadcasti32x4 m7, [sgr_shuf+3]
+ lea t3, [rsp+wq*4+16+416*12]
+ vbroadcasti32x4 m8, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m9, [pd_m25]
+ vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
+ vpbroadcastw m15, [paramsq+8] ; w0
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m10, [pw_164_455]
+ neg wq
+ vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
+ mov r10d, 0xfe
+ vpbroadcastd m13, [pd_m4096]
+ kmovb k1, r10d
+ vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
+ mov r10, 0x3333333333333333
+ mova m18, [sgr_x_by_x+64*0]
+ kmovq k2, r10
+ mova m19, [sgr_x_by_x+64*1]
+ lea r12, [r_ext_mask+75]
+ mova m20, [sgr_x_by_x+64*2]
+ psllw m15, 4
+ mova m21, [sgr_x_by_x+64*3]
+ lea r10, [lpfq+strideq*4]
+ mova ym22, [sgr_shuf]
+ add r10, strideq
+ mov [rsp], r10 ; below
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call .top_fixup
+ add t1, 416*6
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ test hd, hd
+ jz .odd_height
+ call .h
+ add lpfq, strideq
+ call .hv
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .h_top
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*6]
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ jmp .main
+.no_top_height1:
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m3, m17, m5
+ pmullw m2, m3, m3
+ pshufb m1, m17, m6
+ paddw m0, m3, m1
+ shufps m3, m1, q2121
+ paddw m0, m3
+ punpcklwd m16, m3, m1
+ punpckhwd m3, m1
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m16, m16
+ punpckhwd m2, m4
+ vpdpwssd m2, m3, m3
+ pshufb m16, m17, m7
+ paddw m0, m16
+ pshufb m17, m8
+ paddw m0, m17 ; sum
+ punpcklwd m3, m16, m17
+ vpdpwssd m1, m3, m3 ; sumsq
+ punpckhwd m16, m17
+ vpdpwssd m2, m16, m16
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+r10*2+416*0]
+ paddd m1, [t1+r10*2+416*2]
+ paddd m2, [t1+r10*2+416*4]
+.h_loop_end:
+ mova [t1+r10*2+416*0], m0
+ mova [t1+r10*2+416*2], m1
+ mova [t1+r10*2+416*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+.top_fixup:
+ lea r10, [wq-2]
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+r10*2+416*0]
+ mova m1, [t1+r10*2+416*2]
+ mova m2, [t1+r10*2+416*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+r10*2+416*0], m0
+ mova [t2+r10*2+416*2], m1
+ mova [t2+r10*2+416*4], m2
+ add r10, 32
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu ym17, [lpfq+r10-2]
+.hv_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv_have_right:
+ pshufb m1, m17, m5
+ pmullw m3, m1, m1
+ pshufb m2, m17, m6
+ paddw m0, m1, m2
+ shufps m1, m2, q2121
+ paddw m0, m1
+ punpcklwd m16, m1, m2
+ punpckhwd m1, m2
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16
+ punpckhwd m3, m4
+ vpdpwssd m3, m1, m1
+ pshufb m16, m17, m7
+ paddw m0, m16
+ pshufb m17, m8
+ paddw m0, m17 ; h sum
+ punpcklwd m1, m16, m17
+ vpdpwssd m2, m1, m1 ; h sumsq
+ punpckhwd m16, m17
+ vpdpwssd m3, m16, m16
+ paddw m1, m0, [t1+r10*2+416*0]
+ paddd m16, m2, [t1+r10*2+416*2]
+ paddd m17, m3, [t1+r10*2+416*4]
+ test hd, hd
+ jz .hv_last_row
+.hv_main2:
+ paddd m16, [t2+r10*2+416*2] ; hv sumsq
+ paddd m17, [t2+r10*2+416*4]
+ paddw m1, [t2+r10*2+416*0] ; hv sum
+ mova [t0+r10*2+416*2], m2
+ mova [t0+r10*2+416*4], m3
+ mova [t0+r10*2+416*0], m0
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ punpcklwd m0, m1, m4 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16 ; The neighbor calculations require
+ mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b.
+ vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for only 12+20,
+ mova [t3+r10*4+ 72], m17 ; but that gets us most of the way.
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+r10*2+416*0], m1
+ paddw m1, m0
+ mova [t1+r10*2+416*2], m16
+ paddd m16, m2
+ mova [t1+r10*2+416*4], m17
+ paddd m17, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m2, [t1+r10*2+416*2]
+ paddd m16, m2, [t2+r10*2+416*2]
+ mova m3, [t1+r10*2+416*4]
+ paddd m17, m3, [t2+r10*2+416*4]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2 ; hv sumsq
+ paddd m17, m3
+ pmulld m16, m9 ; -a * 25
+ pmulld m17, m9
+ mova m0, [t1+r10*2+416*0]
+ paddw m1, m0, [t2+r10*2+416*0]
+ paddw m0, m0
+ paddw m1, m0 ; hv sum
+ punpcklwd m0, m1, m4 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+ 4]
+ movu m1, [t3+r10*4+68]
+ paddd m2, m0, [t3+r10*4+ 0]
+ paddd m3, m1, [t3+r10*4+64]
+ paddd m2, [t3+r10*4+ 8]
+ paddd m3, [t3+r10*4+72]
+ paddd m0, m2
+ pslld m2, 2
+ paddd m1, m3
+ pslld m3, 2
+ paddd m2, m0 ; ab 565
+ paddd m3, m1
+ pandn m0, m13, m2 ; a
+ psrld m2, 12 ; b
+ pandn m1, m13, m3
+ psrld m3, 12
+ mova [t3+r10*4+416*4+ 0], m0
+ mova [t3+r10*4+416*8+ 0], m2
+ mova [t3+r10*4+416*4+64], m1
+ mova [t3+r10*4+416*8+64], m3
+ add r10, 32
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m16, [t3+r10*4+ 4]
+ movu m17, [t3+r10*4+68]
+ paddd m0, m16, [t3+r10*4+ 0]
+ paddd m1, m17, [t3+r10*4+64]
+ paddd m0, [t3+r10*4+ 8]
+ paddd m1, [t3+r10*4+72]
+ paddd m16, m0
+ pslld m0, 2
+ paddd m17, m1
+ pslld m1, 2
+ paddd m0, m16
+ paddd m1, m17
+ pandn m16, m13, m0
+ psrld m0, 12
+ pandn m17, m13, m1
+ psrld m1, 12
+ paddd m2, m16, [t3+r10*4+416*4+ 0] ; a
+ paddd m3, m17, [t3+r10*4+416*4+64]
+ mova [t3+r10*4+416*4+ 0], m16
+ mova [t3+r10*4+416*4+64], m17
+ paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8)
+ paddd m17, m1, [t3+r10*4+416*8+64]
+ mova [t3+r10*4+416*8+ 0], m0
+ mova [t3+r10*4+416*8+64], m1
+ pmovzxbd m0, [dstq+r10+ 0]
+ pmovzxbd m1, [dstq+r10+16]
+ pmaddwd m2, m0 ; a * src
+ pmaddwd m3, m1
+ packssdw m0, m1
+ psubd m16, m2 ; b - a * src + (1 << 8)
+ psubd m17, m3
+ psrad m16, 9
+ psrad m17, 9
+ packssdw m16, m17
+ pmulhrsw m16, m15
+ paddw m16, m0
+ packuswb m16, m16
+ vpermd m16, m22, m16
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ pmovzxbd m0, [dstq+r10+ 0]
+ pmovzxbd m1, [dstq+r10+16]
+ pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src
+ pmaddwd m3, m1, [t3+r10*4+416*4+64]
+ mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7)
+ mova m17, [t3+r10*4+416*8+64]
+ packssdw m0, m1
+ psubd m16, m2 ; b - a * src + (1 << 7)
+ psubd m17, m3
+ psrad m16, 8
+ psrad m17, 8
+ packssdw m16, m17
+ pmulhrsw m16, m15
+ paddw m16, m0
+ packuswb m16, m16
+ vpermd m16, m22, m16
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti32x4 m5, [sgr_shuf+3]
+ add lpfq, wq
+ vbroadcasti32x4 m6, [sgr_shuf+5]
+ add dstq, wq
+ vbroadcasti32x4 m7, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m8, [pd_m9]
+ vpsubd m11, m4, [paramsq+4] {1to16} ; -s1
+ vpbroadcastw m15, [paramsq+10] ; w1
+ lea t1, [rsp+wq*2+20]
+ vpbroadcastd m10, [pw_164_455]
+ lea t3, [rsp+wq*4+16+416*12]
+ vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3)
+ neg wq
+ vpbroadcastd m13, [pd_m4096]
+ mov r10d, 0xfe
+ vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15)
+ kmovb k1, r10d
+ mova m18, [sgr_x_by_x+64*0]
+ mov r10, 0x3333333333333333
+ mova m19, [sgr_x_by_x+64*1]
+ kmovq k2, r10
+ mova m20, [sgr_x_by_x+64*2]
+ psllw m15, 4
+ mova m21, [sgr_x_by_x+64*3]
+ lea r14, [r_ext_mask+75]
+ mova ym9, [sgr_shuf]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ add t1, 416*6
+ call .h_top
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add t4, strideq
+ mov [rsp], t4 ; below
+ mov t0, t2
+ call .hv
+.main:
+ mov t5, t3
+ add t3, 416*4
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv
+ call .prep_n
+ dec hd
+ jz .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv
+ call .n
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv_bottom
+ call .n
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ call .n
+ RET
+.height1:
+ call .v
+ call .prep_n
+ mov t2, t1
+ call .v
+ jmp .end
+.extend_bottom:
+ call .v
+ call .n
+ mov t2, t1
+ call .v
+ jmp .end
+.no_top:
+ lea t4, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea t4, [t4+strideq*2]
+ mov [rsp], t4
+ call .h
+ lea t0, [t1+416*6]
+ mov t2, t1
+ call .v
+ jmp .main
+.h: ; horizontal boxsum
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r14+r10-8]
+ vinserti32x8 m16, [r14+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m0, m17, m5
+ pmullw m2, m0, m0
+ pshufb m16, m17, m6
+ paddw m0, m16
+ pshufb m17, m7
+ paddw m0, m17 ; sum
+ punpcklwd m3, m16, m17
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m3, m3 ; sumsq
+ punpckhwd m16, m17
+ punpckhwd m2, m4
+ vpdpwssd m2, m16, m16
+ mova [t1+r10*2+416*0], m0
+ mova [t1+r10*2+416*2], m1
+ mova [t1+r10*2+416*4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv_main
+.hv_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu ym17, [lpfq+r10-2]
+.hv_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r14+r10-8]
+ vinserti32x8 m16, [r14+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv_have_right:
+ pshufb m0, m17, m5
+ pmullw m3, m0, m0
+ pshufb m1, m17, m6
+ paddw m0, m1
+ pshufb m17, m7
+ paddw m0, m17 ; h sum
+ punpcklwd m16, m17, m1
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16 ; h sumsq
+ punpckhwd m17, m1
+ punpckhwd m3, m4
+ vpdpwssd m3, m17, m17
+ paddw m1, m0, [t2+r10*2+416*0]
+ paddw m1, [t1+r10*2+416*0] ; hv sum
+ paddd m16, m2, [t2+r10*2+416*2]
+ paddd m17, m3, [t2+r10*2+416*4]
+ paddd m16, [t1+r10*2+416*2] ; hv sumsq
+ paddd m17, [t1+r10*2+416*4]
+ mova [t0+r10*2+416*0], m0
+ mova [t0+r10*2+416*2], m2
+ mova [t0+r10*2+416*4], m3
+ pmulld m16, m8 ; -a * 9
+ pmulld m17, m8
+ punpcklwd m0, m4, m1 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ ret
+.v: ; vertical boxsum + ab
+ lea r10, [wq-2]
+.v_loop:
+ mova m16, [t1+r10*2+416*2]
+ mova m17, [t1+r10*2+416*4]
+ paddd m16, m16
+ paddd m17, m17
+ paddd m16, [t2+r10*2+416*2] ; hv sumsq
+ paddd m17, [t2+r10*2+416*4]
+ pmulld m16, m8 ; -a * 9
+ pmulld m17, m8
+ mova m1, [t1+r10*2+416*0]
+ paddw m1, m1
+ paddw m1, [t2+r10*2+416*0] ; hv sum
+ punpcklwd m0, m4, m1 ; b
+ vpdpwssd m16, m0, m0 ; -p
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ pmulld m16, m11 ; p * s
+ pmulld m17, m11
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m20
+ paddusw m17, m12
+ psraw m17, 4 ; min(z, 255) - 256
+ vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x
+ pandn m16, m13, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m14
+ vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12)
+ vpternlogd m17, m1, m13, 0xd8
+ mova [t3+r10*4+ 8], m16
+ mova [t3+r10*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+ 56], m17, 2
+ mova [t3+r10*4+ 72], m17
+ vextracti128 [t3+r10*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+104], m16, 3
+ add r10, 32
+ jl .v_loop
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+ mov t4, t3
+ add t3, 416*4
+.prep_n_loop:
+ mova m2, [t5+r10*4+0]
+ mova m3, [t4+r10*4+0]
+ paddd m2, [t5+r10*4+8]
+ paddd m3, [t4+r10*4+8]
+ paddd m0, m2, [t5+r10*4+4]
+ paddd m1, m3, [t4+r10*4+4]
+ pslld m0, 2
+ paddd m1, m1 ; ab[ 0] 222
+ psubd m0, m2 ; ab[-1] 343
+ mova [t3+r10*4+416*4], m1
+ paddd m1, m1
+ mova [t5+r10*4], m0
+ psubd m1, m3 ; ab[ 0] 343
+ mova [t4+r10*4], m1
+ add r10, 16
+ jl .prep_n_loop
+ ret
+; a and b are packed together in a single dword, but we can't do the
+; full neighbor calculations before splitting them since we don't
+; have sufficient precision. The solution is to do the calculations
+; in two equal halves and split a and b before doing the final sum.
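+; Each packed dword holds a in bits 0..11 and b in bits 12..31; summing one
+; half (a 222-weighted term plus a 343-weighted term) still fits, and .n
+; below separates a and b with pandn/psrld before adding the two halves.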
+ALIGN function_align
+.n: ; neighbor + output
+ mov r10, wq
+.n_loop:
+ mova m16, [t3+r10*4+ 0]
+ paddd m16, [t3+r10*4+ 8]
+ paddd m17, m16, [t3+r10*4+ 4]
+ paddd m17, m17 ; ab[+1] 222
+ mova m2, [t3+r10*4+416*4+ 0]
+ paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
+ mova m3, [t3+r10*4+416*4+64]
+ paddd m1, m3, [t5+r10*4+64]
+ mova [t3+r10*4+416*4+ 0], m17
+ paddd m17, m17
+ psubd m17, m16 ; ab[+1] 343
+ mova [t5+r10*4+ 0], m17
+ paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343
+ mova m16, [t3+r10*4+64]
+ paddd m16, [t3+r10*4+72]
+ paddd m17, m16, [t3+r10*4+68]
+ paddd m17, m17
+ mova [t3+r10*4+416*4+64], m17
+ paddd m17, m17
+ psubd m17, m16
+ mova [t5+r10*4+64], m17
+ pandn m16, m13, m0
+ psrld m0, 12
+ paddd m3, m17
+ pandn m17, m13, m2
+ psrld m2, 12
+ paddd m16, m17 ; a
+ pandn m17, m13, m1
+ psrld m1, 12
+ paddd m0, m2 ; b + (1 << 8)
+ pandn m2, m13, m3
+ psrld m3, 12
+ paddd m17, m2
+ pmovzxbd m2, [dstq+r10+ 0]
+ paddd m1, m3
+ pmovzxbd m3, [dstq+r10+16]
+ pmaddwd m16, m2 ; a * src
+ pmaddwd m17, m3
+ packssdw m2, m3
+ psubd m0, m16 ; b - a * src + (1 << 8)
+ psubd m1, m17
+ psrad m0, 9
+ psrad m1, 9
+ packssdw m0, m1
+ pmulhrsw m0, m15
+ paddw m0, m2
+ packuswb m0, m0
+ vpermd m16, m9, m0
+ mova [dstq+r10], ym16
+ add r10, 32
+ jl .n_loop
+ mov r10, t5
+ mov t5, t4
+ mov t4, r10
+ add dstq, strideq
+ ret
+
+cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \
+ w, h, edge, params
+ mov paramsq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ vbroadcasti128 m5, [sgr_shuf+1]
+ add lpfq, wq
+ vbroadcasti128 m6, [sgr_shuf+9]
+ add dstq, wq
+ vbroadcasti128 m7, [sgr_shuf+3]
+ lea t3, [rsp+wq*4+416*24+8]
+ vbroadcasti128 m8, [sgr_shuf+7]
+ pxor m4, m4
+ vpbroadcastd m9, [pd_m9]
+ vpsubd m11, m4, [paramsq+0] {1to16} ; -s0
+ vpbroadcastd m14, [pw_61448]
+ vpsubd m12, m4, [paramsq+4] {1to16} ; -s1
+ vpbroadcastd m26, [paramsq+8] ; w0 w1
+ lea t1, [rsp+wq*2+12]
+ vpbroadcastd m10, [pd_m25]
+ neg wq
+ vpbroadcastd m13, [pw_164_455]
+ mov r10d, 0xfe
+ vpbroadcastd m15, [pd_34816]
+ kmovb k1, r10d
+ mova m20, [sgr_x_by_x+64*0]
+ mov r10, 0x3333333333333333
+ mova m21, [sgr_x_by_x+64*1]
+ kmovq k2, r10
+ mova m22, [sgr_x_by_x+64*2]
+ lea r12, [r_ext_mask+75]
+ mova m23, [sgr_x_by_x+64*3]
+ vpbroadcastd m24, [pd_m4096]
+ vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____
+ psllw m26, 5
+ mova xm27, [sgr_mix_perm]
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t2, t1
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup
+ add t1, 416*12
+ call .h_top
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov [rsp], r10 ; below
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ add lpfq, strideq
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ add lpfq, strideq
+ call .hv0
+ test hd, hd
+ jz .odd_height
+ add lpfq, strideq
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, [rsp]
+ call .hv0_bottom
+ add lpfq, strideq
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov [rsp], r10
+ call .h
+ lea t2, [t1+416*12]
+ lea r10, [wq-2]
+.top_fixup_loop:
+ mova m0, [t1+r10*2+416* 0]
+ mova m1, [t1+r10*2+416* 2]
+ mova m2, [t1+r10*2+416* 4]
+ paddw m0, m0
+ mova m3, [t1+r10*2+416* 6]
+ paddd m1, m1
+ mova m16, [t1+r10*2+416* 8]
+ paddd m2, m2
+ mova m17, [t1+r10*2+416*10]
+ mova [t2+r10*2+416* 0], m0
+ mova [t2+r10*2+416* 2], m1
+ mova [t2+r10*2+416* 4], m2
+ mova [t2+r10*2+416* 6], m3
+ mova [t2+r10*2+416* 8], m16
+ mova [t2+r10*2+416*10], m17
+ add r10, 32
+ jl .top_fixup_loop
+ call .v0
+ jmp .main
+.h: ; horizontal boxsums
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .h_main
+.h_top:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu ym17, [lpfq+r10-2]
+.h_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.h_have_right:
+ pshufb m3, m17, m5
+ pshufb m18, m17, m6
+ shufps m0, m3, m18, q2121
+ pmullw m2, m0, m0
+ pshufb m19, m17, m7
+ paddw m0, m19
+ pshufb m17, m8
+ paddw m0, m17 ; sum3
+ punpcklwd m16, m19, m17
+ punpcklwd m1, m2, m4
+ vpdpwssd m1, m16, m16 ; sumsq3
+ punpckhwd m19, m17
+ punpckhwd m2, m4
+ vpdpwssd m2, m19, m19
+ mova [t1+r10*2+416* 6], m0
+ mova [t1+r10*2+416* 8], m1
+ mova [t1+r10*2+416*10], m2
+ punpcklwd m19, m3, m18
+ paddw m0, m3
+ vpdpwssd m1, m19, m19 ; sumsq5
+ punpckhwd m3, m18
+ paddw m0, m18 ; sum5
+ vpdpwssd m2, m3, m3
+ mova [t1+r10*2+416* 0], m0
+ mova [t1+r10*2+416* 2], m1
+ mova [t1+r10*2+416* 4], m2
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv0_main
+.hv0_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv0_main
+.hv0_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+.hv0_loop:
+ movu ym17, [lpfq+r10-2]
+.hv0_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp r10d, -34
+ jl .hv0_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv0_have_right:
+ pshufb m18, m17, m5
+ pshufb m19, m17, m6
+ shufps m1, m18, m19, q2121
+ pmullw m3, m1, m1
+ pshufb m0, m17, m7
+ paddw m1, m0
+ pshufb m17, m8
+ paddw m1, m17 ; sum3
+ punpcklwd m16, m0, m17
+ punpcklwd m2, m3, m4
+ vpdpwssd m2, m16, m16 ; sumsq3
+ punpckhwd m0, m17
+ punpckhwd m3, m4
+ vpdpwssd m3, m0, m0
+ paddw m0, m1, [t1+r10*2+416* 6]
+ paddd m16, m2, [t1+r10*2+416* 8]
+ paddd m17, m3, [t1+r10*2+416*10]
+ mova [t1+r10*2+416* 6], m1
+ mova [t1+r10*2+416* 8], m2
+ mova [t1+r10*2+416*10], m3
+ paddw m1, m18
+ paddw m1, m19 ; sum5
+ mova [t3+r10*4+416*8+ 8], m1
+ paddw m1, [t1+r10*2+416* 0]
+ mova [t1+r10*2+416* 0], m1
+ punpcklwd m1, m18, m19
+ vpdpwssd m2, m1, m1 ; sumsq5
+ punpckhwd m18, m19
+ vpdpwssd m3, m18, m18
+ mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row
+ mova [t3+r10*4+416*0+72], m3 ; in case height is odd
+ paddd m2, [t1+r10*2+416* 2]
+ paddd m3, [t1+r10*2+416* 4]
+ mova [t1+r10*2+416* 2], m2
+ mova [t1+r10*2+416* 4], m3
+ paddw m1, m0, [t2+r10*2+416* 6]
+ paddd m2, m16, [t2+r10*2+416* 8]
+ paddd m3, m17, [t2+r10*2+416*10]
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416* 8], m16
+ mova [t2+r10*2+416*10], m17
+ pmulld m16, m2, m9 ; -a3 * 9
+ pmulld m17, m3, m9
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m22
+ paddusw m17, m14
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*4+ 8], m16
+ mova [t3+r10*4+416*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
+ mova [t3+r10*4+416*4+ 72], m17
+ vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*4+104], m16, 3
+ add r10, 32
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movd xm17, [leftq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ add leftq, 4
+ jmp .hv1_main
+.hv1_extend_left:
+ vpbroadcastb xm17, [lpfq+wq]
+ vmovdqu32 ym17{k1}, [lpfq+wq-4]
+ jmp .hv1_main
+.hv1_bottom:
+ lea r10, [wq-2]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+.hv1_loop:
+ movu ym17, [lpfq+r10-2]
+.hv1_main:
+ vinserti32x8 m17, [lpfq+r10+6], 1
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp r10d, -34
+ jl .hv1_have_right
+ vpbroadcastb m0, [lpfq-1]
+ movu ym16, [r12+r10-8]
+ vinserti32x8 m16, [r12+r10+0], 1
+ vpternlogd m17, m0, m16, 0xe4
+.hv1_have_right:
+ pshufb m3, m17, m5
+ pshufb m19, m17, m6
+ shufps m2, m3, m19, q2121
+ pmullw m1, m2, m2
+ pshufb m18, m17, m7
+ paddw m2, m18
+ pshufb m17, m8
+ paddw m2, m17 ; sum3
+ punpcklwd m16, m17, m18
+ punpcklwd m0, m1, m4
+ vpdpwssd m0, m16, m16 ; sumsq3
+ punpckhwd m17, m18
+ punpckhwd m1, m4
+ vpdpwssd m1, m17, m17
+ paddd m16, m0, [t2+r10*2+416* 8]
+ paddd m17, m1, [t2+r10*2+416*10]
+ mova [t2+r10*2+416* 8], m0
+ mova [t2+r10*2+416*10], m1
+ punpcklwd m18, m3, m19
+ vpdpwssd m0, m18, m18 ; sumsq5
+ punpckhwd m18, m3, m19
+ vpdpwssd m1, m18, m18
+ paddw m3, m19
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ paddd m18, m0, [t2+r10*2+416*2]
+ paddd m19, m1, [t2+r10*2+416*4]
+ paddd m18, [t1+r10*2+416*2]
+ paddd m19, [t1+r10*2+416*4]
+ mova [t2+r10*2+416*2], m0
+ mova [t2+r10*2+416*4], m1
+ pmulld m18, m10 ; -a5 * 25
+ pmulld m19, m10
+ paddw m1, m2, [t2+r10*2+416* 6]
+ mova [t2+r10*2+416* 6], m2
+ paddw m2, m3 ; sum5
+ paddw m3, m2, [t2+r10*2+416*0]
+ paddw m3, [t1+r10*2+416*0]
+ mova [t2+r10*2+416*0], m2
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ punpcklwd m2, m3, m4 ; b5
+ vpdpwssd m18, m2, m2 ; -p5
+ punpckhwd m3, m4
+ vpdpwssd m19, m3, m3
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmulld m18, m11 ; p5 * s0
+ pmulld m19, m11
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ pmaddwd m2, m13 ; b5 * 164
+ pmaddwd m3, m13
+ vpalignr m17{k2}, m16, m16, 2
+ vpalignr m19{k2}, m18, m18, 2
+ paddusw m17, m14
+ mova m16, m22
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ paddusw m19, m14
+ mova m18, m22
+ psraw m19, 4 ; min(z5, 255) - 256
+ vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k4, m19
+ vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ vmovdqu8 m19{k4}, m18 ; x5
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ pandn m18, m24, m19
+ psrld m19, 16
+ pmulld m2, m18
+ pmulld m3, m19
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*8+ 8], m16
+ mova [t3+r10*4+416*8+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
+ paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m3, m15
+ mova [t3+r10*4+416*8+ 72], m17
+ vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*8+104], m16, 3
+ vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
+ vpternlogd m19, m3, m24, 0xd8
+ mova [t3+r10*4+416*0+ 8], m18
+ mova [t3+r10*4+416*0+ 24], xm19
+ vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
+ mova [t3+r10*4+416*0+ 72], m19
+ vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
+ vextracti32x4 [t3+r10*4+416*0+104], m18, 3
+ add r10, 32
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+ lea r10, [wq-2]
+.v0_loop:
+ mova m2, [t1+r10*2+416* 8]
+ mova m3, [t1+r10*2+416*10]
+ paddd m2, m2
+ paddd m3, m3
+ paddd m16, m2, [t2+r10*2+416* 8]
+ paddd m17, m3, [t2+r10*2+416*10]
+ mova m0, [t1+r10*2+416* 6]
+ paddw m0, m0
+ paddw m1, m0, [t2+r10*2+416* 6]
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416* 8], m2
+ mova [t2+r10*2+416*10], m3
+ mova m2, [t1+r10*2+416*0]
+ mova m3, [t1+r10*2+416*2]
+ mova m18, [t1+r10*2+416*4]
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ mova [t3+r10*4+416*8+ 8], m2
+ mova [t3+r10*4+416*0+ 8], m3
+ mova [t3+r10*4+416*0+72], m18
+ vpalignr m17{k2}, m16, m16, 2
+ mova m16, m22
+ paddusw m17, m14
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ paddw m2, m2 ; cc5
+ paddd m3, m3
+ paddd m18, m18
+ mova [t1+r10*2+416*0], m2
+ mova [t1+r10*2+416*2], m3
+ mova [t1+r10*2+416*4], m18
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*4+ 8], m16
+ mova [t3+r10*4+416*4+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2
+ mova [t3+r10*4+416*4+ 72], m17
+ vextracti128 [t3+r10*4+416*4+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*4+104], m16, 3
+ add r10, 32
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+ lea r10, [wq-2]
+.v1_loop:
+ mova m0, [t1+r10*2+416* 8]
+ paddd m16, m0, [t2+r10*2+416* 8]
+ mova m1, [t1+r10*2+416*10]
+ paddd m17, m1, [t2+r10*2+416*10]
+ mova m2, [t3+r10*4+416*0+ 8]
+ paddd m18, m2, [t2+r10*2+416* 2]
+ mova m3, [t3+r10*4+416*0+72]
+ paddd m19, m3, [t2+r10*2+416* 4]
+ paddd m18, [t1+r10*2+416* 2]
+ paddd m19, [t1+r10*2+416* 4]
+ mova [t2+r10*2+416* 8], m0
+ mova [t2+r10*2+416*10], m1
+ mova [t2+r10*2+416* 2], m2
+ mova [t2+r10*2+416* 4], m3
+ pmulld m16, m9 ; -a3 * 9
+ pmulld m17, m9
+ pmulld m18, m10 ; -a5 * 25
+ pmulld m19, m10
+ mova m0, [t1+r10*2+416* 6]
+ paddw m1, m0, [t2+r10*2+416* 6]
+ mova m2, [t3+r10*4+416*8+ 8]
+ paddw m3, m2, [t2+r10*2+416*0]
+ paddw m3, [t1+r10*2+416*0]
+ mova [t2+r10*2+416* 6], m0
+ mova [t2+r10*2+416*0], m2
+ punpcklwd m0, m4, m1 ; b3
+ vpdpwssd m16, m0, m0 ; -p3
+ punpckhwd m1, m4, m1
+ vpdpwssd m17, m1, m1
+ punpcklwd m2, m3, m4 ; b5
+ vpdpwssd m18, m2, m2 ; -p5
+ punpckhwd m3, m4
+ vpdpwssd m19, m3, m3
+ pmulld m16, m12 ; p3 * s1
+ pmulld m17, m12
+ pmulld m18, m11 ; p5 * s0
+ pmulld m19, m11
+ pmaddwd m0, m13 ; b3 * 455
+ pmaddwd m1, m13
+ pmaddwd m2, m13 ; b5 * 164
+ pmaddwd m3, m13
+ vpalignr m17{k2}, m16, m16, 2
+ vpalignr m19{k2}, m18, m18, 2
+ paddusw m17, m14
+ mova m16, m22
+ psraw m17, 4 ; min(z3, 255) - 256
+ vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k3, m17
+ vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127]
+ paddusw m19, m14
+ mova m18, m22
+ psraw m19, 4 ; min(z5, 255) - 256
+ vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255]
+ vpmovb2m k4, m19
+ vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127]
+ vmovdqu8 m17{k3}, m16 ; x3
+ vmovdqu8 m19{k4}, m18 ; x5
+ pandn m16, m24, m17
+ psrld m17, 16
+ pmulld m0, m16
+ pmulld m1, m17
+ pandn m18, m24, m19
+ psrld m19, 16
+ pmulld m2, m18
+ pmulld m3, m19
+ paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m15
+ vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12)
+ vpternlogd m17, m1, m24, 0xd8
+ mova [t3+r10*4+416*8+ 8], m16
+ mova [t3+r10*4+416*8+ 24], xm17
+ vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2
+ paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m3, m15
+ mova [t3+r10*4+416*8+ 72], m17
+ vextracti128 [t3+r10*4+416*8+ 72], ym16, 1
+ vextracti32x4 [t3+r10*4+416*8+104], m16, 3
+ vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12)
+ vpternlogd m19, m3, m24, 0xd8
+ mova [t3+r10*4+416*0+ 8], m18
+ mova [t3+r10*4+416*0+ 24], xm19
+ vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2
+ mova [t3+r10*4+416*0+ 72], m19
+ vextracti128 [t3+r10*4+416*0+ 72], ym18, 1
+ vextracti32x4 [t3+r10*4+416*0+104], m18, 3
+ add r10, 32
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ mov r10, wq
+.prep_n_loop:
+ movu m0, [t3+r10*4+416*0+4]
+ paddd m1, m0, [t3+r10*4+416*0+0]
+ mova m16, [t3+r10*4+416*4+0]
+ paddd m1, [t3+r10*4+416*0+8]
+ mova m17, [t3+r10*4+416*8+0]
+ paddd m16, [t3+r10*4+416*4+8]
+ paddd m17, [t3+r10*4+416*8+8]
+ paddd m2, m16, [t3+r10*4+416*4+4]
+ paddd m3, m17, [t3+r10*4+416*8+4]
+ paddd m0, m1
+ pslld m1, 2
+ pslld m2, 2
+ paddd m1, m0 ; ab5 565
+ paddd m3, m3 ; ab3[ 0] 222
+ psubd m2, m16 ; ab3[-1] 343
+ mova [t3+r10*4+416*20], m3
+ pandn m0, m24, m1 ; a5 565
+ mova [t3+r10*4+416*24], m2
+ psrld m1, 12 ; b5 565
+ mova [t3+r10*4+416*12], m0
+ paddd m3, m3
+ mova [t3+r10*4+416*16], m1
+ psubd m3, m17 ; ab3[ 0] 343
+ mova [t3+r10*4+416*28], m3
+ add r10, 16
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ mov r10, wq
+.n0_loop:
+ movu m2, [t3+r10*4+4]
+ paddd m3, m2, [t3+r10*4+0]
+ paddd m3, [t3+r10*4+8]
+ mova m1, [t3+r10*4+416*4+0]
+ paddd m2, m3
+ pslld m3, 2
+ paddd m1, [t3+r10*4+416*4+8]
+ paddd m3, m2
+ pandn m2, m24, m3
+ psrld m3, 12
+ paddd m0, m2, [t3+r10*4+416*12] ; a5
+ paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8)
+ mova [t3+r10*4+416*12], m2
+ mova [t3+r10*4+416*16], m3
+ paddd m2, m1, [t3+r10*4+416*4+4]
+ paddd m2, m2 ; ab3[ 1] 222
+ mova m3, [t3+r10*4+416*20]
+ paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343
+ mova [t3+r10*4+416*20], m2
+ paddd m2, m2
+ psubd m2, m1 ; ab3[ 1] 343
+ mova [t3+r10*4+416*24], m2
+ paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m1, m24, m17
+ psrld m17, 12
+ pandn m3, m24, m2
+ psrld m2, 12
+ paddd m1, m3 ; a3
+ pmovzxbd m3, [dstq+r10]
+ paddd m17, m2 ; b3 + (1 << 8)
+ pmaddwd m0, m3 ; a5 * src
+ pmaddwd m1, m3 ; a3 * src
+ vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
+ psubd m16, m0 ; b5 - a5 * src + (1 << 8)
+ psubd m17, m1 ; b3 - a3 * src + (1 << 8)
+ psrld m16, 9
+ pslld m17, 7
+ vmovdqu8 m17{k2}, m16
+ vpdpwssd m3, m17, m26
+ packuswb m3, m2
+ vpermb m16, m27, m3
+ mova [dstq+r10], xm16
+ add r10, 16
+ jl .n0_loop
+ add dstq, strideq
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ mov r10, wq
+.n1_loop:
+ mova m1, [t3+r10*4+416*8+0]
+ paddd m1, [t3+r10*4+416*8+8]
+ paddd m2, m1, [t3+r10*4+416*8+4]
+ paddd m2, m2 ; ab3[ 1] 222
+ mova m0, [t3+r10*4+416*20]
+ paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343
+ pmovzxbd m3, [dstq+r10]
+ mova [t3+r10*4+416*20], m2
+ paddd m2, m2
+ psubd m2, m1 ; ab3[ 1] 343
+ mova [t3+r10*4+416*28], m2
+ paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343
+ pandn m1, m24, m17
+ psrld m17, 12
+ pandn m2, m24, m0
+ psrld m0, 12
+ paddd m1, m2 ; a3
+ paddd m17, m0 ; b3 + (1 << 8)
+ mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7)
+ pmaddwd m1, m3 ; a3 * src
+ pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src
+ vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15)
+ psubd m17, m1 ; b3 - a3 * src + (1 << 8)
+ psubd m16, m0 ; b5 - a5 * src + (1 << 7)
+ pslld m17, 7
+ palignr m17{k2}, m16, m16, 1
+ vpdpwssd m3, m17, m26
+ packuswb m3, m3
+ vpermb m16, m27, m3
+ mova [dstq+r10], xm16
+ add r10, 16
+ jl .n1_loop
+ add dstq, strideq
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm
new file mode 100644
index 0000000000..01eb6fa348
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration_sse.asm
@@ -0,0 +1,3681 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
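+; right-edge padding mask: 0xff bytes keep loaded pixels, zero bytes select
+; the replicated last pixel (see the wiener .extend_right)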
+pb_right_ext_mask: times 24 db 0xff
+ times 8 db 0
+pb_1: times 16 db 1
+pb_3: times 16 db 3
+pw_256: times 8 dw 256
+pw_2056: times 8 dw 2056
+pw_m16380: times 8 dw -16380
+pd_4096: times 4 dd 4096
+pd_34816: times 4 dd 34816
+pd_0xffff: times 4 dd 0xffff
+pd_0xf00800a4: times 4 dd 0xf00800a4
+pd_0xf00801c7: times 4 dd 0xf00801c7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
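+; movif64/movif32 emit the mov only on 64-bit/32-bit targets, respectively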
+%macro movif64 2 ; dst, src
+ %if ARCH_X86_64
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movif32 2 ; dst, src
+ %if ARCH_X86_32
+ mov %1, %2
+ %endif
+%endmacro
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+ %assign pic_reg_stk_off 4
+ %xdefine PIC_reg %1
+ %if %2 == 1
+ mov [esp], %1
+ %endif
+ LEA PIC_reg, PIC_base_offset
+ %if %3 == 1
+ XCHG_PIC_REG
+ %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+ mov [esp+pic_reg_stk_off], PIC_reg
+ %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+ mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym) (sym)
+%endif
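+; PIC_sym(sym) resolves to a PIC_reg-relative address on x86-32 and to the
+; plain symbol on x86-64, so constant loads can be written identically for
+; both targets.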
+
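+; Wiener filter, instantiated for both SSE2 and SSSE3 via INIT_XMM below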
+%macro WIENER 0
+%if ARCH_X86_64
+DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ %define tmpstrideq strideq
+ %define base 0
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m8, [wiener_shufA]
+ pshufd m12, m14, q2222 ; x0 x0
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+%else
+ mova m10, [pw_m16380]
+ punpcklwd m14, m14
+ pshufd m11, m14, q0000 ; x0
+ pshufd m12, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+DECLARE_REG_TMP 4, 0, _, 5
+%if cpuflag(ssse3)
+ %define m10 [base+wiener_shufC]
+ %define m11 [base+wiener_shufD]
+ %define stk_off 96
+%else
+ %define m10 [base+pw_m16380]
+ %define m11 [stk+96]
+ %define stk_off 112
+%endif
+cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+ %define edgeb byte edged
+ %define edged [stk+ 8]
+ %define dstmp [stk+12]
+ %define hd dword [stk+16]
+ %define wq [stk+20]
+ %define strideq [stk+24]
+ %define leftmp [stk+28]
+ %define t2 [stk+32]
+ %define t4 [stk+36]
+ %define t5 [stk+40]
+ %define t6 [stk+44]
+ %define m8 [base+wiener_shufA]
+ %define m9 [base+wiener_shufB]
+ %define m12 [stk+48]
+ %define m13 [stk+64]
+ %define m14 [stk+80]
+ %define m15 [base+pw_2056]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m3, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m3, [base+wiener_init]
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q3333
+ punpcklqdq m3, m3
+%else
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m11, m0
+%endif
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; y0 y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ add t3, tmpstrideq
+ mov [rsp], t3 ; below
+ mov t4, t1
+ add t1, 384*2
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.v1:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+.v2:
+ call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
+ jmp .v1
+.extend_right:
+ movd m2, [lpfq-4]
+%if ARCH_X86_64
+ push r0
+ lea r0, [pb_right_ext_mask+21]
+ movu m0, [r0+xq+0]
+ movu m1, [r0+xq+8]
+ pop r0
+%else
+ movu m0, [r6+xq+0]
+ movu m1, [r6+xq+8]
+%endif
+%if cpuflag(ssse3)
+ pshufb m2, [base+pb_3]
+%else
+ punpcklbw m2, m2
+ pshuflw m2, m2, q3333
+ punpcklqdq m2, m2
+%endif
+ pand m4, m0
+ pand m5, m1
+ pandn m0, m2
+ pandn m1, m2
+ por m4, m0
+ por m5, m1
+ ret
+.h:
+ %define stk esp+4 ; offset due to call
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+%macro %%h7 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m8
+ pmaddubsw m0, m12
+ pshufb m1, m5, m8
+ pmaddubsw m1, m12
+ pshufb m2, m4, m9
+ pmaddubsw m2, m13
+ pshufb m3, m5, m9
+ pmaddubsw m3, m13
+ paddw m0, m2
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ paddw m1, m3
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m0, m2
+ mova m2, [base+pw_m16380]
+ paddw m1, m3
+ paddw m4, m2
+ paddw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 1
+ pslldq m1, m4, 1
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ psrldq m1, m4, 2
+ pslldq m2, m4, 2
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m12
+ paddw m0, m1
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m10
+ paddsw m0, m2
+ psrldq m1, m5, 1
+ pslldq m2, m5, 1
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m11
+ psrldq m2, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m2, m3
+ punpckhbw m4, m3
+ paddw m2, m4
+ pmullw m2, m12
+ paddw m1, m2
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m10
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h7
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ %%h7
+%if ARCH_X86_64
+ mova m2, [t4+xq*2]
+ paddw m2, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m2, [r2+xq*2]
+ mov r2, t2
+ paddw m2, [r2+xq*2]
+ mov r2, t5
+%endif
+ mova m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m5, [t5+xq*2]
+%else
+ mova m5, [r2+xq*2]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ paddw m4, m0, [t6+xq*2]
+%else
+ paddw m4, m0, [r2+xq*2]
+ mov r2, t4
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m0, m3
+ mova m3, [t3+xq*2+16]
+ paddd m4, m2
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+ mova m5, [t5+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t5
+ mova m5, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ paddw m4, m1, [t6+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, r1
+%endif
+ ret
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+ mov xq, wq
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+xq*2]
+ paddw m1, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m1, [r2+xq*2]
+ mov r2, t2
+ paddw m1, [r2+xq*2]
+ mov r2, t6
+%endif
+ mova m2, [t3+xq*2]
+ mova m4, [t1+xq*2]
+%if ARCH_X86_64
+ paddw m3, m4, [t6+xq*2]
+ paddw m4, [t5+xq*2]
+%else
+ paddw m3, m4, [r2+xq*2]
+ mov r2, t5
+ paddw m4, [r2+xq*2]
+ mov r2, t4
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m6
+ punpckhwd m3, m4
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ mova m3, [t3+xq*2+16]
+ mova m5, [t1+xq*2+16]
+%if ARCH_X86_64
+ paddw m4, m5, [t6+xq*2+16]
+ paddw m5, [t5+xq*2+16]
+%else
+ paddw m4, m5, [r2+xq*2+16]
+ mov r2, t5
+ paddw m5, [r2+xq*2+16]
+ movifnidn dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ add dstq, strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+%endif
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ ret
+%endif
+
+%if ARCH_X86_64
+cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
+ w, h, edge, flt, x
+ mov fltq, r6mp
+ mov wd, wm
+ movifnidn hd, hm
+ mov edged, r7m
+ movq m14, [fltq]
+ add lpfq, wq
+ movq m7, [fltq+16]
+ add dstq, wq
+ mova m8, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+ mova m12, [wiener_l_shuf]
+%else
+ punpcklwd m14, m14
+ pshufd m11, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
+%endif
+%else
+%if cpuflag(ssse3)
+ %define stk_off 80
+%else
+ %define m11 [stk+80]
+ %define stk_off 96
+%endif
+cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+ %define m12 [base+wiener_l_shuf]
+ %define m14 [stk+48]
+ mov r1, r6m ; flt
+ mov r0, r0m ; dst
+ mov r4, r4m ; w
+ mov lpfq, lpfm
+ mov r2, r7m ; edge
+ mov r5, r5m ; h
+ movq m2, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r4
+ mov r1, r1m ; stride
+ add lpfq, r4
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r4*2+stk_off]
+ mov hd, r5
+ neg r4
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r4
+ mov strideq, r1
+ mov leftmp, r2
+ mov r4, r1
+%if cpuflag(ssse3)
+ pshufb m2, [base+wiener_init]
+ pshufd m1, m2, q3333
+ punpcklqdq m2, m2
+%else
+ punpcklwd m2, m2
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m11, m0
+%endif
+ mova m13, m1
+ mova m14, m2
+%endif
+ psllw m7, 5
+ pshufd m6, m7, q0000 ; __ y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea xq, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ mov t3, t1
+ add t1, 384*2
+ add xq, tmpstrideq
+ mov [rsp], xq ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp]
+ call .hv_bottom
+ add lpfq, strideq
+ call .hv_bottom
+.end:
+ RET
+.no_top:
+ lea t3, [lpfq+tmpstrideq*4]
+ mov lpfq, dstmp
+ lea t3, [t3+tmpstrideq*2]
+ mov [rsp], t3
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+.v1:
+ call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v
+ jmp .end
+.h:
+ %define stk esp+4
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.h_have_right:
+%macro %%h5 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m9
+ pmaddubsw m0, m13
+ pshufb m1, m5, m9
+ pmaddubsw m1, m13
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m8
+ paddw m5, m8
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 2
+ pslldq m1, m4, 2
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m8
+ paddsw m0, m2
+ psrldq m1, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m11
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m8
+ paddsw m1, m4
+%endif
+%endmacro
+ %%h5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right
+.hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+ paddw m2, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+%if ARCH_X86_64
+ mova m3, [t2+xq*2]
+ paddw m4, m0, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m3, [r2+xq*2]
+ mov r2, t4
+ paddw m4, m0, [r2+xq*2]
+%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+xq*2+16]
+ paddw m2, [t1+xq*2+16]
+ packuswb m0, m4
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, m1, [t4+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ movifnidn dstmp, dstq
+ ret
+%if cpuflag(ssse3)
+.v:
+ mov xq, wq
+.v_loop:
+ mova m3, [t1+xq*2]
+ paddw m1, m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m2, [t2+xq*2]
+ paddw m3, [t4+xq*2]
+%else
+ mov r2, t2
+ mova m2, [r2+xq*2]
+ mov r2, t4
+ paddw m3, [r2+xq*2]
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3
+ pmaddwd m2, m6
+ punpckhwd m3, m3
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+ mova m4, [t1+xq*2+16]
+ paddw m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, [t4+xq*2+16]
+%else
+ paddw m4, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ packuswb m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ packuswb m1, m2
+ psrlw m0, 8
+ psrlw m1, 8
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ ret
+%endif
+%endmacro
+
+INIT_XMM sse2
+WIENER
+
+INIT_XMM ssse3
+WIENER
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; self-guided ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
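+; GATHERDD/GATHER_X_BY_X emulate a per-lane lookup into the sgr_x_by_x table
+; with scalar pextrw/pinsrw (SSE has no gather); the looked-up byte ends up
+; in the top byte of each dword and is extracted with psrld 24.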
+%macro GATHERDD 3 ; dst, src, tmp
+ movd %3d, %2
+ %if ARCH_X86_64
+ movd %1, [r13+%3]
+ pextrw %3d, %2, 2
+ pinsrw %1, [r13+%3+2], 3
+ pextrw %3d, %2, 4
+ pinsrw %1, [r13+%3+2], 5
+ pextrw %3d, %2, 6
+ pinsrw %1, [r13+%3+2], 7
+ %else
+ movd %1, [base+sgr_x_by_x-0xf03+%3]
+ pextrw %3, %2, 2
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
+ pextrw %3, %2, 4
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
+ pextrw %3, %2, 6
+ pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
+ %endif
+%endmacro
+
+%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
+ %if ARCH_X86_64
+ %define tmp r14
+ %else
+ %define tmp %4
+ %endif
+ GATHERDD %1, %2, tmp
+ GATHERDD %2, %3, tmp
+ movif32 %4, %5
+ psrld %1, 24
+ psrld %2, 24
+ packssdw %1, %2
+%endmacro
+
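+; MULLD emulates a 32-bit multiply (pmulld requires SSE4.1): dst *= src,
+; where src holds a 16-bit multiplier replicated in both word halves of each
+; dword; pmullw provides the low words and pmulhuw/pslld the carry into the
+; high words.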
+%macro MULLD 3 ; dst, src, tmp
+ pmulhuw %3, %1, %2
+ pmullw %1, %2
+ pslld %3, 16
+ paddd %1, %3
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 0, 1, 2, 3, 5
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 5*16
+ %else
+ %assign extra_stack 3*16
+ %endif
+cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*0+4*6]
+ %define stridemp dword [esp+calloff+16*0+4*7]
+ %define leftm dword [esp+calloff+16*3+4*0]
+ %define lpfm dword [esp+calloff+16*3+4*1]
+ %define w0m dword [esp+calloff+16*3+4*2]
+ %define hd dword [esp+calloff+16*3+4*3]
+ %define edgeb byte [esp+calloff+16*3+4*4]
+ %define edged dword [esp+calloff+16*3+4*4]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t0m dword [esp+calloff+4*2]
+ %define t2m dword [esp+calloff+4*3]
+ %define t3m dword [esp+calloff+4*4]
+ %define t4m dword [esp+calloff+4*5]
+ %define m8 [base+pb_1]
+ %define m9 [esp+calloff+16*2]
+ %define m10 [base+pd_0xf00800a4]
+ %define m11 [base+sgr_lshuf5]
+ %define m12 [base+pd_34816]
+ %define m13 [base+pb_0to15]
+ %define r10 r4
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+DECLARE_REG_TMP 8, 7, 9, 11, 12
+cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ movu m9, [paramsq]
+ add lpfq, wq
+ mova m8, [pb_1]
+ lea t1, [rsp+wq*2+20]
+ mova m10, [pd_0xf00800a4]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+16]
+ mova m12, [pd_34816] ; (1 << 11) + (1 << 15)
+ lea t4, [rsp+wq*2+400*20+16]
+ pshufhw m7, m9, q0000
+ pshufb m9, [pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ neg wq
+ mova m13, [pb_0to15]
+ pxor m6, m6
+ mova m11, [sgr_lshuf5]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movu m1, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*20+16]
+ mov t3m, t3
+ pshufhw m7, m1, q0000
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s0
+ punpckhqdq m7, m7 ; w0
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ movif32 t2m, t1
+ mov t2, t1
+ call .top_fixup
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t0m, t2
+ mov t0, t2
+ dec hd
+ jz .height1
+ or edged, 16
+ call .h
+.main:
+ add lpfq, stridemp
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ call .h
+ add lpfq, stridemp
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+ sub hd, 2
+ movif32 t0, t0m
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .h_top
+ add lpfq, stridemp
+ call .hv_bottom
+.end:
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ movif32 t4, t4m
+ call .hv
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .hv
+ movif32 dstq, dstm
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v
+ movif32 dstq, dstm
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+ lea t2, [t1+400*6]
+ movif32 t2m, t2
+ call .top_fixup
+ dec hd
+ jz .no_top_height1
+ or edged, 16
+ mov t0, t1
+ mov t1, t2
+ movif32 t0m, t0
+ jmp .main
+.no_top_height1:
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v
+ call .prep_n
+ jmp .odd_height_end
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m6
+ pshufb m3, m6
+ psubb m2, m8, m1
+ pcmpgtb m2, m13
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m2, m5, m4, 2
+ paddw m0, m4, m2
+ palignr m3, m5, m4, 6
+ paddw m0, m3
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m1
+ punpckhwd m2, m3
+ pmaddwd m2, m2
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m3
+ paddd m1, m3
+ punpckhwd m3, m4, m5
+ pmaddwd m3, m3
+ shufps m4, m5, q2121
+ paddw m0, m4 ; sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m2, m3
+ test edgeb, 16 ; y > 0
+ jz .h_loop_end
+ paddw m0, [t1+wq*2+400*0]
+ paddd m1, [t1+wq*2+400*2]
+ paddd m2, [t1+wq*2+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq*2+400*0], m0
+ mova [t1+wq*2+400*2], m1
+ mova [t1+wq*2+400*4], m2
+ add wq, 8
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
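+; .hv adds the current row's horizontal box sums to the totals accumulated
+; in t1/t2, computes p = sumsq * 25 - sum * sum, scales it by the sgr
+; strength s0, maps the result through sgr_x_by_x and stores the resulting
+; per-pixel a (t4) and b (t3) coefficients.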
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -10
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq*2+400*0] ; hv sum
+ paddd m4, [t2+wq*2+400*2] ; hv sumsq
+ paddd m5, [t2+wq*2+400*4]
+ mova [t0+wq*2+400*0], m0
+ pslld m0, m4, 4
+ mova [t0+wq*2+400*2], m2
+ mova [t0+wq*2+400*4], m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq*2+400*0], m1
+ paddw m1, m0
+ mova [t1+wq*2+400*2], m4
+ paddd m4, m2
+ mova [t1+wq*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m2, [t1+wq*2+400*2]
+ mova m3, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ paddw m0, m0
+ paddd m2, m2
+ paddd m3, m3
+ paddw m1, m0 ; hv sum
+ paddd m4, m2 ; hv sumsq
+ pslld m0, m4, 4
+ paddd m5, m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v_loop
+ ret
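+; the neighbor passes blend a/b from adjacent positions with 5-6-5 weights
+; (the "565" values below); .prep_n seeds the running 565 sums for the
+; first row pair.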
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
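+; .n0/.n1 apply the coefficients to the source row: b - a * src is the
+; scaled difference between the box-filter estimate and the source, which
+; is weighted by w0 (pmulhrsw with m7 = w0 << 4) and added back to the
+; source pixels.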
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 2]
+ movu m3, [t4+wq*2+ 4]
+ movu m1, [t3+wq*4+ 4]
+ movu m4, [t3+wq*4+ 8]
+ movu m2, [t3+wq*4+20]
+ movu m5, [t3+wq*4+24]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ paddw m3, [t4+wq*2+ 0]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddw m0, m3
+ psllw m3, 2
+ paddd m1, m4
+ pslld m4, 2
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a 565
+ paddd m1, m4 ; b 565
+ paddd m2, m5
+ paddw m3, m0, [t4+wq*2+400*2+ 0]
+ paddd m4, m1, [t3+wq*4+400*4+ 0]
+ paddd m5, m2, [t3+wq*4+400*4+16]
+ mova [t4+wq*2+400*2+ 0], m0
+ mova [t3+wq*4+400*4+ 0], m1
+ mova [t3+wq*4+400*4+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movq m0, [dstq+wq]
+ mova m3, [t4+wq*2+400*2+ 0]
+ mova m4, [t3+wq*4+400*4+ 0]
+ mova m5, [t3+wq*4+400*4+16]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6 ; src
+ punpcklwd m2, m3, m6 ; a
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 7)
+ psubd m5, m3
+ psrad m4, 8
+ psrad m5, 8
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 4*16
+ %else
+ %assign extra_stack 2*16
+ %endif
+cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*2+4*0]
+ %define stridemp dword [esp+calloff+16*2+4*1]
+ %define leftm dword [esp+calloff+16*2+4*2]
+ %define lpfm dword [esp+calloff+16*2+4*3]
+ %define w0m dword [esp+calloff+16*2+4*4]
+ %define hd dword [esp+calloff+16*2+4*5]
+ %define edgeb byte [esp+calloff+16*2+4*6]
+ %define edged dword [esp+calloff+16*2+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %define m8 [base+pb_0to15]
+ %define m9 [esp+calloff+16*1]
+ %define m10 [base+pd_0xf00801c7]
+ %define m11 [base+pd_34816]
+ %define m12 m6
+ %define m13 [base+sgr_lshuf3]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ mov hd, hm
+ mov edged, r7m
+ movq m9, [paramsq+4]
+ add lpfq, wq
+ lea t1, [rsp+wq*2+12]
+ mova m8, [pb_0to15]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*12+8]
+ mova m10, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*32+8]
+ mova m11, [pd_34816]
+ pshuflw m7, m9, q3333
+ pshufb m9, [pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ neg wq
+ pxor m6, m6
+ mova m13, [sgr_lshuf3]
+ psllw m7, 4
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ movq m1, [r1+4]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+20]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*12+16]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*32+16]
+ mov t3m, t3
+ pshuflw m7, m1, q3333
+ mov t4m, t4
+ pshufb m1, [base+pw_256] ; s1
+ punpcklqdq m7, m7 ; w1
+ psllw m7, 4
+ neg wq
+ mova m9, m1
+ pxor m6, m6
+ mov w1m, wd
+ sub wd, 2
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+ add t1, 400*6
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hb, hb
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*6]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+ movd m0, [lpfq-1]
+ movd m1, wd
+ mova m3, m8
+ pshufb m0, m6
+ pshufb m1, m6
+ mova m2, m6
+ psubb m2, m1
+ pcmpgtb m2, m3
+ pand m5, m2
+ pandn m2, m0
+ por m5, m2
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp wd, -9
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv0_have_right
+ cmp wd, -9
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m0, m5, m4, 2
+ paddw m1, m4, m0
+ punpcklwd m2, m4, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m0
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m1, m5 ; sum
+ punpcklwd m4, m5, m6
+ pmaddwd m4, m4
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m4 ; sumsq
+ paddd m3, m5
+ paddw m0, m1, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+ mova [t1+wq*2+400*0], m1
+ mova [t1+wq*2+400*2], m2
+ mova [t1+wq*2+400*4], m3
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 14
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m13
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv1_have_right
+ cmp wd, -9
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m1, m5, m4, 2
+ paddw m0, m4, m1
+ punpcklwd m2, m4, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m4, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 4
+ paddw m0, m5 ; h sum
+ punpcklwd m1, m5, m6
+ pmaddwd m1, m1
+ punpckhwd m5, m6
+ pmaddwd m5, m5
+ paddd m2, m1 ; h sumsq
+ paddd m3, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m4, m2, [t2+wq*2+400*2]
+ paddd m5, m3, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m2
+ mova [t2+wq*2+400*4], m3
+ pslld m2, m4, 3
+ pslld m3, m5, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2 +4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ paddw m1, m0, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m5, [t2+wq*2+400*4]
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m4
+ mova [t2+wq*2+400*4], m5
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m6 ; b
+ pmaddwd m2, m0, m0 ; b * b
+ punpckhwd m1, m6
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m12 ; p * s
+ MULLD m5, m9, m12
+ pmaddwd m0, m10 ; b * 455
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m12
+ MULLD m1, m5, m12
+%if ARCH_X86_32
+ pxor m6, m6
+%endif
+ paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m11
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
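+; for the 3x3 filter the neighbor weighting of a/b alternates between
+; 3-4-3 and 4-4-4 row patterns (the "343"/"444" values below).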
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 4]
+ movu m1, [t3+wq*4+400*0+ 8]
+ movu m2, [t3+wq*4+400*0+24]
+ movu m3, [t4+wq*2+400*0+ 2]
+ movu m4, [t3+wq*4+400*0+ 4]
+ movu m5, [t3+wq*4+400*0+20]
+ paddw m0, [t4+wq*2+400*0+ 0]
+ paddd m1, [t3+wq*4+400*0+ 0]
+ paddd m2, [t3+wq*4+400*0+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[-1] 444
+ pslld m4, 2 ; b[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a[-1] 343
+ psubd m4, m1 ; b[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*4], m3
+ mova [t3+wq*4+400*8+ 0], m4
+ mova [t3+wq*4+400*8+16], m5
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a[ 0] 444
+ pslld m4, 2 ; b[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400* 6], m3
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ psubw m3, m0 ; a[ 0] 343
+ psubd m4, m1 ; b[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m3, [t4+wq*2+400*0+4]
+ movu m1, [t4+wq*2+400*0+2]
+ paddw m3, [t4+wq*2+400*0+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*4]
+ paddw m3, [t4+wq*2+400*6]
+ mova [t4+wq*2+400*4], m2
+ mova [t4+wq*2+400*6], m1
+ movu m4, [t3+wq*4+400*0+8]
+ movu m1, [t3+wq*4+400*0+4]
+ paddd m4, [t3+wq*4+400*0+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400* 8+ 0]
+ paddd m4, [t3+wq*4+400*12+ 0]
+ mova [t3+wq*4+400* 8+ 0], m2
+ mova [t3+wq*4+400*12+ 0], m1
+ movu m5, [t3+wq*4+400*0+24]
+ movu m1, [t3+wq*4+400*0+20]
+ paddd m5, [t3+wq*4+400*0+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400* 8+16]
+ paddd m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400* 8+16], m2
+ mova [t3+wq*4+400*12+16], m1
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*2+4]
+ movu m1, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m1, m3
+ psllw m1, 2 ; a[ 1] 444
+ psubw m2, m1, m3 ; a[ 1] 343
+ paddw m3, m2, [t4+wq*2+400*6]
+ paddw m3, [t4+wq*2+400*8]
+ mova [t4+wq*2+400*6], m1
+ mova [t4+wq*2+400*8], m2
+ movu m4, [t3+wq*4+400*4+8]
+ movu m1, [t3+wq*4+400*4+4]
+ paddd m4, [t3+wq*4+400*4+0]
+ paddd m1, m4
+ pslld m1, 2 ; b[ 1] 444
+ psubd m2, m1, m4 ; b[ 1] 343
+ paddd m4, m2, [t3+wq*4+400*12+ 0]
+ paddd m4, [t3+wq*4+400*16+ 0]
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*16+ 0], m2
+ movu m5, [t3+wq*4+400*4+24]
+ movu m1, [t3+wq*4+400*4+20]
+ paddd m5, [t3+wq*4+400*4+16]
+ paddd m1, m5
+ pslld m1, 2
+ psubd m2, m1, m5
+ paddd m5, m2, [t3+wq*4+400*12+16]
+ paddd m5, [t3+wq*4+400*16+16]
+ mova [t3+wq*4+400*12+16], m1
+ mova [t3+wq*4+400*16+16], m2
+ movq m0, [dstq+wq]
+ punpcklbw m0, m6
+ punpcklwd m1, m0, m6
+ punpcklwd m2, m3, m6
+ pmaddwd m2, m1 ; a * src
+ punpckhwd m1, m0, m6
+ punpckhwd m3, m6
+ pmaddwd m3, m1
+ psubd m4, m2 ; b - a * src + (1 << 8)
+ psubd m5, m3
+ psrad m4, 9
+ psrad m5, 9
+ packssdw m4, m5
+ pmulhrsw m4, m7
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
+
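+; sgr_filter_mix runs the 5x5 and 3x3 box filters in a single pass; the
+; 3x3 (a3/b3) and 5x5 (a5/b5) terms are combined in the output stage with
+; the w0/w1 weights held in m15.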
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < 16
+ %assign extra_stack 10*16
+ %else
+ %assign extra_stack 8*16
+ %endif
+cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
+ dst, stride, left, lpf, w
+ %if STACK_ALIGNMENT < 16
+ %define dstm dword [esp+calloff+16*8+4*0]
+ %define stridemp dword [esp+calloff+16*8+4*1]
+ %define leftm dword [esp+calloff+16*8+4*2]
+ %define lpfm dword [esp+calloff+16*8+4*3]
+ %define w0m dword [esp+calloff+16*8+4*4]
+ %define hd dword [esp+calloff+16*8+4*5]
+ %define edgeb byte [esp+calloff+16*8+4*6]
+ %define edged dword [esp+calloff+16*8+4*6]
+ %define leftmp leftm
+ %else
+ %define w0m wm
+ %define hd dword r5m
+ %define edgeb byte r7m
+ %define edged dword r7m
+ %endif
+ %define hvsrcm dword [esp+calloff+4*0]
+ %define w1m dword [esp+calloff+4*1]
+ %define t3m dword [esp+calloff+4*2]
+ %define t4m dword [esp+calloff+4*3]
+ %xdefine m8 m6
+ %define m9 [base+pd_0xffff]
+ %define m10 [base+pd_34816]
+ %define m11 [base+pd_0xf00801c7]
+ %define m12 [base+pd_0xf00800a4]
+ %define m13 [esp+calloff+16*4]
+ %define m14 [esp+calloff+16*5]
+ %define m15 [esp+calloff+16*6]
+ %define m6 [esp+calloff+16*7]
+ %define base r6-$$
+ %assign calloff 0
+ %if STACK_ALIGNMENT < 16
+ mov strideq, [rstk+stack_offset+ 8]
+ mov leftq, [rstk+stack_offset+12]
+ mov lpfq, [rstk+stack_offset+16]
+ mov wd, [rstk+stack_offset+20]
+ mov dstm, dstq
+ mov stridemp, strideq
+ mov leftm, leftq
+ mov r1, [rstk+stack_offset+24]
+ mov r2, [rstk+stack_offset+32]
+ mov lpfm, lpfq
+ mov hd, r1
+ mov edged, r2
+ %endif
+%else
+cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
+ w, h, edge, params
+%endif
+%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
+ mov wd, wm
+%endif
+%if ARCH_X86_64
+ mov paramsq, r6mp
+ lea r13, [sgr_x_by_x-0xf03]
+ movifnidn hd, hm
+ mov edged, r7m
+ mova m15, [paramsq]
+ add lpfq, wq
+ mova m9, [pd_0xffff]
+ lea t1, [rsp+wq*2+44]
+ mova m10, [pd_34816]
+ add dstq, wq
+ lea t3, [rsp+wq*4+400*24+40]
+ mova m11, [pd_0xf00801c7]
+ lea t4, [rsp+wq*2+400*52+40]
+ mova m12, [base+pd_0xf00800a4]
+ neg wq
+ pshuflw m13, m15, q0000
+ pshuflw m14, m15, q2222
+ pshufhw m15, m15, q1010
+ punpcklqdq m13, m13 ; s0
+ punpcklqdq m14, m14 ; s1
+ punpckhqdq m15, m15 ; w0 w1
+ pxor m6, m6
+ psllw m15, 2
+ DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
+ %define lpfm [rsp]
+%else
+ mov r1, [rstk+stack_offset+28] ; params
+ LEA r6, $$
+ mova m2, [r1]
+ add lpfm, wq
+ lea t1, [rsp+extra_stack+wq*2+52]
+ add dstq, wq
+ lea t3, [rsp+extra_stack+wq*4+400*24+48]
+ mov dstm, dstq
+ lea t4, [rsp+extra_stack+wq*2+400*52+48]
+ mov t3m, t3
+ mov t4m, t4
+ neg wq
+ pshuflw m0, m2, q0000
+ pshuflw m1, m2, q2222
+ pshufhw m2, m2, q1010
+ punpcklqdq m0, m0 ; s0
+ punpcklqdq m1, m1 ; s1
+ punpckhqdq m2, m2 ; w0 w1
+ mov w1m, wd
+ pxor m3, m3
+ psllw m2, 2
+ mova m13, m0
+ mova m14, m1
+ sub wd, 2
+ mova m15, m2
+ mova m6, m3
+ mov lpfq, lpfm
+ mov w0m, wd
+ %define strideq r5
+%endif
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, stridemp
+ mov t2, t1
+%if ARCH_X86_64
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup
+%else
+ mov wq, w0m
+ call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop
+%endif
+ add t1, 400*12
+ call .h_top
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ add r10, strideq
+ mov lpfm, r10 ; below
+ movif32 t4, t4m
+ call .hv0
+.main:
+ dec hd
+ jz .height1
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .prep_n
+ sub hd, 2
+ jl .extend_bottom
+.main_loop:
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv0
+%if ARCH_X86_64
+ test hd, hd
+%else
+ mov r4, hd
+ test r4, r4
+%endif
+ jz .odd_height
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1
+ call .n0
+ call .n1
+ sub hd, 2
+ jge .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .extend_bottom
+ mov lpfq, lpfm
+ call .hv0_bottom
+ movif32 lpfq, hvsrcm
+ add lpfq, stridemp
+ call .hv1_bottom
+.end:
+ call .n0
+ call .n1
+.end2:
+ RET
+.height1:
+ call .v1
+ call .prep_n
+ jmp .odd_height_end
+.odd_height:
+ call .v1
+ call .n0
+ call .n1
+.odd_height_end:
+ call .v0
+ call .v1
+ call .n0
+ jmp .end2
+.extend_bottom:
+ call .v0
+ call .v1
+ jmp .end
+.no_top:
+ movif32 strideq, stridemp
+ lea r10, [lpfq+strideq*4]
+ mov lpfq, dstq
+ lea r10, [r10+strideq*2]
+ mov lpfm, r10
+ call .h
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wq, w0m
+ mov hvsrcm, lpfq
+%endif
+ lea t2, [t1+400*12]
+.top_fixup_loop:
+ mova m0, [t1+wq*2+400* 0]
+ mova m1, [t1+wq*2+400* 2]
+ mova m2, [t1+wq*2+400* 4]
+ paddw m0, m0
+ mova m3, [t1+wq*2+400* 6]
+ paddd m1, m1
+ mova m4, [t1+wq*2+400* 8]
+ paddd m2, m2
+ mova m5, [t1+wq*2+400*10]
+ mova [t2+wq*2+400* 0], m0
+ mova [t2+wq*2+400* 2], m1
+ mova [t2+wq*2+400* 4], m2
+ mova [t2+wq*2+400* 6], m3
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+ add wq, 8
+ jl .top_fixup_loop
+ movif32 t3, t3m
+ movif32 t4, t4m
+ call .v0
+ jmp .main
+.extend_right:
+%assign stack_offset stack_offset+8
+%assign calloff 8
+%if ARCH_X86_64
+ SWAP m8, m6
+%endif
+ movd m1, wd
+ movd m3, [lpfq-1]
+ pshufb m1, m8
+ pshufb m3, m8
+ psubb m2, [base+pb_1], m1
+ pcmpgtb m2, [base+pb_0to15]
+ pand m5, m2
+ pandn m2, m3
+ por m5, m2
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ ret
+%assign stack_offset stack_offset-4
+%assign calloff 4
+.h: ; horizontal boxsum
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ %define leftq r4
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .h_main
+.h_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .h_main
+.h_top:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movif32 wq, w0m
+.h_loop:
+ movu m5, [lpfq+wq-1]
+.h_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .h_have_right
+ cmp wd, -10
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+ pmaddwd m0, m0
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ paddd m2, m7 ; sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m8, m1 ; sum5
+ paddd m7, m2 ; sumsq5
+ paddd m5, m3
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ add wq, 8
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv0_main
+.hv0_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv0_main
+.hv0_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv0_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv0_loop_start
+%endif
+.hv0_loop:
+ movif32 lpfq, hvsrcm
+.hv0_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv0_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv0_have_right
+ cmp wd, -10
+ jl .hv0_have_right
+ call .extend_right
+.hv0_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m3, m5, m4, 2
+ palignr m0, m5, m4, 4
+ movif32 t3, t3m
+ paddw m1, m3, m0
+ punpcklwd m2, m3, m0
+ pmaddwd m2, m2
+ punpckhwd m3, m0
+ pmaddwd m3, m3
+ palignr m0, m5, m4, 6
+ paddw m1, m0 ; h sum3
+ punpcklwd m7, m0, m8
+ pmaddwd m7, m7
+ punpckhwd m0, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m0, m0
+ paddd m2, m7 ; h sumsq3
+ palignr m5, m4, 8
+ punpcklwd m7, m5, m4
+ paddw m8, m4, m5
+ pmaddwd m7, m7
+ punpckhwd m5, m4
+ pmaddwd m5, m5
+ paddd m3, m0
+ paddw m8, m1 ; h sum5
+ paddd m7, m2 ; h sumsq5
+ paddd m5, m3
+ mova [t3+wq*4+400*8+ 8], m8
+ mova [t3+wq*4+400*0+ 8], m7
+ mova [t3+wq*4+400*0+24], m5
+ paddw m8, [t1+wq*2+400* 0]
+ paddd m7, [t1+wq*2+400* 2]
+ paddd m5, [t1+wq*2+400* 4]
+ mova [t1+wq*2+400* 0], m8
+ mova [t1+wq*2+400* 2], m7
+ mova [t1+wq*2+400* 4], m5
+ paddw m0, m1, [t1+wq*2+400* 6]
+ paddd m4, m2, [t1+wq*2+400* 8]
+ paddd m5, m3, [t1+wq*2+400*10]
+ mova [t1+wq*2+400* 6], m1
+ mova [t1+wq*2+400* 8], m2
+ mova [t1+wq*2+400*10], m3
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m4, m2 ; p3
+ psubd m5, m3
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+ 4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .hv0_loop
+ ret
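+; .hv1 (odd rows) computes the new row's sums, folds them into the stored
+; row totals and derives both the ab3 and the ab5 coefficients.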
+ALIGN function_align
+.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv1_main
+.hv1_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, [base+sgr_lshuf5]
+ jmp .hv1_main
+.hv1_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv1_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv1_loop_start
+%endif
+.hv1_loop:
+ movif32 lpfq, hvsrcm
+.hv1_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv1_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+%if ARCH_X86_32
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ jnz .hv1_have_right
+ cmp wd, -10
+ jl .hv1_have_right
+ call .extend_right
+.hv1_have_right:
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ palignr m7, m5, m4, 2
+ palignr m3, m5, m4, 4
+ paddw m2, m7, m3
+ punpcklwd m0, m7, m3
+ pmaddwd m0, m0
+ punpckhwd m7, m3
+ pmaddwd m7, m7
+ palignr m3, m5, m4, 6
+ paddw m2, m3 ; h sum3
+ punpcklwd m1, m3, m8
+ pmaddwd m1, m1
+ punpckhwd m3, m8
+%if ARCH_X86_64
+ SWAP m6, m8
+%endif
+ pmaddwd m3, m3
+ paddd m0, m1 ; h sumsq3
+ palignr m5, m4, 8
+ punpckhwd m1, m4, m5
+ paddw m8, m4, m5
+ pmaddwd m1, m1
+ punpcklwd m4, m5
+ pmaddwd m4, m4
+ paddd m7, m3
+ paddw m5, m2, [t2+wq*2+400* 6]
+ mova [t2+wq*2+400* 6], m2
+ paddw m8, m2 ; h sum5
+ paddd m2, m0, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 8], m0
+ mova [t2+wq*2+400*10], m7
+ paddd m4, m0 ; h sumsq5
+ paddd m1, m7
+ pslld m0, m2, 3
+ pslld m7, m3, 3
+ paddd m2, m0 ; a3 * 9
+ paddd m3, m7
+%if ARCH_X86_32
+ mova [esp+20], m8
+ pxor m8, m8
+%else
+ SWAP m8, m6
+%endif
+ punpcklwd m0, m5, m8 ; b3
+ pmaddwd m7, m0, m0
+ punpckhwd m5, m8
+ pmaddwd m8, m5, m5
+ psubd m2, m7 ; p3
+ psubd m3, m8
+ MULLD m2, m14, m8 ; p3 * s1
+ MULLD m3, m14, m8
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m5, m11
+ paddusw m2, m11
+ paddusw m3, m11
+ psrld m2, 20 ; min(z3, 255)
+ movif32 t3, t3m
+ psrld m3, 20
+ GATHER_X_BY_X m8, m2, m3, r0, dstm
+ punpcklwd m2, m8, m8
+ punpckhwd m3, m8, m8
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ psrld m0, 12
+ psrld m5, 12
+ mova [t4+wq*2+400*4+ 4], m8
+ mova [t3+wq*4+400*8+ 8], m0
+ mova [t3+wq*4+400*8+24], m5
+%if ARCH_X86_32
+ mova m8, [esp+20]
+%else
+ SWAP m6, m8
+ pxor m6, m6
+%endif
+ paddw m5, m8, [t2+wq*2+400*0]
+ paddd m2, m4, [t2+wq*2+400*2]
+ paddd m3, m1, [t2+wq*2+400*4]
+ paddw m5, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m8
+ pslld m0, m2, 4
+ mova [t2+wq*2+400*2], m4
+ pslld m8, m3, 4
+ mova [t2+wq*2+400*4], m1
+ pslld m4, m2, 3
+ paddd m2, m0
+ pslld m7, m3, 3
+ paddd m3, m8
+ paddd m2, m4 ; a5 * 25
+ paddd m3, m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m5, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m5, m7
+ pmaddwd m1, m5, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ psubd m2, m4 ; p5
+ psubd m3, m1
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m5, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m1, m2, m3, r0, dstm
+ punpcklwd m2, m1, m1
+ punpckhwd m3, m1, m1
+ MULLD m0, m2, m7
+ MULLD m5, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m5, m10
+ mova [t4+wq*2+4], m1
+ psrld m0, 12
+ psrld m5, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m5
+ add wq, 8
+ jl .hv1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.v0: ; vertical boxsums + ab3 (even rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v0_loop:
+ mova m0, [t1+wq*2+400* 6]
+ mova m4, [t1+wq*2+400* 8]
+ mova m5, [t1+wq*2+400*10]
+ paddw m0, m0
+ paddd m4, m4
+ paddd m5, m5
+ paddw m1, m0, [t2+wq*2+400* 6]
+ paddd m2, m4, [t2+wq*2+400* 8]
+ paddd m3, m5, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m0
+ mova [t2+wq*2+400* 8], m4
+ mova [t2+wq*2+400*10], m5
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*2+4], m3
+ psrld m0, 12
+ psrld m1, 12
+ mova m3, [t1+wq*2+400*0]
+ mova m4, [t1+wq*2+400*2]
+ mova m5, [t1+wq*2+400*4]
+ mova [t3+wq*4+400*8+ 8], m3
+ mova [t3+wq*4+400*0+ 8], m4
+ mova [t3+wq*4+400*0+24], m5
+ paddw m3, m3 ; sum5 * 2
+ paddd m4, m4
+ paddd m5, m5
+ mova [t1+wq*2+400*0], m3
+ mova [t1+wq*2+400*2], m4
+ mova [t1+wq*2+400*4], m5
+ mova [t3+wq*4+400*4+ 8], m0
+ mova [t3+wq*4+400*4+24], m1
+ add wq, 8
+ jl .v0_loop
+ ret
+.v1: ; vertical boxsums + ab (odd rows)
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v1_loop:
+ mova m4, [t1+wq*2+400* 6]
+ mova m5, [t1+wq*2+400* 8]
+ mova m7, [t1+wq*2+400*10]
+ paddw m1, m4, [t2+wq*2+400* 6]
+ paddd m2, m5, [t2+wq*2+400* 8]
+ paddd m3, m7, [t2+wq*2+400*10]
+ mova [t2+wq*2+400* 6], m4
+ mova [t2+wq*2+400* 8], m5
+ mova [t2+wq*2+400*10], m7
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ pslld m4, m2, 3
+ pslld m5, m3, 3
+ paddd m4, m2 ; a3 * 9
+ paddd m5, m3
+ punpcklwd m0, m1, m7 ; b3
+ pmaddwd m2, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p3
+ psubd m5, m3
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m4, m14, m7 ; p3 * s1
+ MULLD m5, m14, m7
+ pmaddwd m0, m11 ; b3 * 455
+ pmaddwd m1, m11
+ paddusw m4, m11
+ paddusw m5, m11
+ psrld m4, 20 ; min(z3, 255)
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, r0, dstm
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m7
+ MULLD m1, m5, m7
+ paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+400*4+4], m3
+ psrld m0, 12
+ psrld m8, m1, 12
+ mova m4, [t3+wq*4+400*8+ 8]
+ mova m5, [t3+wq*4+400*0+ 8]
+ mova m7, [t3+wq*4+400*0+24]
+ paddw m1, m4, [t2+wq*2+400*0]
+ paddd m2, m5, [t2+wq*2+400*2]
+ paddd m3, m7, [t2+wq*2+400*4]
+ paddw m1, [t1+wq*2+400*0]
+ paddd m2, [t1+wq*2+400*2]
+ paddd m3, [t1+wq*2+400*4]
+ mova [t2+wq*2+400*0], m4
+ mova [t2+wq*2+400*2], m5
+ mova [t2+wq*2+400*4], m7
+ pslld m4, m2, 4
+ mova [t3+wq*4+400*8+ 8], m0
+ pslld m5, m3, 4
+ mova [t3+wq*4+400*8+24], m8
+ pslld m7, m2, 3
+ paddd m2, m4
+ pslld m8, m3, 3
+ paddd m3, m5
+ paddd m2, m7 ; a5 * 25
+ paddd m3, m8
+%if ARCH_X86_32
+ pxor m7, m7
+%else
+ SWAP m7, m6
+%endif
+ punpcklwd m0, m1, m7 ; b5
+ pmaddwd m4, m0, m0
+ punpckhwd m1, m7
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p5
+ psubd m3, m5
+%if ARCH_X86_64
+ SWAP m7, m6
+%endif
+ MULLD m2, m13, m7 ; p5 * s0
+ MULLD m3, m13, m7
+ pmaddwd m0, m12 ; b5 * 164
+ pmaddwd m1, m12
+ paddusw m2, m12
+ paddusw m3, m12
+ psrld m2, 20 ; min(z5, 255)
+ psrld m3, 20
+ GATHER_X_BY_X m4, m2, m3, r0, dstm
+ punpcklwd m2, m4, m4
+ punpckhwd m3, m4, m4
+ MULLD m0, m2, m7
+ MULLD m1, m3, m7
+ paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m10
+ mova [t4+wq*2+4], m4
+ psrld m0, 12
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .v1_loop
+ mov r10, t2
+ mov t2, t1
+ mov t1, r10
+ ret
+.prep_n: ; initial neighbor setup
+ movif64 wq, r4
+ movif32 wd, w1m
+.prep_n_loop:
+ movu m0, [t4+wq*2+400*0+ 2]
+ movu m1, [t3+wq*4+400*0+ 4]
+ movu m2, [t3+wq*4+400*0+20]
+ movu m7, [t4+wq*2+400*0+ 4]
+ movu m8, [t3+wq*4+400*0+ 8]
+ paddw m3, m0, [t4+wq*2+400*0+ 0]
+ paddd m4, m1, [t3+wq*4+400*0+ 0]
+ paddd m5, m2, [t3+wq*4+400*0+16]
+ paddw m3, m7
+ paddd m4, m8
+ movu m7, [t3+wq*4+400*0+24]
+ paddw m0, m3
+ paddd m1, m4
+ psllw m3, 2
+ pslld m4, 2
+ paddd m5, m7
+ paddd m2, m5
+ pslld m5, 2
+ paddw m0, m3 ; a5 565
+ paddd m1, m4 ; b5 565
+ paddd m2, m5
+ mova [t4+wq*2+400* 6+ 0], m0
+ mova [t3+wq*4+400*12+ 0], m1
+ mova [t3+wq*4+400*12+16], m2
+ movu m0, [t4+wq*2+400*2+ 4]
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m2, [t3+wq*4+400*4+24]
+ movu m3, [t4+wq*2+400*2+ 2]
+ movu m4, [t3+wq*4+400*4+ 4]
+ movu m5, [t3+wq*4+400*4+20]
+ paddw m0, [t4+wq*2+400*2+ 0]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m2, [t3+wq*4+400*4+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[-1] 444
+ pslld m4, 2 ; b3[-1] 444
+ pslld m5, 2
+ psubw m3, m0 ; a3[-1] 343
+ psubd m4, m1 ; b3[-1] 343
+ psubd m5, m2
+ mova [t4+wq*2+400* 8+ 0], m3
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m5
+ movu m0, [t4+wq*2+400*4+ 4]
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m2, [t3+wq*4+400*8+24]
+ movu m3, [t4+wq*2+400*4+ 2]
+ movu m4, [t3+wq*4+400*8+ 4]
+ movu m5, [t3+wq*4+400*8+20]
+ paddw m0, [t4+wq*2+400*4+ 0]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m2, [t3+wq*4+400*8+16]
+ paddw m3, m0
+ paddd m4, m1
+ paddd m5, m2
+ psllw m3, 2 ; a3[ 0] 444
+ pslld m4, 2 ; b3[ 0] 444
+ pslld m5, 2
+ mova [t4+wq*2+400*10+ 0], m3
+ mova [t3+wq*4+400*20+ 0], m4
+ mova [t3+wq*4+400*20+16], m5
+ psubw m3, m0 ; a3[ 0] 343
+ psubd m4, m1 ; b3[ 0] 343
+ psubd m5, m2
+ mova [t4+wq*2+400*12+ 0], m3
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m5
+ add wq, 8
+ jl .prep_n_loop
+ ret
+ALIGN function_align
+.n0: ; neighbor + output (even rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n0_loop:
+ movu m0, [t4+wq*2+ 4]
+ movu m2, [t4+wq*2+ 2]
+ paddw m0, [t4+wq*2+ 0]
+ paddw m0, m2
+ paddw m2, m0
+ psllw m0, 2
+ paddw m0, m2 ; a5
+ movu m4, [t3+wq*4+ 8]
+ movu m5, [t3+wq*4+24]
+ movu m1, [t3+wq*4+ 4]
+ movu m3, [t3+wq*4+20]
+ paddd m4, [t3+wq*4+ 0]
+ paddd m5, [t3+wq*4+16]
+ paddd m4, m1
+ paddd m5, m3
+ paddd m1, m4
+ paddd m3, m5
+ pslld m4, 2
+ pslld m5, 2
+ paddd m4, m1 ; b5
+ paddd m5, m3
+ movu m2, [t4+wq*2+400* 6]
+ paddw m2, m0
+ mova [t4+wq*2+400* 6], m0
+ paddd m0, m4, [t3+wq*4+400*12+ 0]
+ paddd m1, m5, [t3+wq*4+400*12+16]
+ mova [t3+wq*4+400*12+ 0], m4
+ mova [t3+wq*4+400*12+16], m5
+ mova [rsp+16+ARCH_X86_32*4], m1
+ movu m3, [t4+wq*2+400*2+4]
+ movu m5, [t4+wq*2+400*2+2]
+ paddw m3, [t4+wq*2+400*2+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ movu m3, [t4+wq*2+400* 8]
+ paddw m3, [t4+wq*2+400*10]
+ paddw m3, m4
+ mova [t4+wq*2+400* 8], m4
+ mova [t4+wq*2+400*10], m5
+ movu m1, [t3+wq*4+400*4+ 8]
+ movu m5, [t3+wq*4+400*4+ 4]
+ movu m7, [t3+wq*4+400*4+24]
+ movu m8, [t3+wq*4+400*4+20]
+ paddd m1, [t3+wq*4+400*4+ 0]
+ paddd m7, [t3+wq*4+400*4+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+%if ARCH_X86_32
+ mova [esp+52], m8
+ psubd m8, m7
+%else
+ psubd m6, m8, m7
+ SWAP m8, m6
+%endif
+ paddd m1, m4, [t3+wq*4+400*16+ 0]
+ paddd m7, m8, [t3+wq*4+400*16+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*16+ 0], m4
+ mova [t3+wq*4+400*16+16], m8
+ mova [t3+wq*4+400*20+ 0], m5
+%if ARCH_X86_32
+ mova m8, [esp+52]
+%else
+ SWAP m8, m6
+ pxor m6, m6
+%endif
+ mova [t3+wq*4+400*20+16], m8
+ mova [rsp+32+ARCH_X86_32*4], m7
+ movq m4, [dstq+wq]
+ punpcklbw m4, m6
+ punpcklwd m5, m4, m6
+ punpcklwd m7, m2, m6
+ pmaddwd m7, m5 ; a5 * src
+ punpcklwd m8, m3, m6
+ pmaddwd m8, m5 ; a3 * src
+ punpckhwd m5, m4, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m5
+ punpckhwd m3, m6
+ pmaddwd m3, m5
+ psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ psrld m0, 9
+ pslld m1, 7
+ pand m0, m9
+ pandn m8, m9, m1
+ por m0, m8
+ mova m1, [rsp+16+ARCH_X86_32*4]
+ psubd m1, m2
+ mova m2, [rsp+32+ARCH_X86_32*4]
+ psubd m2, m3
+ mova m3, [base+pd_4096]
+ psrld m1, 9
+ pslld m2, 7
+ pand m1, m9
+ pandn m5, m9, m2
+ por m1, m5
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ paddd m0, m3
+ paddd m1, m3
+ psrad m0, 13
+ psrad m1, 13
+ packssdw m0, m1
+ paddw m0, m4
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n0_loop
+ add dstq, stridemp
+ ret
+ALIGN function_align
+.n1: ; neighbor + output (odd rows)
+ movif64 wq, r4
+ movif32 wd, w1m
+.n1_loop:
+ movu m3, [t4+wq*2+400*4+4]
+ movu m5, [t4+wq*2+400*4+2]
+ paddw m3, [t4+wq*2+400*4+0]
+ paddw m5, m3
+ psllw m5, 2 ; a3[ 1] 444
+ psubw m4, m5, m3 ; a3[ 1] 343
+ paddw m3, m4, [t4+wq*2+400*12]
+ paddw m3, [t4+wq*2+400*10]
+ mova [t4+wq*2+400*10], m5
+ mova [t4+wq*2+400*12], m4
+ movu m1, [t3+wq*4+400*8+ 8]
+ movu m5, [t3+wq*4+400*8+ 4]
+ movu m7, [t3+wq*4+400*8+24]
+ movu m8, [t3+wq*4+400*8+20]
+ paddd m1, [t3+wq*4+400*8+ 0]
+ paddd m7, [t3+wq*4+400*8+16]
+ paddd m5, m1
+ paddd m8, m7
+ pslld m5, 2 ; b3[ 1] 444
+ pslld m8, 2
+ psubd m4, m5, m1 ; b3[ 1] 343
+ psubd m0, m8, m7
+ paddd m1, m4, [t3+wq*4+400*24+ 0]
+ paddd m7, m0, [t3+wq*4+400*24+16]
+ paddd m1, [t3+wq*4+400*20+ 0]
+ paddd m7, [t3+wq*4+400*20+16]
+ mova [t3+wq*4+400*20+ 0], m5
+ mova [t3+wq*4+400*20+16], m8
+ mova [t3+wq*4+400*24+ 0], m4
+ mova [t3+wq*4+400*24+16], m0
+ movq m5, [dstq+wq]
+ mova m2, [t4+wq*2+400* 6]
+ punpcklbw m5, m6
+ punpcklwd m4, m5, m6
+ punpcklwd m8, m2, m6
+ pmaddwd m8, m4 ; a5 * src
+ punpcklwd m0, m3, m6
+ pmaddwd m0, m4 ; a3 * src
+ punpckhwd m4, m5, m6
+ punpckhwd m2, m6
+ pmaddwd m2, m4
+ punpckhwd m3, m6
+ pmaddwd m3, m4
+ psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13)
+ mova m0, [t3+wq*4+400*12+ 0]
+ psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13)
+ mova m4, [t3+wq*4+400*12+16]
+ psubd m4, m2
+ psubd m7, m3
+ pslld m1, 7
+ psrld m0, 8
+ psrld m4, 8
+ pslld m7, 7
+ pandn m3, m9, m1
+ pand m0, m9
+ por m0, m3
+ pand m4, m9
+ pandn m2, m9, m7
+ por m2, m4
+ mova m1, [base+pd_4096]
+ pmaddwd m0, m15
+ pmaddwd m2, m15
+ paddd m0, m1
+ paddd m2, m1
+ psrad m0, 13
+ psrad m2, 13
+ packssdw m0, m2
+ paddw m0, m5
+ packuswb m0, m0
+ movq [dstq+wq], m0
+ add wq, 8
+ jl .n1_loop
+ add dstq, stridemp
+ movif32 dstm, dstq
+ ret
diff --git a/third_party/dav1d/src/x86/mc.h b/third_party/dav1d/src/x86/mc.h
new file mode 100644
index 0000000000..65c607e180
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/mc.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(name, sse2)); \
+ decl_##type##_fn(BF(name, ssse3)); \
+ decl_##type##_fn(BF(name, avx2)); \
+ decl_##type##_fn(BF(name, avx512icl));
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+#define init_mc_scaled_fn(type, name, suffix) \
+ c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_scaled_fn(type, name, suffix) \
+ c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
+
+decl_fn(mc, dav1d_put_8tap_regular);
+decl_fn(mc, dav1d_put_8tap_regular_smooth);
+decl_fn(mc, dav1d_put_8tap_regular_sharp);
+decl_fn(mc, dav1d_put_8tap_smooth);
+decl_fn(mc, dav1d_put_8tap_smooth_regular);
+decl_fn(mc, dav1d_put_8tap_smooth_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp_regular);
+decl_fn(mc, dav1d_put_8tap_sharp_smooth);
+decl_fn(mc, dav1d_put_bilin);
+
+decl_fn(mct, dav1d_prep_8tap_regular);
+decl_fn(mct, dav1d_prep_8tap_regular_smooth);
+decl_fn(mct, dav1d_prep_8tap_regular_sharp);
+decl_fn(mct, dav1d_prep_8tap_smooth);
+decl_fn(mct, dav1d_prep_8tap_smooth_regular);
+decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp_regular);
+decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
+decl_fn(mct, dav1d_prep_bilin);
+
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
+decl_fn(mc_scaled, dav1d_put_bilin_scaled);
+
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
+decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
+
+decl_fn(avg, dav1d_avg);
+decl_fn(w_avg, dav1d_w_avg);
+decl_fn(mask, dav1d_mask);
+decl_fn(w_mask, dav1d_w_mask_420);
+decl_fn(w_mask, dav1d_w_mask_422);
+decl_fn(w_mask, dav1d_w_mask_444);
+decl_fn(blend, dav1d_blend);
+decl_fn(blend_dir, dav1d_blend_v);
+decl_fn(blend_dir, dav1d_blend_h);
+
+decl_fn(warp8x8, dav1d_warp_affine_8x8);
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
+decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
+
+decl_fn(emu_edge, dav1d_emu_edge);
+
+decl_fn(resize, dav1d_resize);
+
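+/* Install the fastest available implementation of each motion-compensation
+ * function, overriding the defaults as progressively newer instruction-set
+ * extensions (SSE2 through AVX-512) are detected at runtime. */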
+static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2))
+ return;
+
+#if BITDEPTH == 8
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
+
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ c->avg = BF(dav1d_avg, ssse3);
+ c->w_avg = BF(dav1d_w_avg, ssse3);
+ c->mask = BF(dav1d_mask, ssse3);
+ c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
+ c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
+ c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
+ c->blend = BF(dav1d_blend, ssse3);
+ c->blend_v = BF(dav1d_blend_v, ssse3);
+ c->blend_h = BF(dav1d_blend_h, ssse3);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
+ c->emu_edge = BF(dav1d_emu_edge, ssse3);
+ c->resize = BF(dav1d_resize, ssse3);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+ return;
+
+#if BITDEPTH == 8
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ c->avg = BF(dav1d_avg, avx2);
+ c->w_avg = BF(dav1d_w_avg, avx2);
+ c->mask = BF(dav1d_mask, avx2);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
+ c->blend = BF(dav1d_blend, avx2);
+ c->blend_v = BF(dav1d_blend_v, avx2);
+ c->blend_h = BF(dav1d_blend_h, avx2);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
+ c->emu_edge = BF(dav1d_emu_edge, avx2);
+ c->resize = BF(dav1d_resize, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ c->avg = BF(dav1d_avg, avx512icl);
+ c->w_avg = BF(dav1d_w_avg, avx512icl);
+ c->mask = BF(dav1d_mask, avx512icl);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
+ c->blend = BF(dav1d_blend, avx512icl);
+ c->blend_v = BF(dav1d_blend_v, avx512icl);
+ c->blend_h = BF(dav1d_blend_h, avx512icl);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+ c->resize = BF(dav1d_resize, avx512icl);
+#endif
+}
diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm
new file mode 100644
index 0000000000..61eeaa1007
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
@@ -0,0 +1,5879 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+; dav1d_obmc_masks[] * -512
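+; (presumably pre-negated and scaled by 512 so the blend kernels can apply the
+; 0..64 weights with a single pmulhrsw, since 64*512 = 1<<15)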
+const obmc_masks_avx2
+ dw 0, 0, -9728, 0, -12800, -7168, -2560, 0
+ dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0
+ dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120
+ dw -4096, -3072, -2048, -1536, 0, 0, 0, 0
+ dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240
+ dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608
+ dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024
+ dw 0, 0, 0, 0, 0, 0, 0, 0
+
+deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+bdct_lb_q: times 8 db 0
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+prep_mul: dw 16, 16, 4, 4
+put_bilin_h_rnd: dw 8, 8, 10, 10
+put_8tap_h_rnd: dd 34, 40
+s_8tap_h_rnd: dd 2, 8
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_rnd: dd 512, 128
+put_s_8tap_v_sh: dd 10, 8
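+; the 8192 term below is the signed bias applied to 16bpc prep intermediates,
+; folded into the rounding offset and pre-shifted to match each path's final
+; right shift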
+prep_8tap_1d_rnd: dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: dd 32 - (8192 << 5)
+warp8x8t_rnd: dd 16384 - (8192 << 15)
+warp8x8_shift: dd 5, 3
+warp8x8_rnd: dw 4096, 4096, 16384, 16384
+bidir_rnd: dw -16400, -16400, -16388, -16388
+bidir_mul: dw 2048, 2048, 8192, 8192
+
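+; aliases reusing the leading entries of the tables above (16 from prep_mul,
+; 512 from put_s_8tap_v_rnd) instead of duplicating them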
+%define pw_16 prep_mul
+%define pd_512 put_s_8tap_v_rnd
+
+pw_2: times 2 dw 2
+pw_64: times 2 dw 64
+pw_2048: times 2 dw 2048
+pw_8192: times 2 dw 8192
+pw_27615: times 2 dw 27615
+pw_32766: times 2 dw 32766
+pw_m512: times 2 dw -512
+pd_32: dd 32
+pd_63: dd 63
+pd_64: dd 64
+pd_32768: dd 32768
+pd_65538: dd 65538
+pd_m524256: dd -524256 ; (-8192 << 6) + 32
+pd_0x3ff: dd 0x3ff
+pq_0x40000000: dq 0x40000000
+ dd 0
+
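+; the *_JMP_TABLE macros emit, for each supported block width, the offset of
+; that width's entry point relative to a base label; the dispatch code indexes
+; the table with tzcnt(w) and adds the offset back onto the base before jumping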
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+ %%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+ %%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
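+; biased by -8 (one 8-byte filter row), presumably because the subpel position
+; used in the index starts at 1 (position 0 means no filtering)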
+
+cextern mc_warp_filter
+cextern resize_filter
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ mov mxyd, r6m ; mx
+ lea r7, [put_avx2]
+%if UNIX64
+ DECLARE_REG_TMP 8
+ %define org_w r8d
+ mov r8d, wd
+%else
+ DECLARE_REG_TMP 7
+ %define org_w wm
+%endif
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+INIT_YMM avx2
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ movu m0, [srcq+32*4]
+ movu m1, [srcq+32*5]
+ movu m2, [srcq+32*6]
+ movu m3, [srcq+32*7]
+ add srcq, ssq
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ mov r6d, r8m ; bitdepth_max
+ add wq, r7
+ shr r6d, 11
+ vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
+ jmp wq
+.h_w2:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ movq xm1, [srcq+ssq*0+2]
+ movhps xm1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw xm0, xm4
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 4
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ movu xm1, [srcq+ssq*0+2]
+ vinserti128 m1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m5, [srcq+32*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m5, [srcq+32*1+2]
+ add srcq, ssq
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+.h_w128:
+ movifnidn t0d, org_w
+.h_w64_loop0:
+ mov r6d, t0d
+.h_w64_loop:
+ pmullw m0, m4, [srcq+r6*2-32*1]
+ pmullw m1, m5, [srcq+r6*2-32*1+2]
+ paddw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r6*2-32*2]
+ pmullw m2, m5, [srcq+r6*2-32*2+2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2-32*1], m0
+ mova [dstq+r6*2-32*2], m1
+ sub r6d, 32
+ jg .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64_loop0
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ shl mxyd, 11
+ movd xm5, mxyd
+ add wq, r7
+ vpbroadcastw m5, xm5
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq xm2, xm0, xm1
+ movd xm0, [srcq+ssq*0]
+ punpckldq xm1, xm0
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm0, [srcq+ssq*0]
+.v_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq xm2, xm0, xm1
+ movq xm0, [srcq+ssq*0]
+ punpcklqdq xm1, xm0
+ psubw xm1, xm2
+ pmulhrsw xm1, xm5
+ paddw xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xm0, [srcq+ssq*0]
+.v_w8_loop:
+ vbroadcasti128 m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m2, m0, m1, 0xf0
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m1, m0, 0xf0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w32:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+.v_w32_loop:
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m4, m2, m0
+ pmulhrsw m4, m5
+ paddw m4, m0
+ movu m0, [srcq+ssq*0+32*0]
+ mova [dstq+dsq*0+32*0], m4
+ psubw m4, m3, m1
+ pmulhrsw m4, m5
+ paddw m4, m1
+ movu m1, [srcq+ssq*0+32*1]
+ mova [dstq+dsq*0+32*1], m4
+ psubw m4, m0, m2
+ pmulhrsw m4, m5
+ paddw m4, m2
+ mova [dstq+dsq*1+32*0], m4
+ psubw m4, m1, m3
+ pmulhrsw m4, m5
+ paddw m4, m3
+ mova [dstq+dsq*1+32*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w16:
+.v_w64:
+.v_w128:
+ movifnidn t0d, org_w
+ add t0d, t0d
+ mov r4, srcq
+ lea r6d, [hq+t0*8-256]
+ mov r7, dstq
+.v_w16_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w16_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11
+ vpbroadcastd m3, [pw_2]
+ movd xm6, mxyd
+ vpbroadcastd m7, [pw_8192]
+ add wq, r7
+ vpbroadcastw m6, xm6
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ vpbroadcastd m7, [pw_2048]
+.hv_12bpc:
+ jmp wq
+.hv_w2:
+ vpbroadcastq xm1, [srcq+ssq*0]
+ pmullw xm0, xm4, xm1
+ psrlq xm1, 16
+ pmullw xm1, xm5
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+.hv_w2_loop:
+ movq xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm2, [srcq+ssq*0]
+ pmullw xm1, xm4, xm2
+ psrlq xm2, 16
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 _ 2 _
+ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xm0, xm4, [srcq+ssq*0-8]
+ pmullw xm1, xm5, [srcq+ssq*0-6]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ movq xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*0+2]
+ pmullw xm1, xm4
+ pmullw xm2, xm5
+ paddw xm1, xm3
+ paddw xm1, xm2
+ psrlw xm1, 2 ; 1 2
+ shufpd xm2, xm0, xm1, 0x01 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+ssq*0]
+ pmullw xm1, xm5, [srcq+ssq*0+2]
+ paddw xm0, xm3
+ paddw xm0, xm1
+ psrlw xm0, 2
+ vinserti128 m0, xm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ vinserti128 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128:
+%if UNIX64
+ lea r6d, [r8*2-32]
+%else
+ mov r6d, wm
+ lea r6d, [r6*2-32]
+%endif
+ mov r4, srcq
+ lea r6d, [hq+r6*8]
+ mov r7, dstq
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+ssq*1]
+ pmullw m2, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ pmullw m0, m4, [srcq+ssq*0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ RET
+
+cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx2]
+%if UNIX64
+ DECLARE_REG_TMP 7
+ %define org_w r7d
+%else
+ DECLARE_REG_TMP 6
+ %define org_w r5m
+%endif
+ mov org_w, wd
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx2+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmullw m0, m4, [srcq+strideq*0+32*0]
+ pmullw m1, m4, [srcq+strideq*0+32*1]
+ pmullw m2, m4, [srcq+strideq*1+32*0]
+ pmullw m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmullw m0, m4, [srcq+32*0]
+ pmullw m1, m4, [srcq+32*1]
+ pmullw m2, m4, [srcq+32*2]
+ pmullw m3, m4, [srcq+32*3]
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmullw m0, m4, [srcq+32*4]
+ pmullw m1, m4, [srcq+32*5]
+ pmullw m2, m4, [srcq+32*6]
+ pmullw m3, m4, [srcq+32*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psubw m0, m5
+ psubw m1, m5
+ psubw m2, m5
+ psubw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ vpbroadcastd m3, [pw_32766]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq m0, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ pmullw m0, m4
+ vpblendd m1, m2, 0xcc
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*0+2]
+ vinserti128 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+.h_w64:
+.h_w128:
+ movifnidn t0d, org_w
+.h_w32_loop0:
+ mov r3d, t0d
+.h_w32_loop:
+ pmullw m0, m4, [srcq+r3*2-32*1]
+ pmullw m1, m5, [srcq+r3*2-32*1+2]
+ psubw m0, m3
+ paddw m0, m1
+ pmullw m1, m4, [srcq+r3*2-32*2]
+ pmullw m2, m5, [srcq+r3*2-32*2+2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+r3*2-32*1], m0
+ mova [tmpq+r3*2-32*2], m1
+ sub r3d, 32
+ jg .h_w32_loop
+ add srcq, strideq
+ lea tmpq, [tmpq+t0*2]
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ movd xm5, mxyd
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastw m5, xm5
+ vpbroadcastd m3, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.v_12bpc:
+ jmp wq
+.v_w4:
+ movq xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq xm1, [srcq+strideq*1]
+ vpblendd m2, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0xf0 ; 1 1 3 3
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m1, m2, 0x33 ; 0 1 2 3
+ vpblendd m0, m2, 0x0c ; 4 2 4 4
+ punpckhqdq m2, m1, m0 ; 1 2 3 4
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m1, m0, m2, 0xf0 ; 0 1
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vpblendd m2, m0, 0xf0 ; 1 2
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+32*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ PUSH r7
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d
+ mov r3, srcq
+ lea r6d, [hq+r7*8-256]
+ mov r5, tmpq
+.v_w32_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5, m2
+ psubw m0, m3
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m4
+ mova [tmpq+r7*0], m1
+ pmullw m1, m5, m0
+ psubw m2, m3
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+r7*1], m1
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r3, 32
+ add r5, 32
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w4:
+ movu xm1, [srcq+strideq*0]
+%if WIN64
+ movaps [rsp+24], xmm7
+%endif
+ pmullw xm0, xm4, xm1
+ psrldq xm1, 2
+ pmullw xm1, xm5
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vpbroadcastq m0, xm0
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+stride3q ], 1
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ punpcklqdq m7, m1, m2
+ psrldq m1, 2
+ pslldq m2, 6
+ pmullw m7, m4
+ vpblendd m1, m2, 0xcc
+ pmullw m1, m5
+ psubw m7, m3
+ paddw m1, m7
+ psraw m1, 2 ; 1 2 3 4
+ vpblendd m0, m1, 0x3f
+ vpermq m2, m0, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+%if WIN64
+ movaps xmm7, [rsp+24]
+%endif
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm3
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti128 m0, xm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ movu xm2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m1, [srcq+strideq*0], 1
+ vinserti128 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+.hv_w32:
+.hv_w64:
+.hv_w128:
+%if WIN64
+ PUSH r7
+%endif
+ movifnidn r7d, org_w
+ add r7d, r7d
+ mov r3, srcq
+ lea r6d, [hq+r7*8-256]
+ mov r5, tmpq
+.hv_w16_loop0:
+ pmullw m0, m4, [srcq]
+ pmullw m1, m5, [srcq+2]
+ psubw m0, m3
+ paddw m0, m1
+ psraw m0, 2
+.hv_w16_loop:
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m1, m3
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+r7*0], m2
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m3
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r7*1], m2
+ lea tmpq, [tmpq+r7*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r3, 32
+ add r5, 32
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
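+; each constant packs two filter-set offsets (in units of 15 8-byte rows): the
+; high half selects the 8-tap set, the low half the 4-tap set used for blocks
+; with w <= 4 (horizontal) or h <= 4 (vertical); sharp reuses the regular
+; 4-tap set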
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
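+; FN creates one entry point per filter combination; it only loads the packed
+; filter-set selectors into t0d/t1d and then jumps to the shared 8tap body
+; (the regular/regular variant simply falls through)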
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova xm2, [subpel_h_shuf2]
+ vpbroadcastd xm3, [base+subpel_filters+mxq*8+2]
+ pmovsxbw xm3, xm3
+.h_w2_loop:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm2
+ pshufb xm1, xm2
+ pmaddwd xm0, xm3
+ pmaddwd xm1, xm3
+ phaddd xm0, xm1
+ paddd xm0, xm4
+ psrad xm0, 6
+ packusdw xm0, xm0
+ pminsw xm0, xm5
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xm3, [base+subpel_filters+mxq*8]
+ WIN64_SPILL_XMM 8
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ pshufd xm3, xm3, q2211
+ vpbroadcastq m2, xm3
+ vpermq m3, m3, q1111
+.h_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ vextracti128 xm1, m0, 1
+ packusdw xm0, xm1
+ pminsw xm0, xm5
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m5, r8m
+ shr r7d, 11
+ vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 13
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
+%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m4
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m4
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 6
+ psrad m%2, 6
+ packusdw m%1, m%2
+ pminsw m%1, m5
+%endmacro
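+ ; PUT_8TAP_H filters 16 pixels per call (two rows of 8 in .h_w8, one row of
+ ; 16 in .h_w16) from three overlapping source loads, rounds via m4, shifts
+ ; by 6 and clips to bitdepth_max held in m5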
+ movu xm0, [srcq+ssq*0+ 0]
+ vinserti128 m0, [srcq+ssq*1+ 0], 1
+ movu xm2, [srcq+ssq*0+16]
+ vinserti128 m2, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ shufpd m1, m0, m2, 0x05
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6*2-32]
+ movu m1, [srcq+r6*2-24]
+ movu m2, [srcq+r6*2-16]
+ PUT_8TAP_H 0, 1, 2, 3, 12
+ mova [dstq+r6*2-32], m0
+ sub r6d, 16
+ jg .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m6, [pd_32]
+ vpbroadcastw m7, r8m
+ lea r6, [ssq*3]
+ sub srcq, r6
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklwd xm3, xm1 ; 45 56
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xm4, [srcq+ssq*0]
+ pmaddwd xm5, xm8, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm9 ; a1 b1
+ paddd xm5, xm6
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm10 ; a2 b2
+ paddd xm5, xm3
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklwd xm3, xm4 ; 67 78
+ pmaddwd xm4, xm11, xm3 ; a3 b3
+ paddd xm5, xm4
+ psrad xm5, 6
+ packusdw xm5, xm5
+ pminsw xm5, xm7
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m0, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m5, [srcq+ssq*1]
+ vpblendd m1, m0, 0x30
+ vpblendd m0, m2, 0x30
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*0]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 6
+ vextracti128 xm4, m5, 1
+ packusdw xm5, xm4
+ pminsw xm5, xm7
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ shl wd, 5
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ lea srcq, [srcq+ssq*4]
+ vbroadcasti128 m1, [srcq+ssq*0]
+ vbroadcasti128 m2, [srcq+ssq*1]
+ vbroadcasti128 m3, [srcq+ssq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+ssq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packusdw m12, m13
+ pxor m13, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ vpermq m12, m12, q3120
+ mova [dstq+dsq*0], xm12
+ vextracti128 [dstq+dsq*1], m12, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w8_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastw m15, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ vpbroadcastd m6, [pd_512]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_10bit
+ psraw m7, 2
+ psllw m1, 2
+.hv_10bit:
+ pshufd m11, m1, q0000
+ pshufd m12, m1, q1111
+ pshufd m13, m1, q2222
+ pshufd m14, m1, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m9, [subpel_h_shuf2]
+ vbroadcasti128 m1, [srcq+r6 ] ; 3 3
+ movu xm3, [srcq+ssq*2]
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m3, [srcq+ssq*0], 1 ; 2 4
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 1 6
+ add srcq, r6
+ pshufb m1, m9
+ pshufb m3, m9
+ pshufb m0, m9
+ pshufb m2, m9
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ phaddd m1, m3
+ phaddd m0, m2
+ paddd m1, m6
+ paddd m0, m6
+ psrad m1, 10
+ psrad m0, 10
+ packssdw m1, m0 ; 3 2 0 1
+ vextracti128 xm0, m1, 1 ; 3 4 5 6
+ pshufd xm2, xm1, q1301 ; 2 3 1 2
+ pshufd xm3, xm0, q2121 ; 4 5 4 5
+ punpckhwd xm1, xm2 ; 01 12
+ punpcklwd xm2, xm0 ; 23 34
+ punpckhwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm4, xm9
+ pshufb xm5, xm9
+ pmaddwd xm4, xm7
+ pmaddwd xm5, xm7
+ phaddd xm4, xm5
+ pmaddwd xm5, xm11, xm1 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm12 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm13 ; a2 b2
+ paddd xm5, xm3
+ paddd xm4, xm6
+ psrad xm4, 10
+ packssdw xm4, xm4
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm14, xm3 ; a3 b3
+ paddd xm5, xm6
+ paddd xm5, xm4
+ psrad xm5, 10
+ packusdw xm5, xm5
+ pminsw xm5, xm15
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111
+ pshufd m7, m7, q0000
+ movu xm1, [srcq+ssq*0]
+ vinserti128 m1, [srcq+ssq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 4
+ movu xm3, [srcq+ssq*1]
+ vinserti128 m3, [srcq+ssq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m6
+ paddd m1, m4
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m6
+ paddd m2, m5
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m6
+ paddd m4, m0
+ paddd m5, m6
+ paddd m5, m3
+ vperm2i128 m0, m1, m2, 0x21
+ psrld m1, 10
+ psrld m2, 10
+ vperm2i128 m3, m4, m5, 0x21
+ pslld m4, 6
+ pslld m5, 6
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 6
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 10
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16
+.hv_w4_loop:
+ movu xm4, [srcq+ssq*0]
+ vinserti128 m4, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m6
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m6
+ paddd m4, m3
+ psrad m4, 10
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 10
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, xm15
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+myq*8]
+ shl wd, 5
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ pxor m0, m0
+ punpcklbw m0, m2
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [hq+wq-256]
+ test dword r8m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+ psllw xm1, 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000
+ pshufd m12, m0, q1111
+ pshufd m13, m0, q2222
+ pshufd m14, m0, q3333
+%if WIN64
+ %define v_mul (rsp+stack_offset+40) ; r4m
+%else
+ %define v_mul (rsp-24) ; red zone
+%endif
+ mova [v_mul], xm1
+.hv_w8_loop0:
+%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2
+ pmaddwd m%1, m11
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m10
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2
+ paddd m%1, m3
+ pmaddwd m3, m13, m2
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11
+ paddd m%1, m3
+ pmaddwd m3, m12, m%2
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14
+ pmaddwd m%2, m13
+ paddd m2, m10
+ paddd m2, m3
+ paddd m%3, m2
+ paddd m%2, m%3
+ psrad m%1, 10
+ psrad m%2, 10
+ packssdw m%1, m%2
+%endmacro
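+ ; PUT_8TAP_HV_H is the horizontal pass of the 2-D filter: same source layout
+ ; as PUT_8TAP_H, but it rounds with pd_512 (m10), shifts by 10 and packs with
+ ; signed saturation to keep 16-bit intermediates for the vertical pass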
+ movu xm4, [srcq+r6 *1+ 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 *1+ 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 *1+16]
+ vpbroadcastd m10, [pd_512]
+ movu xm5, [srcq+ssq*0+ 0]
+ vinserti128 m5, [srcq+ssq*4+ 0], 1
+ movu xm1, [srcq+ssq*0+16]
+ vinserti128 m1, [srcq+ssq*4+16], 1
+ shufpd m7, m5, m1, 0x05
+ INIT_XMM avx2
+ PUT_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PUT_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+ssq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+ssq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PUT_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+ssq*1+ 0]
+ movu xm1, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m6, [srcq+ssq*1+ 0], 1
+ vinserti128 m1, [srcq+ssq*1+16], 1
+ add srcq, r6
+ shufpd m7, m6, m1, 0x05
+ PUT_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0]
+ vpbroadcastd m7, [v_mul+4*1]
+ vpbroadcastd m10, [v_mul+4*2]
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+ssq*0]
+ vinserti128 m5, [srcq+ssq*1], 1
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+ssq*0+16]
+ vinserti128 m6, [srcq+ssq*1+16], 1
+ vextracti128 [dstq], m0, 1
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m6, m5
+ movu xm5, [srcq+ssq*0+8]
+ vinserti128 m5, [srcq+ssq*1+8], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ vpbroadcastd m10, [pd_512]
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [dstq]
+ paddd m8, m10
+ paddd m9, m10
+ paddd m0, m10
+ paddd m5, m10
+ vpbroadcastd m10, [v_mul+4*3]
+ psrad m0, 10
+ psrad m5, 10
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 10
+ psrad m9, 10
+ packusdw m7, m9
+ pminsw m7, m15
+ vpermq m7, m7, q3120
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r7, 16
+ add r8, 16
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my
+%define base r7-prep_avx2
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx2]
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov r6d, r7m ; bitdepth_max
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ vpbroadcastd m5, [r7-prep_avx2+pw_8192]
+ shr r6d, 11
+ add wq, r7
+ vpbroadcastd m4, [base+prep_mul+r6*4]
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xm0, [base+subpel_filters+mxq*8]
+ vbroadcasti128 m3, [subpel_h_shufA]
+ vbroadcasti128 m4, [subpel_h_shufB]
+ WIN64_SPILL_XMM 8
+ pshufd xm0, xm0, q2211
+ test dword r7m, 0x800
+ jnz .h_w4_12bpc
+ psllw xm0, 2
+.h_w4_12bpc:
+ vpbroadcastq m6, xm0
+ vpermq m7, m0, q1111
+.h_w4_loop:
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti128 m2, [srcq+r6 ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16
+ sub srcq, 6
+ vpbroadcastq m0, [base+subpel_filters+mxq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+ vbroadcasti128 m6, [subpel_h_shufA]
+ vbroadcasti128 m7, [subpel_h_shufB]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m0, 2
+.h_12bpc:
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 8
+ jg .h_w16
+.h_w8:
+%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
+ pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
+ pmaddwd m%5, m9, m%4 ; abcd1
+ pmaddwd m%1, m8 ; abcd0
+ pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
+ shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m%5, m5
+ paddd m%1, m%5
+ pmaddwd m%5, m11, m%2 ; abcd3
+ paddd m%1, m%5
+ pmaddwd m%5, m10, m%4 ; abcd2
+ pshufb m%3, m7 ; a b b c c d d e
+ pmaddwd m%4, m8 ; efgh0
+ paddd m%1, m%5
+ pmaddwd m%5, m9, m%2 ; efgh1
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m11 ; efgh3
+ pmaddwd m%2, m10 ; efgh2
+ paddd m%4, m5
+ paddd m%4, m%5
+ paddd m%3, m%4
+ paddd m%2, m%3
+ psrad m%1, 4
+ psrad m%2, 4
+ packssdw m%1, m%2
+%endmacro
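+ ; PREP_8TAP_H is the same horizontal filter as the put variant, but it shifts
+ ; by 4 and packs with signed saturation; m5 holds prep_8tap_1d_rnd, which
+ ; folds the -8192 intermediate bias into the rounding term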
+ movu xm0, [srcq+strideq*0+ 0]
+ vinserti128 m0, [srcq+strideq*1+ 0], 1
+ movu xm2, [srcq+strideq*0+16]
+ vinserti128 m2, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ shufpd m1, m0, m2, 0x05
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ add wd, wd
+.h_w16_loop0:
+ mov r6d, wd
+.h_w16_loop:
+ movu m0, [srcq+r6-32]
+ movu m1, [srcq+r6-24]
+ movu m2, [srcq+r6-16]
+ PREP_8TAP_H 0, 1, 2, 3, 4
+ mova [tmpq+r6-32], m0
+ sub r6d, 32
+ jg .h_w16_loop
+ add srcq, strideq
+ add tmpq, wq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 15
+ vpbroadcastd m7, [prep_8tap_1d_rnd]
+ lea r6, [strideq*3]
+ sub srcq, r6
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m0, 2
+.v_12bpc:
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ cmp wd, 4
+ jg .v_w8
+.v_w4:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m0, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m4, [srcq+r6 ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpblendd m1, m0, 0x30
+ vpblendd m0, m2, 0x30
+ punpcklwd m1, m0 ; 01 12
+ vpbroadcastq m0, [srcq+strideq*2]
+ add srcq, r6
+ vpblendd m2, m4, 0x30
+ vpblendd m4, m3, 0x30
+ punpcklwd m2, m4 ; 23 34
+ vpblendd m3, m5, 0x30
+ vpblendd m5, m0, 0x30
+ punpcklwd m3, m5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq m4, [srcq+strideq*0]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m7
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 4
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ mova [tmpq], xm5
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+%if WIN64
+ push r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ mov r5, srcq
+ mov r7, tmpq
+ lea wd, [hq+wq-256]
+.v_w8_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m0, [srcq+r6 ]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m3, [srcq+strideq*2]
+ add srcq, r6
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklwd m3, m6, m0 ; 23
+ punpckhwd m6, m0 ; 56
+.v_w8_loop:
+ vbroadcasti128 m14, [srcq+strideq*0]
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m7
+ paddd m13, m7
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ paddd m13, m6
+ shufpd m6, m0, m14, 0x0d
+ shufpd m0, m14, m5, 0x0c
+ punpcklwd m5, m6, m0 ; 67
+ punpckhwd m6, m0 ; 78
+ pmaddwd m14, m11, m5 ; a3
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ vpermq m12, m12, q3120
+ mova [tmpq+r8*0], xm12
+ vextracti128 [tmpq+r8*2], m12, 1
+ lea tmpq, [tmpq+r8*4]
+ sub hd, 2
+ jg .v_w8_loop
+ add r5, 16
+ add r7, 16
+ movzx hd, wb
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8
+ jg .v_w8_loop0
+%if WIN64
+ pop r8
+%endif
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ vpbroadcastd m15, [prep_8tap_2d_rnd]
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ vpbroadcastq m1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2
+ sub srcq, r6
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m1, m1
+ psraw m7, 4
+ psraw m1, 8
+ test dword r7m, 0x800
+ jz .hv_w4_10bit
+ psraw m7, 2
+.hv_w4_10bit:
+ pshufd m11, m1, q0000
+ pshufd m12, m1, q1111
+ pshufd m13, m1, q2222
+ pshufd m14, m1, q3333
+.hv_w4:
+ vbroadcasti128 m9, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ pshufd m8, m7, q1111
+ pshufd m7, m7, q0000
+ movu xm1, [srcq+strideq*0]
+ vinserti128 m1, [srcq+strideq*1], 1 ; 0 1
+ vbroadcasti128 m0, [srcq+r6 ]
+ vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 4
+ movu xm3, [srcq+strideq*1]
+ vinserti128 m3, [srcq+strideq*2], 1 ; 5 6
+ add srcq, r6
+ pshufb m4, m1, m9
+ pshufb m1, m10
+ pmaddwd m4, m7
+ pmaddwd m1, m8
+ pshufb m5, m2, m9
+ pshufb m2, m10
+ pmaddwd m5, m7
+ pmaddwd m2, m8
+ paddd m4, m15
+ paddd m1, m4
+ pshufb m4, m0, m9
+ pshufb m0, m10
+ pmaddwd m4, m7
+ pmaddwd m0, m8
+ paddd m5, m15
+ paddd m2, m5
+ pshufb m5, m3, m9
+ pshufb m3, m10
+ pmaddwd m5, m7
+ pmaddwd m3, m8
+ paddd m4, m15
+ paddd m4, m0
+ paddd m5, m15
+ paddd m5, m3
+ vperm2i128 m0, m1, m2, 0x21
+ psrld m1, 6
+ psrld m2, 6
+ vperm2i128 m3, m4, m5, 0x21
+ pslld m4, 10
+ pslld m5, 10
+ pblendw m2, m4, 0xaa ; 23 34
+ pslld m0, 10
+ pblendw m1, m0, 0xaa ; 01 12
+ psrld m3, 6
+ pblendw m3, m5, 0xaa ; 45 56
+ psrad m0, m5, 16
+.hv_w4_loop:
+ movu xm4, [srcq+strideq*0]
+ vinserti128 m4, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m15
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ pshufb m3, m4, m9
+ pshufb m4, m10
+ pmaddwd m3, m7
+ pmaddwd m4, m8
+ paddd m3, m15
+ paddd m4, m3
+ psrad m4, 6
+ packssdw m0, m4 ; _ 7 6 8
+ vpermq m3, m0, q1122 ; _ 6 _ 7
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m4, m5
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ vpbroadcastq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ pmovsxbw xm1, [base+subpel_filters+myq*8]
+%if WIN64
+ PUSH r8
+%endif
+ mov r8d, wd
+ shl wd, 5
+ lea r6, [strideq*3]
+ sub srcq, 6
+ sub srcq, r6
+ mov r5, srcq
+ mov r7, tmpq
+ lea wd, [hq+wq-256]
+ pxor m0, m0
+ punpcklbw m0, m2
+ mova [v_mul], xm1
+ psraw m0, 4
+ test dword r7m, 0x800
+ jz .hv_w8_10bit
+ psraw m0, 2
+.hv_w8_10bit:
+ pshufd m11, m0, q0000
+ pshufd m12, m0, q1111
+ pshufd m13, m0, q2222
+ pshufd m14, m0, q3333
+.hv_w8_loop0:
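+; Horizontal 8-tap filter helper for the .hv_w8 path; the three operands hold
+; the source row at offsets +0, +8 and +16, the packed result ends up in the
+; first operand.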
+%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
+ pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
+ pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
+ pmaddwd m3, m12, m2
+ pmaddwd m%1, m11
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
+ paddd m3, m15
+ paddd m%1, m3
+ pmaddwd m3, m14, m%2
+ paddd m%1, m3
+ pmaddwd m3, m13, m2
+ pshufb m%3, m9 ; a b b c c d d e
+ pmaddwd m2, m11
+ paddd m%1, m3
+ pmaddwd m3, m12, m%2
+ shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
+ pmaddwd m%3, m14
+ pmaddwd m%2, m13
+ paddd m2, m15
+ paddd m2, m3
+ paddd m2, m%3
+ paddd m2, m%2
+ psrad m%1, 6
+ psrad m2, 6
+ packssdw m%1, m2
+%endmacro
+ movu xm4, [srcq+r6 + 0]
+ vbroadcasti128 m8, [subpel_h_shufA]
+ movu xm6, [srcq+r6 + 8]
+ vbroadcasti128 m9, [subpel_h_shufB]
+ movu xm0, [srcq+r6 +16]
+ movu xm5, [srcq+strideq*0+ 0]
+ vinserti128 m5, [srcq+strideq*4+ 0], 1
+ movu xm1, [srcq+strideq*0+16]
+ vinserti128 m1, [srcq+strideq*4+16], 1
+ shufpd m7, m5, m1, 0x05
+ INIT_XMM avx2
+ PREP_8TAP_HV_H 4, 6, 0 ; 3
+ INIT_YMM avx2
+ PREP_8TAP_HV_H 5, 7, 1 ; 0 4
+ movu xm0, [srcq+strideq*2+ 0]
+ vinserti128 m0, [srcq+r6 *2+ 0], 1
+ movu xm1, [srcq+strideq*2+16]
+ vinserti128 m1, [srcq+r6 *2+16], 1
+ shufpd m7, m0, m1, 0x05
+ PREP_8TAP_HV_H 0, 7, 1 ; 2 6
+ movu xm6, [srcq+strideq*1+ 0]
+ movu xm1, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m6, [srcq+strideq*1+ 0], 1
+ vinserti128 m1, [srcq+strideq*1+16], 1
+ add srcq, r6
+ shufpd m7, m6, m1, 0x05
+ PREP_8TAP_HV_H 6, 7, 1 ; 1 5
+ vpermq m4, m4, q1100
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ vpermq m7, m0, q3120
+ punpcklwd m3, m7, m4 ; 23
+ punpckhwd m4, m5 ; 34
+ punpcklwd m1, m5, m6 ; 01
+ punpckhwd m5, m6 ; 45
+ punpcklwd m2, m6, m7 ; 12
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vpbroadcastd m9, [v_mul+4*0]
+ vpbroadcastd m7, [v_mul+4*1]
+ vpbroadcastd m10, [v_mul+4*2]
+ pmaddwd m8, m9, m1 ; a0
+ pmaddwd m9, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m7 ; a1
+ pmaddwd m4, m7 ; b1
+ paddd m8, m15
+ paddd m9, m15
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ movu xm5, [srcq+strideq*0]
+ vinserti128 m5, [srcq+strideq*1], 1
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m10, [subpel_h_shufB]
+ movu xm6, [srcq+strideq*0+16]
+ vinserti128 m6, [srcq+strideq*1+16], 1
+ vextracti128 [tmpq], m0, 1
+ pshufb m0, m5, m7 ; 01
+ pshufb m5, m10 ; 23
+ pmaddwd m0, m11
+ pmaddwd m5, m12
+ paddd m0, m15
+ paddd m0, m5
+ pshufb m5, m6, m7 ; 89
+ pshufb m6, m10 ; ab
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ paddd m5, m15
+ paddd m6, m5
+ movu xm5, [srcq+strideq*0+8]
+ vinserti128 m5, [srcq+strideq*1+8], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m7, m5, m7
+ pshufb m5, m10
+ pmaddwd m10, m13, m7
+ pmaddwd m7, m11
+ paddd m0, m10
+ paddd m6, m7
+ pmaddwd m7, m14, m5
+ pmaddwd m5, m12
+ paddd m0, m7
+ paddd m5, m6
+ vbroadcasti128 m6, [tmpq]
+ vpbroadcastd m10, [v_mul+4*3]
+ psrad m0, 6
+ psrad m5, 6
+ packssdw m0, m5
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m10, m5 ; a3
+ pmaddwd m10, m6 ; b3
+ paddd m7, m8
+ paddd m9, m10
+ psrad m7, 6
+ psrad m9, 6
+ packssdw m7, m9
+ vpermq m7, m7, q3120
+ mova [tmpq+r8*0], xm7
+ vextracti128 [tmpq+r8*2], m7, 1
+ lea tmpq, [tmpq+r8*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 16
+ add r7, 16
+ movzx hd, wb
+ mov srcq, r5
+ mov tmpq, r7
+ sub wd, 1<<8
+ jg .hv_w8_loop0
+%if WIN64
+ POP r8
+%endif
+ RET
+
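+; mov that is only emitted when assembling the prep variant (isprep).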
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
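+; Alias register r%1 (and its q/d forms) to r%2.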
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
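+; For prep, shift every register alias down by one (rN -> rN-1) so the same
+; code can be shared with put; the original r14 is saved for later restoring.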
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
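+; Undo the above remapping (rN -> rN+1) and restore r14.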
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
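+; Return with the default register mapping; unless told otherwise the prep
+; mapping is reinstated afterwards for the code assembled below the RET.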
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
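+; Horizontal 8-tap filter for the scaled paths: gather two source rows from the
+; per-column offsets in r4/r6/r7/r9 (low lane) and r10/r11/r13/rX (high lane),
+; apply the per-column filters in m12-m15, then round (m10), shift (xm11) and
+; pack both rows into the first register.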
+%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
+ movu xm%1, [srcq+ r4*2]
+ movu xm%2, [srcq+ r6*2]
+ movu xm%3, [srcq+ r7*2]
+ movu xm%4, [srcq+ r9*2]
+ vinserti128 m%1, [srcq+r10*2], 1
+ vinserti128 m%2, [srcq+r11*2], 1
+ vinserti128 m%3, [srcq+r13*2], 1
+ vinserti128 m%4, [srcq+ rX*2], 1
+ add srcq, ssq
+ movu xm%5, [srcq+ r4*2]
+ movu xm%6, [srcq+ r6*2]
+ movu xm%7, [srcq+ r7*2]
+ movu xm%8, [srcq+ r9*2]
+ vinserti128 m%5, [srcq+r10*2], 1
+ vinserti128 m%6, [srcq+r11*2], 1
+ vinserti128 m%7, [srcq+r13*2], 1
+ vinserti128 m%8, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m%1, m12
+ pmaddwd m%2, m13
+ pmaddwd m%3, m14
+ pmaddwd m%4, m15
+ pmaddwd m%5, m12
+ pmaddwd m%6, m13
+ pmaddwd m%7, m14
+ pmaddwd m%8, m15
+ phaddd m%1, m%2
+ %if %9
+ mova m10, [rsp+0x00]
+ %endif
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3
+ phaddd m%5, m%7
+ paddd m%1, m10
+ paddd m%5, m10
+ psrad m%1, xm11
+ psrad m%5, xm11
+ packssdw m%1, m%5
+%endmacro
+
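+; Generates put_8tap_scaled_16bpc or prep_8tap_scaled_16bpc (AVX2); vertical
+; steps of exactly 1 (dy == 1024) and 2 (dy == 2048) rows take the dedicated
+; .dy1/.dy2 paths, everything else uses the generic per-row filter selection.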
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine base_reg r12
+ mov r7d, pxmaxm
+%else
+ %assign isput 0
+ %assign isprep 1
+cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [rsp+0xd0]
+ %xdefine base_reg r11
+%endif
+ lea base_reg, [%1_8tap_scaled_16bpc_avx2]
+%define base base_reg-%1_8tap_scaled_16bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm10, mxd
+ vpbroadcastd m10, xm10
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ mov r6d, pxmaxm
+%else
+ vpbroadcastd m10, mxm
+ %if isput
+ vpbroadcastw m11, pxmaxm
+ %else
+ mov r6d, pxmaxm
+ %endif
+%endif
+ mov dyd, dym
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+0x98]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x98]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ shr r7d, 11
+ vpbroadcastd m6, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4]
+ movd xm7, [base+s_8tap_h_sh+r7*4]
+%if isput
+ vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4]
+ pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2
+%else
+ vpbroadcastd m13, [base+pd_m524256]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0,1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1 ; 0 4
+ vinserti128 m1, [srcq+ssq*1], 1 ; 1 5
+ vinserti128 m2, [srcq+ssq*2], 1 ; 2 6
+ vinserti128 m3, [srcq+ss3q ], 1 ; 3 7
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m10}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm14, r6q
+ pmovsxbw xm14, xm14
+ pshufd xm8, xm14, q0000
+ pshufd xm9, xm14, q1111
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pshufd xm8, xm14, q2222
+ pshufd xm14, xm14, q3333
+ paddd xm5, xm6
+ pmaddwd xm6, xm2, xm8
+ pmaddwd xm8, xm4, xm14
+ psrldq xm9, xm7, 8
+ paddd xm5, xm6
+ paddd xm5, xm13
+ paddd xm5, xm8
+ psrad xm5, xm9
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq], xm5
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movu xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm10
+ pmaddwd xm5, xm15
+ phaddd xm5, xm5
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movu xm6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm10
+ pshufb xm6, xm10
+ pmaddwd xm5, xm15
+ pmaddwd xm6, xm15
+ phaddd xm5, xm6
+ paddd xm5, xm12
+ psrad xm5, xm7
+ packssdw xm5, xm5 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ mova [rsp+0x00], m12
+%if isput
+ mova [rsp+0x20], xm13
+%else
+ SWAP m11, m13
+%endif
+ mova [rsp+0x30], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m0, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ movu xm1, [srcq+r4 ]
+ movu xm3, [srcq+r6 ]
+ movu xm2, [srcq+r11 ]
+ movu xm4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ vinserti128 m1, [srcq+r4 ], 1
+ vinserti128 m3, [srcq+r6 ], 1
+ vinserti128 m2, [srcq+r11 ], 1
+ vinserti128 m4, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m0
+ paddb m13, m0
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m14}, m7, m9, m8, m10
+ REPX {pshufb x, m13}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x30]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1 4 5
+ packssdw m8, m10 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm10, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm10 ; 67
+ mova [rsp+0x40], xm7
+ mova [rsp+0x50], xm8
+ mova [rsp+0x60], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq xm9, r11q
+ pmovsxbw xm9, xm9
+ pshufd xm7, xm9, q0000
+ pshufd xm8, xm9, q1111
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pshufd xm7, xm9, q2222
+ pshufd xm9, xm9, q3333
+ pmaddwd xm6, xm2, xm7
+ pmaddwd xm8, xm3, xm9
+%if isput
+ mova xm7, [rsp+0x20]
+ movd xm9, [rsp+0x38]
+%else
+ SWAP m7, m11
+%endif
+ paddd xm4, xm5
+ paddd xm6, xm8
+ paddd xm4, xm6
+ paddd xm4, xm7
+%if isput
+ psrad xm4, xm9
+ packusdw xm4, xm4
+ pminuw xm4, xm11
+ movq [dstq], xm4
+ add dstq, dsq
+%else
+ SWAP m11, m7
+ psrad xm4, 6
+ packssdw xm4, xm4
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova xm8, [rsp+0x00]
+ movd xm9, [rsp+0x30]
+ movu xm4, [srcq]
+ movu xm5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x40]
+ mova [rsp+0x40], xm1
+ mova xm1, [rsp+0x50]
+ mova [rsp+0x50], xm2
+ mova xm2, [rsp+0x60]
+ mova [rsp+0x60], xm3
+ pshufb xm4, xm12
+ pshufb xm5, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm5, xm15
+ phaddd xm4, xm5
+ paddd xm4, xm8
+ psrad xm4, xm9
+ packssdw xm4, xm4
+ punpcklwd xm3, xm10, xm4
+ mova xm10, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm6, [srcq+ssq*1]
+ movu xm7, [srcq+r6]
+ movu m0, [rsp+0x50]
+ pshufb xm4, xm12
+ pshufb xm6, xm12
+ pshufb xm5, xm13
+ pshufb xm7, xm13
+ pmaddwd xm4, xm14
+ pmaddwd xm6, xm14
+ pmaddwd xm5, xm15
+ pmaddwd xm7, xm15
+ mova [rsp+0x40], m0
+ phaddd xm4, xm5
+ phaddd xm6, xm7
+ paddd xm4, xm8
+ paddd xm6, xm8
+ psrad xm4, xm9
+ psrad xm6, xm9
+ packssdw xm4, xm6
+ punpcklwd xm9, xm10, xm4
+ mova [rsp+0x60], xm9
+ psrldq xm10, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+ SWAP m10, m13
+%if isprep
+ SWAP m13, m11
+%endif
+.w8:
+ mov dword [rsp+0x80], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x80], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x80], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x80], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x80], 16
+ movifprep tmp_stridem, 256
+.w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xb0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ shr t0d, 16
+ sub srcq, 6
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0x84], t0d
+ mov [rsp+0x88], srcq
+ mov [rsp+0x90], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+0x80]
+ jz .ret
+ add qword [rsp+0x90], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x40]
+ vpbroadcastd m15, [rsp+0x84]
+ pxor m9, m9
+ mov srcq, [rsp+0x88]
+ mov r0q, [rsp+0x90] ; dstq / tmpq
+.hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x40], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq [rsp+0xa0], xm1
+ movq [rsp+0xa8], xm7
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x60], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x60]
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m9 ; 01a 01b
+ pshufb m1, m9 ; 23a 23b
+ pshufb m2, m9 ; 45a 45b
+ pshufb m3, m9 ; 67a 67b
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm9, r6q
+ punpcklqdq xm9, xm9
+ pmovsxbw m9, xm9
+ pshufd m8, m9, q0000
+ pshufd m7, m9, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m7
+ pshufd m8, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m9
+%if isput
+ psrldq xm8, xm11, 8
+%endif
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, xm8
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xb0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x60], myd
+ mov r4d, [rsp+0xa0]
+ mov r6d, [rsp+0xa4]
+ mov r7d, [rsp+0xa8]
+ mov r9d, [rsp+0xac]
+ jz .skip_line
+ vbroadcasti128 m9, [base+wswap]
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m0, m9
+ pshufb m1, m9
+ pshufb m2, m9
+ pshufb m3, m9
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, m10
+ psrad m4, xm11
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1
+ vbroadcasti128 m9, [base+subpel_s_shuf8]
+ mov myd, [rsp+0x60]
+ mov dyd, dym
+ pshufb m3, m9
+ jmp .vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ movu xm2, [srcq+ssq*2]
+ movu xm3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*1], 1
+ vinserti128 m2, [srcq+ssq*2], 1
+ add srcq, ss3q
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ REPX {pshufb x, m10}, m0, m1, m2
+ pshufb xm3, xm10
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ pmaddwd xm3, xm15
+ phaddd m0, m1
+ phaddd m2, m3
+ paddd m0, m12
+ paddd m2, m12
+ psrad m0, xm7
+ psrad m2, xm7
+ packssdw m0, m2
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm1, xm10
+ pshufb xm5, xm10
+ pmaddwd xm1, xm15
+ pmaddwd xm5, xm15
+ phaddd xm1, xm5
+ pmaddwd xm5, xm3, xm8
+ mova xm3, xm0
+ pmaddwd xm0, xm9
+ paddd xm1, xm12
+ psrad xm1, xm7
+ packssdw xm1, xm1
+ paddd xm5, xm0
+ mova xm0, xm2
+ pmaddwd xm2, xm14
+ paddd xm5, xm2
+ palignr xm2, xm1, xm4, 12
+ punpcklwd xm2, xm1 ; 67 78
+ pmaddwd xm4, xm2, xm6
+ paddd xm5, xm13
+ paddd xm5, xm4
+ mova xm4, xm1
+ psrldq xm1, xm7, 8
+ psrad xm5, xm1
+ packusdw xm5, xm5
+ pminsw xm5, xm11
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ pcmpeqd m6, m9
+ punpckldq m10, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pblendvb m14, m2, m10
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*2]
+ lea r11, [r4+ssq*1]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*2]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ movu xm3, [srcq+ssq*2] ; 6 _
+ movu xm10, [srcq+r6 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r11 ], 1
+ lea srcq, [srcq+ss3q ]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pshufb m2, m12
+ pshufb xm3, xm12
+ pmaddwd m2, m14
+ pmaddwd xm3, xm14
+ pshufb m9, m13
+ pshufb xm10, xm13
+ pmaddwd m9, m15
+ pmaddwd xm10, xm15
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ phaddd xm3, xm10
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ paddd xm3, xm5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ psrad xm3, xm6
+ vperm2i128 m4, m0, m1, 0x21 ; 1 2
+ vperm2i128 m5, m1, m2, 0x21 ; 3 4
+ vperm2i128 m6, m2, m3, 0x21 ; 5 6
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pslld m4, 16
+ pslld m5, 16
+ pslld m6, 16
+ pblendw m0, m4, 0xaa ; 01 12
+ pblendw m1, m5, 0xaa ; 23 34
+ pblendw m2, m6, 0xaa ; 45 56
+ movq xm10, r13q
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ vinserti128 m11, [srcq+ssq*1], 1
+ vinserti128 m6, [srcq+r11 ], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufb m11, m12
+ pshufb m6, m13
+ pmaddwd m11, m14
+ pmaddwd m6, m15
+ paddd m4, [rsp+0x20]
+ phaddd m11, m6
+ pmaddwd m6, m2, m9
+ paddd m11, [rsp+0x00]
+ psrad m11, [rsp+0x40]
+ mova m0, m1
+ mova m1, m2
+ paddd m5, m6
+ paddd m4, m5
+ vinserti128 m2, m3, xm11, 1
+ pslld m3, m11, 16
+ pblendw m2, m3, 0xaa ; 67 78
+ pmaddwd m5, m2, m10
+ vextracti128 xm3, m11, 1
+ paddd m4, m5
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy1_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ %define dsm [rsp+0xb8]
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%else
+ %if UNIX64
+ %define hm [rsp+0xb8]
+ %endif
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy1_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ vbroadcasti128 m7, [base+wswap]
+ pshufb m0, m7
+ pshufb m1, m7
+ pshufb m2, m7
+ pshufb m3, m7
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m4, m6
+ paddd m4, [rsp+0x00]
+ psrad m4, [rsp+0x40]
+ pslld m4, 16
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+ SWAP m1, m12, m10
+ SWAP m7, m11
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m10, m8 ; mx+dx*[0-1]
+ vpbroadcastd xm14, [base+pq_0x40000000+2]
+ vpbroadcastd xm15, xm15
+ pand xm8, xm10, xm6
+ psrld xm8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_q]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm4, [base+subpel_filters+r6*8+2]
+ pcmpeqd xm8, xm9
+ psrld m10, 10
+ paddd m10, m10
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*2]
+ movu xm2, [srcq+ssq*4]
+ pshufb m10, m5
+ paddb m10, m6
+ vpblendd xm15, xm4, 0xa
+ pblendvb xm15, xm14, xm8
+ pmovsxbw m15, xm15
+ vinserti128 m0, [srcq+ssq*1], 1 ; 0 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 2 3
+ lea srcq, [srcq+ssq*4]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m0, m10
+ pshufb m1, m10
+ pshufb m2, m10
+ pmaddwd m0, m15
+ pmaddwd m1, m15
+ pmaddwd m2, m15
+ movq xm6, r4q
+ pmovsxbw xm6, xm6
+ phaddd m0, m1
+ phaddd m1, m2
+ paddd m0, m12
+ paddd m1, m12
+ psrad m0, xm7
+ psrad m1, xm7
+ packssdw m0, m1 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m0, 1
+ pshufd xm8, xm6, q0000
+ pshufd xm9, xm6, q1111
+ pshufd xm14, xm6, q2222
+ pshufd xm6, xm6, q3333
+ punpcklwd xm2, xm0, xm1 ; 01 23
+ punpckhwd xm1, xm0, xm1 ; 23 45
+.dy2_w2_loop:
+ movu xm3, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m3, [srcq+ssq*1], 1 ; 6 7
+ vinserti128 m5, [srcq+ss3q ], 1 ; 8 9
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm2, xm8
+ pmaddwd xm1, xm9
+ pshufb m3, m10
+ pshufb m5, m10
+ pmaddwd m3, m15
+ pmaddwd m5, m15
+ phaddd m3, m5
+ paddd xm4, xm1
+ paddd m3, m12
+ psrad m3, xm7
+ packssdw m3, m3
+ pshufd m3, m3, q2100
+ palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9
+ vextracti128 xm1, m0, 1
+ punpcklwd xm2, xm0, xm1 ; 45 67
+ punpckhwd xm1, xm0, xm1 ; 67 89
+ pmaddwd xm3, xm2, xm14
+ pmaddwd xm5, xm1, xm6
+ paddd xm4, xm13
+ paddd xm4, xm3
+ psrldq xm3, xm7, 8
+ paddd xm4, xm5
+ psrad xm4, xm3
+ packusdw xm4, xm4
+ pminsw xm4, xm11
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+%if isput
+ mova [rsp+0x50], xm11
+%endif
+ mova [rsp+0x00], m12
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm7
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastq m2, [base+pq_0x40000000+1]
+ vpbroadcastd xm15, xm15
+ SWAP m13, m10
+ paddd m13, m8 ; mx+dx*[0-3]
+ pand m6, m13
+ psrld m6, 6
+ paddd xm15, xm6
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ vbroadcasti128 m5, [base+bdct_lb_q+ 0]
+ vbroadcasti128 m1, [base+bdct_lb_q+16]
+ vbroadcasti128 m4, [base+subpel_s_shuf2]
+ vpbroadcastd xm14, [base+subpel_filters+r4*8+2]
+ vpbroadcastd xm7, [base+subpel_filters+r6*8+2]
+ vpbroadcastd xm15, [base+subpel_filters+r11*8+2]
+ vpbroadcastd xm8, [base+subpel_filters+r13*8+2]
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ pcmpeqd m6, m9
+ punpckldq m11, m6, m6
+ punpckhdq m6, m6
+ psrld m13, 10
+ paddd m13, m13
+ vpblendd xm14, xm7, 0xa
+ vpblendd xm15, xm8, 0xa
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ movq xm10, r13q
+ pblendvb m14, m2, m11
+ pblendvb m15, m2, m6
+ pextrd r4, xm13, 2
+ pshufb m12, m13, m5
+ pshufb m13, m1
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu xm0, [srcq+ssq*0]
+ movu xm7, [srcq+r4 ]
+ movu xm1, [srcq+ssq*1]
+ movu xm8, [srcq+r6 ]
+ vinserti128 m0, [srcq+ssq*2], 1 ; 0 2
+ vinserti128 m7, [srcq+r11 ], 1
+ vinserti128 m1, [srcq+ss3q ], 1 ; 1 3
+ vinserti128 m8, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ movu xm2, [srcq+ssq*0]
+ movu xm9, [srcq+r4 ]
+ vinserti128 m2, [srcq+ssq*1], 1 ; 4 5
+ vinserti128 m9, [srcq+r6 ], 1
+ lea srcq, [srcq+ssq*2]
+ vpbroadcastb m5, xm13
+ psubb m13, m5
+ paddb m12, m4
+ paddb m13, m4
+ mova m5, [rsp+0x00]
+ movd xm6, [rsp+0x40]
+ pshufb m0, m12
+ pshufb m1, m12
+ pshufb m2, m12
+ pmaddwd m0, m14
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pshufb m7, m13
+ pshufb m8, m13
+ pshufb m9, m13
+ pmaddwd m7, m15
+ pmaddwd m8, m15
+ pmaddwd m9, m15
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ phaddd m0, m7
+ phaddd m1, m8
+ phaddd m2, m9
+ paddd m0, m5
+ paddd m1, m5
+ paddd m2, m5
+ psrad m0, xm6
+ psrad m1, xm6
+ psrad m2, xm6
+ vperm2i128 m3, m0, m2, 0x21 ; 2 4
+ vperm2i128 m2, m1, 0x13 ; 3 5
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ packssdw m0, m3 ; 0 2 2 4
+ packssdw m1, m2 ; 1 3 3 5
+ punpckhwd m2, m0, m1 ; 23 45
+ punpcklwd m0, m1 ; 01 23
+.dy2_w4_loop:
+ movu xm1, [srcq+ssq*0]
+ movu xm6, [srcq+r4 ]
+ movu xm3, [srcq+ssq*1]
+ movu xm11, [srcq+r6 ]
+ vinserti128 m1, [srcq+ssq*2], 1 ; 6 8
+ vinserti128 m6, [srcq+r11 ], 1
+ vinserti128 m3, [srcq+ss3q ], 1 ; 7 9
+ vinserti128 m11, [srcq+r13 ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m2, m8
+ pshufb m1, m12
+ pshufb m3, m12
+ pmaddwd m1, m14
+ pmaddwd m3, m14
+ mova m0, [rsp+0x00]
+ pshufb m6, m13
+ pshufb m11, m13
+ pmaddwd m6, m15
+ pmaddwd m11, m15
+ paddd m4, m5
+ movd xm5, [rsp+0x40]
+ phaddd m1, m6
+ phaddd m3, m11
+ paddd m1, m0
+ paddd m3, m0
+ psrad m1, xm5
+ psrad m3, xm5
+ pslld m3, 16
+ pblendw m1, m3, 0xaa ; 67 89
+ vperm2i128 m0, m2, m1, 0x21 ; 45 67
+ paddd m4, [rsp+0x20]
+ mova m2, m1
+ pmaddwd m5, m0, m9
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0x50]
+ movq [dstq+dsq*0], xm4
+ movhps [dstq+dsq*1], xm4
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+ SWAP m10, m13
+.dy2_w8:
+ mov dword [rsp+0xa0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0xa0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0xa0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0xa0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0xa0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ SWAP m10, m12, m1
+ SWAP m11, m7
+ ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free
+ mov myd, mym
+%if isput
+ movifnidn dsm, dsq
+ mova [rsp+0xc0], xm7
+%endif
+ mova [rsp+0x00], m10
+ mova [rsp+0x20], m13
+ mova [rsp+0x40], xm11
+ shr t0d, 16
+ sub srcq, 6
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul2]
+ movd xm15, t0d
+ mov [rsp+0xa4], t0d
+ mov [rsp+0xa8], srcq
+ mov [rsp+0xb0], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m1, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0xa0]
+ jz .ret
+ add qword [rsp+0xb0], 16
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m6, [base+pd_0x3ff]
+ paddd m1, m8, [rsp+0x60]
+ vpbroadcastd m15, [rsp+0xa4]
+ pxor m9, m9
+ mov srcq, [rsp+0xa8]
+ mov r0q, [rsp+0xb0] ; dstq / tmpq
+ mova m10, [rsp+0x00]
+ mova xm11, [rsp+0x40]
+.dy2_hloop:
+ vpbroadcastq xm2, [base+pq_0x40000000]
+ pand m5, m1, m6
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ vextracti128 xm7, m15, 1
+ movq r6, xm15
+ pextrq r9, xm15, 1
+ movq r11, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mova [rsp+0x60], m1
+ movq xm12, [base+subpel_filters+ r4*8]
+ movq xm13, [base+subpel_filters+ r6*8]
+ movhps xm12, [base+subpel_filters+ r7*8]
+ movhps xm13, [base+subpel_filters+ r9*8]
+ movq xm14, [base+subpel_filters+r10*8]
+ movq xm15, [base+subpel_filters+r11*8]
+ movhps xm14, [base+subpel_filters+r13*8]
+ movhps xm15, [base+subpel_filters+ rX*8]
+ psrld m1, 10
+ vextracti128 xm7, m1, 1
+ vextracti128 xm6, m5, 1
+ movq r6, xm1
+ pextrq r11, xm1, 1
+ movq r9, xm7
+ pextrq rX, xm7, 1
+ mov r4d, r6d
+ shr r6, 32
+ mov r10d, r11d
+ shr r11, 32
+ mov r7d, r9d
+ shr r9, 32
+ mov r13d, rXd
+ shr rX, 32
+ pshufd xm4, xm5, q2200
+ pshufd xm5, xm5, q3311
+ pshufd xm7, xm6, q2200
+ pshufd xm6, xm6, q3311
+ pblendvb xm12, xm2, xm4
+ pblendvb xm13, xm2, xm5
+ pblendvb xm14, xm2, xm7
+ pblendvb xm15, xm2, xm6
+ pmovsxbw m12, xm12
+ pmovsxbw m13, xm13
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ mova [rsp+0x80], m0
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b
+ mova m0, [rsp+0x80]
+ vbroadcasti128 m7, [base+subpel_s_shuf8]
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m10, [rsp+0x58]
+ vpbroadcastd m11, [rsp+0x5c]
+ pshufb m0, m7 ; 01a 01b
+ pshufb m1, m7 ; 23a 23b
+ pshufb m2, m7 ; 45a 45b
+ pshufb m3, m7 ; 67a 67b
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, [rsp+0x20]
+ paddd m6, m7
+ paddd m4, m5
+ paddd m4, m6
+%if isput
+ psrad m4, [rsp+0x48]
+ vextracti128 xm5, m4, 1
+ packusdw xm4, xm5
+ pminsw xm4, [rsp+0xc0]
+ mova [dstq], xm4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movu xm3, [srcq+ r4*2]
+ movu xm4, [srcq+ r6*2]
+ movu xm5, [srcq+ r7*2]
+ movu xm6, [srcq+ r9*2]
+ vinserti128 m3, [srcq+r10*2], 1
+ vinserti128 m4, [srcq+r11*2], 1
+ vinserti128 m5, [srcq+r13*2], 1
+ vinserti128 m6, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m3, m12
+ pmaddwd m4, m13
+ pmaddwd m5, m14
+ pmaddwd m6, m15
+ phaddd m3, m4
+ phaddd m5, m6
+ phaddd m3, m5
+ movu xm4, [srcq+ r4*2]
+ movu xm5, [srcq+ r6*2]
+ movu xm6, [srcq+ r7*2]
+ movu xm7, [srcq+ r9*2]
+ vinserti128 m4, [srcq+r10*2], 1
+ vinserti128 m5, [srcq+r11*2], 1
+ vinserti128 m6, [srcq+r13*2], 1
+ vinserti128 m7, [srcq+ rX*2], 1
+ add srcq, ssq
+ pmaddwd m4, m12
+ pmaddwd m5, m13
+ pmaddwd m6, m14
+ pmaddwd m7, m15
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [rsp+0x00]
+ movd xm7, [rsp+0x40]
+ phaddd m4, m6
+ paddd m3, m5
+ paddd m4, m5
+ psrad m3, xm7
+ psrad m4, xm7
+ pslld m4, 16
+ pblendw m3, m4, 0xaa
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isput
+%undef isprep
+%endmacro
+
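+; Bilinear scaled MC reuses the 8-tap scaled code with fixed filter indices.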
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_16bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
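+; Vertical pass of the affine warp: per-column filters are selected by my
+; (stepped by delta, with gamma added once per row) and applied to the row-pair
+; registers %2-%5, which are rotated afterwards; the sum is left in the dst reg.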
+%macro WARP_V 5 ; dst, 01, 23, 45, 67
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ punpcklwd m8, m0
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
+ punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ pmaddwd m%2, m8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m0, m%5
+ paddd m9, m%2
+ mova m%2, m%3
+ paddd m0, m8
+ mova m%3, m%4
+ mova m%4, m%5
+ paddd m%1, m0, m9
+%endmacro
+
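+; Prep-style variant of warp_affine_8x8: shares .main with the put version but
+; rounds to the intermediate format instead of clipping to pixel_max.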
+cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts
+ mov r6d, r7m
+ lea r9, [$$]
+ shr r6d, 11
+ vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4]
+ vpbroadcastd m14, [warp8x8t_rnd]
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+.start:
+ paddd m7, m14
+ paddd m0, m14
+ psrad m7, 15
+ psrad m0, 15
+ packssdw m7, m0
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+ jg .loop
+.end:
+ RET
+
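+; 8x8 affine warp: .h filters one source row horizontally (filters selected by
+; mx, stepped by alpha, with beta added per row), .main/.main2 accumulate the
+; vertical filter via WARP_V, and the result is scaled and clipped to pixel_max.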
+cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \
+ alpha, beta, filter, tmp1, delta, \
+ my, gamma
+ mov r6d, r7m
+ lea filterq, [$$]
+ shr r6d, 11
+ vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4]
+ vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4]
+ vpbroadcastw m15, r7m ; pixel_max
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 16
+ psrad m0, 16
+ packusdw m7, m0
+ pmulhrsw m7, m14
+ pminsw m7, m15
+ vpermq m7, m7, q3120
+ mova [dstq+dsq*0], xm7
+ vextracti128 [dstq+dsq*1], m7, 1
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ vpbroadcastd m12, [pd_32768]
+ pxor m11, m11
+ add filterq, mc_warp_filter-$$
+ lea tmp1q, [ssq*3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m
+ call .h
+ psrld m1, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 01
+ psrld m2, m0, 16
+ call .h
+ pblendw m2, m0, 0xaa ; 12
+ psrld m3, m0, 16
+ call .h
+ pblendw m3, m0, 0xaa ; 23
+ psrld m4, m0, 16
+ call .h
+ pblendw m4, m0, 0xaa ; 34
+ psrld m5, m0, 16
+ call .h
+ pblendw m5, m0, 0xaa ; 45
+ psrld m6, m0, 16
+ call .h
+ pblendw m6, m0, 0xaa ; 56
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10)
+ mov r4d, 4
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2:
+ call .h
+ psrld m7, m6, 16
+ pblendw m7, m0, 0xaa ; 67
+ WARP_V 7, 1, 3, 5, 7
+ call .h
+ psrld m10, m5, 16
+ pblendw m10, m0, 0xaa ; 78
+ WARP_V 0, 2, 4, 6, 10
+ ret
+ALIGN function_align
+.h:
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ movu xm10, [srcq-6]
+ vinserti128 m10, [srcq+2], 1
+ shr mxd, 10 ; 0
+ shr tmp1d, 10 ; 4
+ movq xm0, [filterq+mxq *8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ movu xm8, [srcq-4]
+ vinserti128 m8, [srcq+4], 1
+ shr tmp2d, 10 ; 1
+ shr tmp1d, 10 ; 5
+ movq xm9, [filterq+tmp2q*8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10 ; 2
+ shr tmp1d, 10 ; 6
+ punpcklbw m0, m11, m0
+ pmaddwd m0, m10
+ movu xm10, [srcq-2]
+ vinserti128 m10, [srcq+6], 1
+ punpcklbw m9, m11, m9
+ pmaddwd m9, m8
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ phaddd m0, m9 ; 0 1 4 5
+ movu xm9, [srcq+0]
+ vinserti128 m9, [srcq+8], 1
+ shr tmp2d, 10 ; 3
+ shr tmp1d, 10 ; 7
+ punpcklbw m8, m11, m8
+ pmaddwd m8, m10
+ movq xm10, [filterq+tmp2q*8]
+ vinserti128 m10, [filterq+tmp1q*8], 1
+ punpcklbw m10, m11, m10
+ pmaddwd m9, m10
+ add srcq, ssq
+ phaddd m8, m9 ; 2 3 6 7
+ phaddd m0, m8 ; 0 1 2 3 4 5 6 7
+ vpsllvd m0, m13
+ paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword
+ ret
+
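+; Shared store loop for the bidirectional compound functions: each .main call
+; produces four output registers (m0-m3) which are written out according to
+; the block width.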
+%macro BIDIR_FN 0
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ cmp hd, 8
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.ret:
+ RET
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ cmp hd, 4
+ jne .w8_loop_start
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+.w8_loop_start:
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
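+; Average of the two intermediate buffers: saturated add, then the bitdepth-
+; dependent bidir_rnd/bidir_mul constants convert the sum back to pixels.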
+cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx2_table
+ lea r6, [avg_avx2_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ vpbroadcastd m4, [base+bidir_rnd+t0*4]
+ vpbroadcastd m5, [base+bidir_mul+t0*4]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+32*0]
+ paddsw m0, [tmp2q+32*0]
+ mova m1, [tmp1q+32*1]
+ paddsw m1, [tmp2q+32*1]
+ mova m2, [tmp1q+32*2]
+ paddsw m2, [tmp2q+32*2]
+ mova m3, [tmp1q+32*3]
+ paddsw m3, [tmp2q+32*3]
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pmaxsw m2, m4
+ pmaxsw m3, m4
+ psubsw m0, m4
+ psubsw m1, m4
+ psubsw m2, m4
+ psubsw m3, m4
+ pmulhw m0, m5
+ pmulhw m1, m5
+ pmulhw m2, m5
+ pmulhw m3, m5
+ ret
+
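+; Weighted average: tmp1*weight + tmp2*(16-weight), rounded, shifted and
+; clipped to pixel_max.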
+cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3
+ lea r6, [w_avg_avx2_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; weight
+ vpbroadcastw m8, r7m ; pixel_max
+ vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538]
+ movsxd wq, [r6+wq*4]
+ paddw m7, m8
+ add wq, r6
+ lea r6d, [t0-16]
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ pslld m7, 7
+ rorx r6d, t0d, 30 ; << 2
+ test dword r7m, 0x800
+ cmovz r6d, t0d
+ movifnidn hd, hm
+ movd xm6, r6d
+ vpbroadcastd m6, xm6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m4, [tmp1q+32*0]
+ mova m0, [tmp2q+32*0]
+ punpckhwd m5, m0, m4
+ punpcklwd m0, m4
+ mova m4, [tmp1q+32*1]
+ mova m1, [tmp2q+32*1]
+ pmaddwd m5, m6
+ pmaddwd m0, m6
+ paddd m5, m7
+ paddd m0, m7
+ psrad m5, 8
+ psrad m0, 8
+ packusdw m0, m5
+ punpckhwd m5, m1, m4
+ punpcklwd m1, m4
+ mova m4, [tmp1q+32*2]
+ mova m2, [tmp2q+32*2]
+ pmaddwd m5, m6
+ pmaddwd m1, m6
+ paddd m5, m7
+ paddd m1, m7
+ psrad m5, 8
+ psrad m1, 8
+ packusdw m1, m5
+ punpckhwd m5, m2, m4
+ punpcklwd m2, m4
+ mova m4, [tmp1q+32*3]
+ mova m3, [tmp2q+32*3]
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ pmaddwd m5, m6
+ pmaddwd m2, m6
+ paddd m5, m7
+ paddd m2, m7
+ psrad m5, 8
+ psrad m2, 8
+ packusdw m2, m5
+ punpckhwd m5, m3, m4
+ punpcklwd m3, m4
+ pmaddwd m5, m6
+ pmaddwd m3, m6
+ paddd m5, m7
+ paddd m3, m7
+ psrad m5, 8
+ psrad m3, 8
+ packusdw m3, m5
+ pminsw m0, m8
+ pminsw m1, m8
+ pminsw m2, m8
+ pminsw m3, m8
+ ret
+
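+; Per-pixel blend of the two intermediates with an 8-bit mask in 0..64:
+; (tmp1*m + tmp2*(64-m)) >> 5, then the bidir rounding/scaling as in avg.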
+cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx2_table
+ lea r7, [mask_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_64]
+ vpbroadcastd m9, [base+bidir_rnd+r6*4]
+ vpbroadcastd m10, [base+bidir_mul+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ BIDIR_FN
+ALIGN function_align
+.main:
+%macro MASK 1
+ pmovzxbw m5, [maskq+16*%1]
+ mova m%1, [tmp1q+32*%1]
+ mova m6, [tmp2q+32*%1]
+ punpckhwd m4, m%1, m6
+ punpcklwd m%1, m6
+ psubw m7, m8, m5
+ punpckhwd m6, m5, m7 ; m, 64-m
+ punpcklwd m5, m7
+ pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m%1, m5
+ psrad m4, 5
+ psrad m%1, 5
+ packssdw m%1, m4
+ pmaxsw m%1, m9
+ psubsw m%1, m9
+ pmulhw m%1, m10
+%endmacro
+ MASK 0
+ MASK 1
+ MASK 2
+ MASK 3
+ add maskq, 16*4
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
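+; Like mask, but the blend mask is derived from |tmp1 - tmp2| and written back
+; 2x2-subsampled (4:2:0), with the sign argument adjusting the rounding.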
+cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd xm0, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ movd xm14, [base+pw_2]
+ mov maskq, maskmp
+ psubw xm14, xm0
+ vpbroadcastw m14, xm14
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ phaddd m4, m5
+ paddw m4, m14
+ psrlw m4, 2
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+ punpcklwd xm4, xm5
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ mova [maskq], xm4
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vperm2i128 m6, m4, m5, 0x21
+ vpblendd m4, m5, 0xf0
+ paddw m4, m14
+ paddw m4, m6
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ mova [maskq], xm4
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ punpcklqdq m6, m4, m5
+ punpckhqdq m4, m5
+ paddw m6, m14
+ paddw m4, m6
+ psrlw m4, 2
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ pshufd xm4, xm4, q3120
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ mova [maskq], xm4
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w32:
+ paddw m4, m14
+ paddw m4, m5
+ psrlw m15, m4, 2
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ call .main
+ mova m6, [deint_shuf]
+ paddw m4, m14
+ paddw m4, m5
+ psrlw m4, 2
+ packuswb m15, m4
+ vpermd m4, m6, m15
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m2
+ mova [dstq+stride3q +32*1], m3
+ mova [maskq], m4
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w64:
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq], m4 ; no available registers
+ call .main
+ paddw m4, [maskq]
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5 ; 0 2 4 6 1 3 5 7
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq], m4
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+.w128:
+ paddw m4, m14
+ paddw m5, m14
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [maskq+32*0], m4
+ mova [dstq+strideq], m5
+ call .main
+ paddw m4, m14
+ paddw m15, m14, m5
+ mova [dstq+strideq*0+32*4], m0
+ mova [dstq+strideq*0+32*5], m1
+ mova [dstq+strideq*0+32*6], m2
+ mova [dstq+strideq*0+32*7], m3
+ mova [maskq+32*1], m4
+ call .main
+ paddw m4, [maskq+32*0]
+ paddw m5, [dstq+strideq]
+ mova m6, [deint_shuf]
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ mova [maskq+32*0], m4
+ call .main
+ paddw m4, [maskq+32*1]
+ mova m6, [deint_shuf]
+ paddw m5, m15
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m6, m4
+ mova [dstq+strideq*1+32*4], m0
+ mova [dstq+strideq*1+32*5], m1
+ mova [dstq+strideq*1+32*6], m2
+ mova [dstq+strideq*1+32*7], m3
+ mova [maskq+32*1], m4
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul
+ mova m%1, [tmp1q+32*%1]
+ mova m%2, [tmp2q+32*%1]
+ punpcklwd m8, m%2, m%1
+ punpckhwd m9, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1
+ psubusw m7, m10, m%1
+ psrlw m7, 10 ; 64-m
+ psubw m%2, m%3, m7 ; m
+ punpcklwd m%1, m7, m%2
+ punpckhwd m7, m%2
+ pmaddwd m%1, m8
+ pmaddwd m7, m9
+ psrad m%1, 5
+ psrad m7, 5
+ packssdw m%1, m7
+ pmaxsw m%1, m%4
+ psubsw m%1, m%4
+ pmulhw m%1, m%5
+%endmacro
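+; Scalar sketch of W_MASK (illustrative only): with d = abs(tmp1 - tmp2),
+;   m  = 64 - ((27615 - min(d, 27615)) >> 10) ; blend weight, 38..64
+;   px = (tmp1 * m + tmp2 * (64 - m)) >> 5    ; then bidir_rnd/bidir_mul scaling
+; The m values are afterwards 2x2-averaged, (sum + 2 - sign) >> 2, to form
+; the 4:2:0 mask plane written to maskq.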
+ W_MASK 0, 4
+ W_MASK 1, 5
+ phaddw m4, m5
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ ret
+
+cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ vpbroadcastb m14, r7m ; sign
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+bidir_rnd+r6*4]
+ vpbroadcastd m13, [base+bidir_mul+r6*4]
+ mova m15, [base+deint_shuf]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti128 xm2, m2, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm3
+ movhps [dstq+strideq*1], xm3
+ vextracti128 xm3, m3, 1
+ movq [dstq+strideq*2], xm3
+ movhps [dstq+stride3q ], xm3
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm2
+ vextracti128 [dstq+strideq*1], m2, 1
+ mova [dstq+strideq*2], xm3
+ vextracti128 [dstq+stride3q ], m3, 1
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+stride3q ], m3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m2
+ mova [dstq+strideq*1+32*1], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ mova [dstq+32*6], m2
+ mova [dstq+32*7], m3
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 4
+ W_MASK 1, 5
+ phaddw m4, m5
+ W_MASK 2, 5
+ W_MASK 3, 6
+ phaddw m5, m6
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ packuswb m4, m5
+ pxor m5, m5
+ psubb m4, m14
+ pavgb m4, m5
+ vpermd m4, m15, m4
+ mova [maskq], m4
+ add maskq, 32
+ ret
+
+cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615]
+ vpbroadcastd m4, [base+pw_64]
+ vpbroadcastd m5, [base+bidir_rnd+r6*4]
+ vpbroadcastd m6, [base+bidir_mul+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+ je .w4_end
+ call .main
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti128 xm0, m0, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti128 xm1, m1, 1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti128 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ call .main
+ mova [dstq+32*2], m0
+ mova [dstq+32*3], m1
+ call .main
+ mova [dstq+32*4], m0
+ mova [dstq+32*5], m1
+ call .main
+ mova [dstq+32*6], m0
+ mova [dstq+32*7], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2, 4, 5, 6
+ W_MASK 1, 3, 4, 5, 6
+ packuswb m2, m3
+ vpermq m2, m2, q3120
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ mova [maskq], m2
+ add maskq, 32
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
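+; Scalar sketch of the identity above (illustration only):
+;   blend(a, b, m) = a + (((a - b) * (m * -512) + 16384) >> 15)
+; which equals (a * (64 - m) + b * m + 32) >> 6 exactly; pmulhrsw supplies
+; the (x * y + 16384) >> 15 step and pw_m512 below provides the -512 factor.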
+cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ vpbroadcastd m6, [base+pw_m512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ pmovzxbw m3, [maskq]
+ movq xm0, [dstq+dsq*0]
+ movhps xm0, [dstq+dsq*1]
+ vpbroadcastq m1, [dstq+dsq*2]
+ vpbroadcastq m2, [dstq+r6 ]
+ vpblendd m0, m1, 0x30
+ vpblendd m0, m2, 0xc0
+ psubw m1, m0, [tmpq]
+ add maskq, 16
+ add tmpq, 32
+ pmullw m3, m6
+ pmulhrsw m1, m3
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ mova xm1, [dstq+dsq*2]
+ vinserti128 m1, [dstq+r6 ], 1
+ psubw m2, m0, [tmpq+32*0]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ mova [dstq+dsq*2], xm1
+ vextracti128 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+.w32:
+ pmovzxbw m4, [maskq+16*0]
+ pmovzxbw m5, [maskq+16*1]
+ mova m0, [dstq+32*0]
+ psubw m2, m0, [tmpq+32*0]
+ mova m1, [dstq+32*1]
+ psubw m3, m1, [tmpq+32*1]
+ add maskq, 16*2
+ add tmpq, 32*2
+ pmullw m4, m6
+ pmullw m5, m6
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
+
+INIT_XMM avx2
+cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ vpbroadcastd m2, [base+obmc_masks_avx2+2*2]
+.w2_loop:
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ psubw m1, m0, m1
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ vpbroadcastq m2, [base+obmc_masks_avx2+4*2]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m2, [base+obmc_masks_avx2+8*2]
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks_avx2+16*2]
+.w16_loop:
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+ 8], xmm6
+ movaps [rsp+24], xmm7
+%endif
+ mova m6, [base+obmc_masks_avx2+32*2]
+ vbroadcasti128 m7, [base+obmc_masks_avx2+32*3]
+.w32_loop:
+ mova m0, [dstq+dsq*0+32*0]
+ psubw m3, m0, [tmpq +32*0]
+ mova xm2, [dstq+dsq*0+32*1]
+ mova xm5, [tmpq +32*1]
+ mova m1, [dstq+dsq*1+32*0]
+ psubw m4, m1, [tmpq +32*2]
+ vinserti128 m2, [dstq+dsq*1+32*1], 1
+ vinserti128 m5, [tmpq +32*3], 1
+ add tmpq, 32*4
+ psubw m5, m2, m5
+ pmulhrsw m3, m6
+ pmulhrsw m4, m6
+ pmulhrsw m5, m7
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*1+32*0], m1
+ mova [dstq+dsq*0+32*1], xm2
+ vextracti128 [dstq+dsq*1+32*1], m2, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
+ mova m0, [dstq+32*(%1+0)]
+ psubw m2, m0, [tmpq+32*(%2+0)]
+ mova m1, [dstq+32*(%1+1)]
+ psubw m3, m1, [tmpq+32*(%2+1)]
+%if %3
+ add tmpq, 32*%3
+%endif
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+32*(%1+0)], m0
+ mova [dstq+32*(%1+1)], m1
+%endmacro
+
+INIT_XMM avx2
+cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ lea maskq, [base+obmc_masks_avx2+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ pinsrd m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2]
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpcklwd m2, m2
+ psubw m1, m0, m1
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ pextrd [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [blend_shuf]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 8*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+INIT_YMM avx2
+.w8:
+ vbroadcasti128 m3, [blend_shuf]
+ shufpd m3, m3, 0x0c
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti128 m0, [dstq+dsq*1], 1
+ vpbroadcastd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 16*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+.w16:
+ vpbroadcastw m4, [maskq+hq*2]
+ vpbroadcastw m5, [maskq+hq*2+2]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 32*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 32*1]
+ add tmpq, 32*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16
+ RET
+.w32:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m4, [maskq+hq*2]
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 8
+ BLEND_H_ROW 4, -4
+ BLEND_H_ROW 6, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
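+ ; In other words: build a bw x bh block whose top-left corner lies at
+ ; (x, y) relative to an iw x ih source, replicating the nearest edge
+ ; pixels wherever the requested block falls outside the source.
+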
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq
+ test yq, yq
+ cmovs r10, r12
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ lea srcq, [srcq+r10*2]
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1]
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1]
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastw m0, [srcq]
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, 16
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq*2]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3*2]
+%if %1
+ movu [r12+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, 16
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ lea r12, [r12+centerwq*2]
+%else
+ lea r12, [dstq+centerwq*2]
+%endif
+ xor r3, r3
+ vpbroadcastw m0, [srcq+centerwq*2-2]
+.right_loop_%3:
+ movu [r12+r3*2], m0
+ add r3, 16
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+.bottom_x_loop:
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m
+ mov dstq, dstm
+ xor r1, r1
+.top_x_loop:
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 16
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_64]
+ vpbroadcastw xm7, pxmaxm
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14
+ paddd m8, m2 ; mx+[0..7]*dx
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ vpbroadcastd m10, [base+pd_63]
+ pxor m2, m2
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m10 ; filter offset (masked)
+ ; load source pixels
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movu xm10, [srcq+r8*2]
+ movu xm11, [srcq+r9*2]
+ movu xm12, [srcq+r10*2]
+ movu xm13, [srcq+r11*2]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m10, [srcq+r8*2], 1
+ vinserti128 m11, [srcq+r9*2], 1
+ vinserti128 m12, [srcq+r10*2], 1
+ vinserti128 m13, [srcq+r11*2], 1
+ ptest m1, m1
+ jz .filter
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vextracti128 xm1, m1, 1
+ movu xm14, [base+resize_shuf+8+r8*2]
+ movu xm15, [base+resize_shuf+8+r9*2]
+ movu xm0, [base+resize_shuf+8+r10*2]
+ movu xm2, [base+resize_shuf+8+r11*2]
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vinserti128 m14, [base+resize_shuf+8+r8*2], 1
+ vinserti128 m15, [base+resize_shuf+8+r9*2], 1
+ vinserti128 m0, [base+resize_shuf+8+r10*2], 1
+ vinserti128 m2, [base+resize_shuf+8+r11*2], 1
+ pshufb m10, m14
+ pshufb m11, m15
+ pshufb m12, m0
+ pshufb m13, m2
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm14, [base+resize_filter+r8*8]
+ movq xm15, [base+resize_filter+r9*8]
+ movq xm0, [base+resize_filter+r10*8]
+ movq xm2, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ movhps xm14, [base+resize_filter+r8*8]
+ movhps xm15, [base+resize_filter+r9*8]
+ movhps xm0, [base+resize_filter+r10*8]
+ movhps xm2, [base+resize_filter+r11*8]
+ pmovsxbw m14, xm14
+ pmovsxbw m15, xm15
+ pmovsxbw m0, xm0
+ pmovsxbw m2, xm2
+ pmaddwd m10, m14
+ pmaddwd m11, m15
+ pmaddwd m12, m0
+ pmaddwd m13, m2
+ phaddd m10, m11
+ phaddd m12, m13
+ phaddd m10, m12
+ psubd m10, m3, m10
+ psrad m10, 7
+ vextracti128 xm0, m10, 1
+ packusdw xm10, xm0
+ pminsw xm10, xm7
+ mova [dstq+xq*2], xm10
+ paddd m4, m5
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm
new file mode 100644
index 0000000000..585ba53e08
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_avx512.asm
@@ -0,0 +1,4858 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+ db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
+spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
+ db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
+ db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
+ db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+ db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
+spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
+ db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
+ db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
+ db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
+spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
+ db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
+ db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
+ db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
+spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
+ db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
+prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
+ db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
+ db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
+ db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
+prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
+ db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
+ db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
+ db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
+spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
+ db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
+spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46
+ db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
+spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
+ db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
+ db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
+ db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
+spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
+ db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
+ db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
+ db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
+spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
+ db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
+ db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
+ db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
+spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
+ db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
+ db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
+ db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
+spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
+ db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
+ db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
+spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
+spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
+ db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
+w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+ db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
+ db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
+w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+ db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
+ db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
+w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+ db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
+ db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
+w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+ db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
+ db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
+warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
+ db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
+ db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
+ db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
+warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
+ db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
+ db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
+ db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
+warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
+pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
+ dd 1
+pw_2048: times 2 dw 2048
+ dd 3
+pw_8192: times 2 dw 8192
+avg_shift: dw 5, 5, 3, 3
+pw_27615: times 2 dw 27615
+pw_32766: times 2 dw 32766
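+; deint_q_shuf relies on vpermq only reading the low 3 bits of each control
+; qword: the 64 bytes from deint_q_shuf through pw_32766 therefore double as
+; the 0, 2, 4, 6, 1, 3, 5, 7 pattern shown in the commented-out dq above.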
+warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
+warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
+warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
+resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
+resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
+resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
+resize_permE: dq 0, 2, 4, 6
+resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
+resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+
+prep_hv_shift: dq 6, 4
+put_bilin_h_rnd: dw 8, 8, 10, 10
+prep_mul: dw 16, 16, 4, 4
+put_8tap_h_rnd: dd 34, 40
+prep_8tap_rnd: dd 128 - (8192 << 8)
+warp_8x8_rnd_h: dd 512, 2048
+warp_8x8_rnd_v: dd 262144, 65536
+warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
+avg_round: dw -16400, -16400, -16388, -16388
+w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
+mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
+w_mask_round: dd 128, 64
+bidir_shift: dw 6, 6, 4, 4
+
+pb_64: times 4 db 64
+pw_m512: times 2 dw -512
+pw_2: times 2 dw 2
+pw_64: times 2 dw 64
+pd_32: dd 32
+pd_63: dd 63
+pd_128: dd 128
+pd_640: dd 640
+pd_2176: dd 2176
+pd_16384: dd 16384
+pd_0_4: dd 0, 4
+
+%define pw_16 prep_mul
+%define pd_512 warp_8x8_rnd_h
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
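+; The macros above build per-width dispatch tables of offsets relative to a
+; base label; callers tzcnt the block width, load the word/dword entry, add
+; the base address and jump. HV_JMP_TABLE's fourth argument is a bitmask
+; selecting which of the h/v/hv tables to emit.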
+
+%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)
+
+BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern obmc_masks_avx2
+cextern resize_filter
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 4
+%else
+DECLARE_REG_TMP 8
+%endif
+
+INIT_ZMM avx512icl
+cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
+ mov mxyd, r6m ; mx
+ lea r7, [put_avx512icl]
+ tzcnt t0d, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx t0d, word [r7+t0*2+table_offset(put,)]
+ add t0, r7
+ jmp t0
+.put_w2:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], xmm0
+ mova [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], ym0
+ mova [dstq+dsq*1], ym1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+64*0], m0
+ mova [dstq+dsq*0+64*1], m1
+ mova [dstq+dsq*1+64*0], m2
+ mova [dstq+dsq*1+64*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+ movu m3, [srcq+64*3]
+ add srcq, ssq
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ vpbroadcastw m5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastd m4, [pw_16]
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
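+ ; .h computes ((16-mx)*px[x] + mx*px[x+1] + rnd) >> 4 explicitly, while .v
+ ; folds the weighting into px[x] + pmulhrsw(px_below - px[x], my << 11),
+ ; i.e. the same linear interpolation in pmulhrsw form.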
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
+ mov r6d, r8m ; bitdepth_max
+ add t0, r7
+ shr r6d, 11
+ vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
+ jmp t0
+.h_w2:
+ movq xmm1, [srcq+ssq*0]
+ movhps xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 4
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq xmm0, [srcq+ssq*0+0]
+ movhps xmm0, [srcq+ssq*1+0]
+ movq xmm1, [srcq+ssq*0+2]
+ movhps xmm1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw xmm0, xm4
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 4
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0+0]
+ vinserti32x4 ym0, [srcq+ssq*1+0], 1
+ movu xm1, [srcq+ssq*0+2]
+ vinserti32x4 ym1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw ym0, ym4
+ pmullw ym1, ym5
+ paddw ym0, ym6
+ paddw ym0, ym1
+ psrlw ym0, 4
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu ym0, [srcq+ssq*0+0]
+ vinserti32x8 m0, [srcq+ssq*1+0], 1
+ movu ym1, [srcq+ssq*0+2]
+ vinserti32x8 m1, [srcq+ssq*1+2], 1
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m6
+ paddw m0, m1
+ psrlw m0, 4
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ pmullw m1, m4, [srcq+ssq*1+0]
+ pmullw m3, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m0, m6
+ paddw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m2, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+ pmullw m3, m5, [srcq+64*1+2]
+ add srcq, ssq
+ paddw m0, m6
+ paddw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ pmullw m0, m4, [srcq+64*0+0]
+ pmullw m7, m5, [srcq+64*0+2]
+ pmullw m1, m4, [srcq+64*1+0]
+ pmullw m8, m5, [srcq+64*1+2]
+ pmullw m2, m4, [srcq+64*2+0]
+ pmullw m9, m5, [srcq+64*2+2]
+ pmullw m3, m4, [srcq+64*3+0]
+ pmullw m10, m5, [srcq+64*3+2]
+ add srcq, ssq
+ REPX {paddw x, m6}, m0, m1, m2, m3
+ paddw m0, m7
+ paddw m1, m8
+ paddw m2, m9
+ paddw m3, m10
+ REPX {psrlw x, 4}, m0, m1, m2, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
+ shl mxyd, 11
+ vpbroadcastw m8, mxyd
+ add t0, r7
+ jmp t0
+.v_w2:
+ movd xmm0, [srcq+ssq*0]
+.v_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq xmm2, xmm0, xmm1
+ movd xmm0, [srcq+ssq*0]
+ punpckldq xmm1, xmm0
+ psubw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ paddw xmm1, xmm2
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xmm0, [srcq+ssq*0]
+.v_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq xmm2, xmm0, xmm1
+ movq xmm0, [srcq+ssq*0]
+ punpcklqdq xmm1, xmm0
+ psubw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ paddw xmm1, xmm2
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu xmm0, [srcq+ssq*0]
+.v_w8_loop:
+ vbroadcasti128 ymm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd ymm2, ymm0, ymm1, 0xf0
+ vbroadcasti128 ymm0, [srcq+ssq*0]
+ vpblendd ymm1, ymm0, 0xf0
+ psubw ymm1, ymm2
+ pmulhrsw ymm1, ym8
+ paddw ymm1, ymm2
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ vzeroupper
+ RET
+.v_w16:
+ movu ym0, [srcq+ssq*0]
+.v_w16_loop:
+ movu ym3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw ym1, ym3, ym0
+ pmulhrsw ym1, ym8
+ paddw ym1, ym0
+ movu ym0, [srcq+ssq*0]
+ psubw ym2, ym0, ym3
+ pmulhrsw ym2, ym8
+ paddw ym2, ym3
+ mova [dstq+dsq*0], ym1
+ mova [dstq+dsq*1], ym2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movu m0, [srcq+ssq*0]
+.v_w32_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m8
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m8
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+.v_w64_loop:
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m4, m2, m0
+ pmulhrsw m4, m8
+ paddw m4, m0
+ movu m0, [srcq+ssq*0+64*0]
+ psubw m5, m3, m1
+ pmulhrsw m5, m8
+ paddw m5, m1
+ movu m1, [srcq+ssq*0+64*1]
+ psubw m6, m0, m2
+ pmulhrsw m6, m8
+ psubw m7, m1, m3
+ pmulhrsw m7, m8
+ mova [dstq+dsq*0+64*0], m4
+ mova [dstq+dsq*0+64*1], m5
+ paddw m6, m2
+ paddw m7, m3
+ mova [dstq+dsq*1+64*0], m6
+ mova [dstq+dsq*1+64*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*0+64*2]
+ movu m3, [srcq+ssq*0+64*3]
+.v_w128_loop:
+ movu m4, [srcq+ssq*1+64*0]
+ movu m5, [srcq+ssq*1+64*1]
+ movu m6, [srcq+ssq*1+64*2]
+ movu m7, [srcq+ssq*1+64*3]
+ lea srcq, [srcq+ssq*2]
+ psubw m9, m4, m0
+ pmulhrsw m9, m8
+ paddw m9, m0
+ movu m0, [srcq+ssq*0+64*0]
+ psubw m10, m5, m1
+ pmulhrsw m10, m8
+ paddw m10, m1
+ movu m1, [srcq+ssq*0+64*1]
+ psubw m11, m6, m2
+ pmulhrsw m11, m8
+ paddw m11, m2
+ movu m2, [srcq+ssq*0+64*2]
+ psubw m12, m7, m3
+ pmulhrsw m12, m8
+ paddw m12, m3
+ movu m3, [srcq+ssq*0+64*3]
+ mova [dstq+dsq*0+64*0], m9
+ psubw m9, m0, m4
+ pmulhrsw m9, m8
+ mova [dstq+dsq*0+64*1], m10
+ psubw m10, m1, m5
+ pmulhrsw m10, m8
+ mova [dstq+dsq*0+64*2], m11
+ psubw m11, m2, m6
+ pmulhrsw m11, m8
+ mova [dstq+dsq*0+64*3], m12
+ psubw m12, m3, m7
+ pmulhrsw m12, m8
+ paddw m9, m4
+ paddw m10, m5
+ mova [dstq+dsq*1+64*0], m9
+ mova [dstq+dsq*1+64*1], m10
+ paddw m11, m6
+ paddw m12, m7
+ mova [dstq+dsq*1+64*2], m11
+ mova [dstq+dsq*1+64*3], m12
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
+ movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastd m6, [pw_2]
+ vpbroadcastw m7, mxyd
+ vpbroadcastd m8, [pw_8192]
+ add t0, r7
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ vpbroadcastd m8, [pw_2048]
+.hv_12bpc:
+ jmp t0
+.hv_w2:
+ vpbroadcastq xmm1, [srcq+ssq*0]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w2_loop:
+ movq xmm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm2, [srcq+ssq*0]
+ pmullw xmm1, xmm2, xm4
+ psrlq xmm2, 16
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 _ 2 _
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xmm0, xm4, [srcq+ssq*0-8]
+ pmullw xmm1, xm5, [srcq+ssq*0-6]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1+0]
+ movq xmm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0+0]
+ movhps xmm2, [srcq+ssq*0+2]
+ pmullw xmm1, xm4
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 2
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xmm0, xm4, [srcq+ssq*0+0]
+ pmullw xmm1, xm5, [srcq+ssq*0+2]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+ vinserti32x4 ym0, xmm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1+0]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym1, [srcq+ssq*0+0], 1
+ vinserti32x4 ym2, [srcq+ssq*0+2], 1
+ pmullw ym1, ym4
+ pmullw ym2, ym5
+ paddw ym1, ym6
+ paddw ym1, ym2
+ psrlw ym1, 2 ; 1 2
+ vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym7
+ paddw ym1, ym2
+ pmulhrsw ym1, ym8
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+ssq*0+0]
+ pmullw ym1, ym5, [srcq+ssq*0+2]
+ paddw ym0, ym6
+ paddw ym0, ym1
+ psrlw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1+0]
+ movu ym2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0+0], 1
+ vinserti32x8 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m6
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m2
+ pmulhrsw m1, m8
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+.hv_w64:
+.hv_w128:
+ movifnidn wd, wm
+ lea r6d, [hq+wq*8-256]
+ mov r4, srcq
+ mov r7, dstq
+.hv_w32_loop0:
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+ssq*1+0]
+ pmullw m1, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m3, m6
+ paddw m3, m1
+ psrlw m3, 2
+ psubw m1, m3, m0
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m3
+ paddw m2, m2
+ pmulhw m2, m7
+ paddw m2, m3
+ pmulhrsw m1, m8
+ pmulhrsw m2, m8
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq xmm0, [srcq+strideq*0]
+ movhps xmm0, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm0, ymm1, 0x30
+ vpblendd ymm0, ymm2, 0xc0
+ pmullw ymm0, ym4
+ psubw ymm0, ym5
+ mova [tmpq], ymm0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ vzeroupper
+ RET
+.prep_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti32x4 ym0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmullw m0, m4, [srcq+strideq*0+64*0]
+ pmullw m1, m4, [srcq+strideq*0+64*1]
+ pmullw m2, m4, [srcq+strideq*1+64*0]
+ pmullw m3, m4, [srcq+strideq*1+64*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmullw m0, m4, [srcq+64*0]
+ pmullw m1, m4, [srcq+64*1]
+ pmullw m2, m4, [srcq+64*2]
+ pmullw m3, m4, [srcq+64*3]
+ add srcq, strideq
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastd m6, [pw_32766]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ movu xm1, [srcq+strideq*0]
+ vinserti32x4 ym1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym0, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0+0]
+ movu xm1, [srcq+strideq*0+2]
+ vinserti32x4 ym0, [srcq+strideq*1+0], 1
+ vinserti32x4 ym1, [srcq+strideq*1+2], 1
+ vinserti32x4 m0, [srcq+strideq*2+0], 2
+ vinserti32x4 m1, [srcq+strideq*2+2], 2
+ vinserti32x4 m0, [srcq+stride3q +0], 3
+ vinserti32x4 m1, [srcq+stride3q +2], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8
+ RET
+.h_w16:
+ movu ym0, [srcq+strideq*0+0]
+ vinserti32x8 m0, [srcq+strideq*1+0], 1
+ movu ym1, [srcq+strideq*0+2]
+ vinserti32x8 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ pmullw m1, m4, [srcq+strideq*1+0]
+ pmullw m3, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ add srcq, strideq
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m7, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m8, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m9, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m10, m5, [srcq+194]
+ add srcq, strideq
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m7
+ paddw m1, m8
+ paddw m2, m9
+ paddw m3, m10
+ REPX {psraw x, 2}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ vpbroadcastw m9, mxyd
+ vpbroadcastd m8, [pw_16]
+ vpbroadcastd m10, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m8, m9
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m8, 2
+ psllw m9, 2
+.v_12bpc:
+ jmp wq
+.v_w4:
+ movq xmm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq xmm2, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm2, ymm1, 0x30
+ vpblendd ymm2, ymm3, 0xc0
+ vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
+ movq xmm0, [srcq+strideq*0]
+ valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
+ pmullw ymm1, ym8
+ pmullw ymm2, ym9
+ psubw ymm1, ym10
+ paddw ymm1, ymm2
+ psraw ymm1, 2
+ mova [tmpq], ymm1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vinserti32x4 m1, [srcq+strideq*2], 2
+ vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ valignq m2, m0, m1, 2 ; 1 2 3 4
+ pmullw m1, m8
+ pmullw m2, m9
+ psubw m1, m10
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu ym0, [srcq+strideq*0]
+.v_w16_loop:
+ vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym3, [srcq+strideq*2]
+ vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ movu ym0, [srcq+strideq*0]
+ vshufi32x4 m3, m1, m3, q1032 ; 1 2
+ vshufi32x4 m4, m2, m0, q1032 ; 3 4
+ pmullw m1, m8
+ pmullw m2, m8
+ pmullw m3, m9
+ pmullw m4, m9
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m8, m0
+ movu m0, [srcq+strideq*0]
+ pmullw m2, m8, m3
+ pmullw m3, m9
+ pmullw m4, m9, m0
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w64_loop:
+ add srcq, strideq
+ pmullw m2, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m3, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m4, m9, m0
+ pmullw m5, m9, m1
+ psubw m2, m10
+ psubw m3, m10
+ paddw m2, m4
+ paddw m3, m5
+ psraw m2, 2
+ psraw m3, 2
+ mova [tmpq+64*0], m2
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+ movu m3, [srcq+64*3]
+.v_w128_loop:
+ add srcq, strideq
+ pmullw m4, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m5, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m6, m8, m2
+ movu m2, [srcq+64*2]
+ pmullw m7, m8, m3
+ movu m3, [srcq+64*3]
+ pmullw m11, m9, m0
+ pmullw m12, m9, m1
+ pmullw m13, m9, m2
+ pmullw m14, m9, m3
+ REPX {psubw x, m10}, m4, m5, m6, m7
+ paddw m4, m11
+ paddw m5, m12
+ paddw m6, m13
+ paddw m7, m14
+ REPX {psraw x, 2}, m4, m5, m6, m7
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m5
+ mova [tmpq+64*2], m6
+ mova [tmpq+64*3], m7
+ add tmpq, 64*4
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m7, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ movq xmm0, [srcq+strideq*0+0]
+ movq xmm1, [srcq+strideq*0+2]
+ pmullw xmm0, xm4
+ pmullw xmm1, xm5
+ psubw xmm0, xm6
+ paddw xmm0, xmm1
+ psraw xmm0, 2
+ vpbroadcastq ym0, xmm0
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 ym1, [srcq+stride3q ], 1
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym2, [srcq+strideq*0], 1
+ punpcklqdq ym3, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym3, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym3, ym6
+ paddw ym1, ym3
+ psraw ym1, 2 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym7
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0+0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm6
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti32x4 m0, xm0, 3
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1+0]
+ movu xm2, [srcq+strideq*1+2]
+ vinserti32x4 ym1, [srcq+strideq*2+0], 1
+ vinserti32x4 ym2, [srcq+strideq*2+2], 1
+ vinserti32x4 m1, [srcq+stride3q +0], 2
+ vinserti32x4 m2, [srcq+stride3q +2], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 m1, [srcq+strideq*0+0], 3
+ vinserti32x4 m2, [srcq+strideq*0+2], 3
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+strideq*0+0]
+ pmullw ym1, ym5, [srcq+strideq*0+2]
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1+0]
+ movu ym2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+strideq*0+0], 1
+ vinserti32x8 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+strideq*1+0]
+ pmullw m1, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m3, m6
+ paddw m3, m1
+ psraw m3, 2
+ psubw m1, m3, m0
+ pmulhrsw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m3
+ pmulhrsw m2, m7
+ paddw m2, m3
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+.hv_w64_loop:
+ add srcq, strideq
+ pmullw m2, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m3, m4, [srcq+64]
+ pmullw m9, m5, [srcq+66]
+ psubw m2, m6
+ psubw m3, m6
+ paddw m2, m8
+ paddw m3, m9
+ psraw m2, 2
+ psraw m3, 2
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ pmulhrsw m8, m7
+ pmulhrsw m9, m7
+ paddw m8, m0
+ mova m0, m2
+ paddw m9, m1
+ mova m1, m3
+ mova [tmpq+64*0], m8
+ mova [tmpq+64*1], m9
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m9, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m10, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m11, m5, [srcq+194]
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ REPX {psraw x, 2}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ pmullw m8, m4, [srcq+ 0]
+ pmullw m12, m5, [srcq+ 2]
+ pmullw m9, m4, [srcq+ 64]
+ pmullw m13, m5, [srcq+ 66]
+ pmullw m10, m4, [srcq+128]
+ pmullw m14, m5, [srcq+130]
+ pmullw m11, m4, [srcq+192]
+ pmullw m15, m5, [srcq+194]
+ REPX {psubw x, m6}, m8, m9, m10, m11
+ paddw m8, m12
+ paddw m9, m13
+ paddw m10, m14
+ paddw m11, m15
+ REPX {psraw x, 2}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ REPX {pmulhrsw x, m7}, m12, m13, m14, m15
+ paddw m12, m0
+ mova m0, m8
+ paddw m13, m1
+ mova m1, m9
+ mova [tmpq+64*0], m12
+ mova [tmpq+64*1], m13
+ paddw m14, m2
+ mova m2, m10
+ paddw m15, m3
+ mova m3, m11
+ mova [tmpq+64*2], m14
+ mova [tmpq+64*3], m15
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
+%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
+cglobal %1_8tap_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%define buf rsp+stack_offset+8 ; shadow space
+%else
+DECLARE_REG_TMP 7, 8
+%define buf rsp-40 ; red zone
+%endif
+
+MC_8TAP_FN put, sharp, SHARP, SHARP
+MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN put, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN put, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN put, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN put, regular, REGULAR, REGULAR
+
+cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova ym2, [spel_h_shuf2a]
+ pmovsxbw xmm4, [base+subpel_filters+mxq*8]
+ pshufd xmm3, xmm4, q1111
+ pshufd xmm4, xmm4, q2222
+.h_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti32x4 ym1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova xmm0, xm8
+ vpermb ym1, ym2, ym1
+ vpdpwssd xmm0, xmm3, xm1
+ vextracti32x4 xm1, ym1, 1
+ vpdpwssd xmm0, xmm4, xm1
+ psrad xmm0, 6
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm9
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ vbroadcasti32x4 ym4, [spel_h_shufA]
+ vbroadcasti32x4 ym5, [spel_h_shufB]
+ pshufd xmm0, xmm0, q2211
+ vpbroadcastq ym6, xmm0
+ vpermq ym7, ymm0, q1111
+.h_w4_loop:
+ movu xm2, [srcq+ssq*0]
+ vinserti32x4 ym2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova ym0, ym8
+ pshufb ym1, ym2, ym4
+ vpdpwssd ym0, ym6, ym1
+ pshufb ym2, ym5
+ vpdpwssd ym0, ym7, ym2
+ psrad ym0, 6
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1
+ pminsw xmm0, xm0, xm9
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m9, r8m
+ shr r7d, 11
+ vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ shr mxd, 16
+ sub srcq, 6
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mova [buf], xmm0
+ vpbroadcastd m10, xmm0
+ vpbroadcastd m11, [buf+ 4]
+ vpbroadcastd m12, [buf+ 8]
+ vpbroadcastd m13, [buf+12]
+ sub wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m4, [spel_h_shufA]
+ movu m5, [spel_h_shufB]
+ movu m6, [spel_h_shufC]
+ mova m7, [spel_h_shufD]
+.h_w8_loop:
+ movu ym2, [srcq+ssq*0]
+ vinserti32x8 m2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ vpermb m1, m4, m2
+ vpdpwssd m0, m10, m1
+ vpermb m1, m5, m2
+ vpdpwssd m0, m11, m1
+ vpermb m1, m6, m2
+ vpdpwssd m0, m12, m1
+ vpermb m1, m7, m2
+ vpdpwssd m0, m13, m1
+ psrad m0, 6
+ vextracti32x8 ym1, m0, 1
+ packusdw ym0, ym1
+ pminsw ym0, ym9
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+.h_w16_loop:
+ movu ym2, [srcq+ssq*0+ 0]
+ vinserti32x8 m2, [srcq+ssq*1+ 0], 1
+ movu ym3, [srcq+ssq*0+16]
+ vinserti32x8 m3, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m11, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m13, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a2
+ vpdpwssd m1, m10, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a3
+ vpdpwssd m1, m11, m2 ; b1
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea dstq, [dstq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m10, m4 ; b0
+ vpdpwssd m0, m12, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m11, m3 ; b1
+ vpdpwssd m0, m13, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m12, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m11, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m13, m4 ; b3
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+r6*2], m0
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m10, [pd_32]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r7d, wd
+ vpbroadcastw m11, r8m
+ lea r6, [ssq*3]
+ movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
+ sub srcq, r6
+ mova [rsp+stack_offset+8], xmm0
+ vpbroadcastd m12, xmm0
+ add r7, r8
+ vpbroadcastd m13, [rsp+stack_offset+12]
+ vpbroadcastd m14, [rsp+stack_offset+16]
+ vpbroadcastd m15, [rsp+stack_offset+20]
+ jmp r7
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, r6
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklwd xmm3, xmm1 ; 45 56
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xmm5, xm10
+ vpdpwssd xmm5, xm12, xmm1 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xm13, xmm2 ; a1 b1
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xm14, xmm3 ; a2 b2
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 7 8
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xm15, xmm3 ; a3 b3
+ psrad xmm5, 6
+ packusdw xmm5, xmm5
+ pminsw xmm5, xm11
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm5, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ psrad ymm4, 6
+ vextracti128 xmm5, ymm4, 1
+ packusdw xmm4, xmm5
+ pminsw xmm4, xm11
+ movq [dstq+dsq*0], xmm4
+ movhps [dstq+dsq*1], xmm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ vinserti32x4 m1, m2, [srcq+ssq*0], 0
+ vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+ssq*0], 1
+ vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+ssq*1], 3
+ lea srcq, [srcq+ssq*2]
+ movu xm5, [srcq+ssq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ psrad m4, 6
+ vextracti32x8 ym5, m4, 1
+ packusdw ym4, ym5
+ pminsw ym4, ym11
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+ssq*1]
+ vinserti32x8 m0, m1, [srcq+ssq*0], 0
+ vinserti32x8 m1, [srcq+ssq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+ssq*0]
+ vinserti32x8 m3, [srcq+ssq*1], 1
+ movu ym5, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ mova m9, [deint_q_shuf]
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ psrad m6, 6
+ psrad m7, 6
+ packusdw m6, m7
+ pminsw m6, m11
+ vpermq m6, m9, m6
+ mova [dstq+dsq*0], ym6
+ vextracti32x8 [dstq+dsq*1], m6, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.v_w32_loop0:
+ movu m16, [srcq+ssq*0]
+ movu m17, [srcq+ssq*1]
+ movu m18, [srcq+ssq*2]
+ add srcq, r6
+ movu m19, [srcq+ssq*0]
+ movu m20, [srcq+ssq*1]
+ movu m21, [srcq+ssq*2]
+ add srcq, r6
+ movu m22, [srcq+ssq*0]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+ssq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ REPX {psrad x, 6}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+%endif
+ vzeroupper
+ RET
+.hv:
+ vpbroadcastw m11, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_main
+.hv_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m8, [buf+ 4]
+ vpbroadcastd m9, [buf+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [buf+20]
+ vpbroadcastd ym14, [buf+24]
+ vpbroadcastd ym15, [buf+28]
+ movu xm4, [srcq+ssq*0]
+ vinserti32x4 ym4, [srcq+ssq*1], 1
+ vinserti32x4 m4, [srcq+ssq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti32x4 m2, [spel_h_shufA]
+ mova m3, [spel_h_shuf2b]
+ mova ym6, [spel_h_shuf2a]
+ mova xm7, [spel_shuf2]
+ mova m1, m10
+ pshufb m4, m2
+ pshufb m0, m2
+ punpcklqdq m2, m4, m0
+ vpdpwssd m1, m8, m2 ; 04 15 26 3_
+ punpckhqdq m4, m0
+ vpdpwssd m1, m9, m4
+ vpermb m1, m3, m1 ; 01 12
+ vextracti32x4 xm2, ym1, 1 ; 23 34
+ vextracti32x4 xm3, m1, 2 ; 45 56
+.hv_w2_loop:
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ mova xm4, xm10
+ vpermb ym5, ym6, ym5
+ pmaddwd xmm0, xm12, xm1 ; a0 b0
+ vpdpwssd xm4, xm8, xm5
+ vextracti32x4 xm5, ym5, 1
+ mova xm1, xm2
+ vpdpwssd xmm0, xm13, xm2 ; a1 b1
+ vpdpwssd xm4, xm9, xm5 ; 7 8
+ mova xm2, xm3
+ vpdpwssd xmm0, xm14, xm3 ; a2 b2
+ vpermt2b xm3, xm7, xm4 ; 67 78
+ vpdpwssd xmm0, xm15, xm3 ; a3 b3
+ psrad xmm0, 10
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm11
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym18, [srcq+ssq*0], 1
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ pmaddwd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ psrad ym16, 10
+ vextracti128 xm17, ym16, 1
+ packusdw xm16, xm17
+ pminsw xm16, xm11
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_w8_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_w8_main
+.hv_w8_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_w8_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [buf+ 4]
+ vpbroadcastd m14, [buf+ 8]
+ vpbroadcastd m15, [buf+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [buf+20]
+ vpbroadcastd m18, [buf+24]
+ vpbroadcastd m19, [buf+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1
+ movu ym9, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3
+ movu ym20, [srcq+ssq*1]
+ vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+ssq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ mova m4, m10
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ pmaddwd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ psrad m20, 10
+ vextracti32x8 ym21, m20, 1
+ packusdw ym20, ym21
+ pminsw ym20, ym11
+ mova [dstq+dsq*0], xm20
+ vextracti128 [dstq+dsq*1], ym20, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ WIN64_SPILL_XMM 26
+ vbroadcasti32x8 m5, [srcq+ssq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0
+ vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0
+ movu ym6, [srcq+ssq*1+ 0]
+ movu ym7, [srcq+ssq*1+16]
+ vinserti32x8 m6, [srcq+ssq*2+ 0], 1
+ vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+ssq*0+ 0]
+ movu ym23, [srcq+ssq*0+16]
+ vinserti32x8 m22, [srcq+ssq*1+ 0], 1
+ vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4
+ movu ym24, [srcq+ssq*2+ 0]
+ movu ym25, [srcq+ssq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+ssq*1+ 0]
+ movu ym25, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ pmaddwd m22, m16, m1 ; A0
+ mova m1, m3
+ pmaddwd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ psrad m22, 10
+ psrad m23, 10
+ vshufi32x4 m0, m22, m23, q3232
+ vinserti32x8 m22, ym23, 1
+ packusdw m22, m0
+ pminsw m22, m11
+ mova [dstq+dsq*0], ym22
+ vextracti32x8 [dstq+dsq*1], m22, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 32
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m22, [spel_shuf32]
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.hv_w32_loop0:
+ movu m6, [srcq+ssq*0+ 0]
+ movu m7, [srcq+ssq*0+ 8]
+ movu m8, [srcq+ssq*0+16]
+ mova m0, m10
+ mova m23, m10
+ pshufb m9, m6, m20
+ vpdpwssd m0, m12, m9 ; a0l
+ pshufb m9, m7, m20
+ vpdpwssd m23, m12, m9 ; a0h
+ vpdpwssd m0, m14, m9 ; a2l
+ pshufb m7, m21
+ vpdpwssd m23, m13, m7 ; a1h
+ vpdpwssd m0, m15, m7 ; a3l
+ pshufb m7, m8, m20
+ vpdpwssd m23, m14, m7 ; a2h
+ pshufb m6, m21
+ vpdpwssd m0, m13, m6 ; a1l
+ pshufb m8, m21
+ vpdpwssd m23, m15, m8 ; a3h
+%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2]
+ movu m6, [srcq+%3*%4+ 0]
+ movu m7, [srcq+%3*%4+ 8]
+ movu m8, [srcq+%3*%4+16]
+%if %4 == 2
+ add srcq, r6
+%endif
+ movu m29, [srcq+%3*%5+ 0]
+ movu m30, [srcq+%3*%5+ 8]
+ movu m31, [srcq+%3*%5+16]
+%if %5 == 2
+ add srcq, r6
+%endif
+ mova m%1, m10
+ mova m9, m10
+ pshufb m%2, m6, m20
+ vpdpwssd m%1, m12, m%2 ; x0l
+ pshufb m%2, m29, m20
+ vpdpwssd m9, m12, m%2 ; y0l
+ pshufb m6, m21
+ vpdpwssd m%1, m13, m6 ; x1l
+ pshufb m29, m21
+ vpdpwssd m9, m13, m29 ; y1l
+ pshufb m6, m7, m20
+ mova m%2, m10
+ vpdpwssd m%2, m12, m6 ; x0h
+ pshufb m29, m30, m20
+ vpdpwssd m%1, m14, m6 ; x2l
+ mova m6, m10
+ vpdpwssd m6, m12, m29 ; y0h
+ pshufb m7, m21
+ vpdpwssd m9, m14, m29 ; y2l
+ pshufb m30, m21
+ vpdpwssd m%2, m13, m7 ; x1h
+ vpdpwssd m%1, m15, m7 ; x3l
+ pshufb m7, m8, m20
+ vpdpwssd m6, m13, m30 ; y1h
+ vpdpwssd m9, m15, m30 ; y3l
+ pshufb m30, m31, m20
+ vpdpwssd m%2, m14, m7 ; x2h
+ pshufb m8, m21
+ vpdpwssd m6, m14, m30 ; y2h
+ pshufb m31, m21
+ vpdpwssd m%2, m15, m8 ; x3h
+ vpdpwssd m6, m15, m31 ; y3h
+%if %1 == 1
+ vpermt2b m0, m22, m%1 ; 01l
+ vpermt2b m23, m22, m%2 ; 01h
+%endif
+ vpermt2b m%1, m22, m9 ; xyl
+ vpermt2b m%2, m22, m6 ; xyh
+%endmacro
+ PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12
+ PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34
+ PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23l
+ vpshrdd m25, m24, m26, 16 ; 23h
+ vpshrdd m4, m3, m5, 16 ; 45l
+ vpshrdd m27, m26, m28, 16 ; 45h
+.hv_w32_loop:
+ movu m7, [srcq+ssq*1+ 0]
+ movu m9, [srcq+ssq*2+ 0]
+ movu m6, [srcq+ssq*1+ 8]
+ movu m8, [srcq+ssq*2+ 8]
+ mova m29, m10
+ mova m31, m10
+ pshufb m30, m7, m20
+ vpdpwssd m29, m12, m30 ; h0l
+ pshufb m30, m9, m20
+ vpdpwssd m31, m12, m30 ; i0l
+ pshufb m7, m21
+ vpdpwssd m29, m13, m7 ; h1l
+ pshufb m9, m21
+ vpdpwssd m31, m13, m9 ; i1l
+ pshufb m7, m6, m20
+ vpdpwssd m29, m14, m7 ; h2l
+ pshufb m9, m8, m20
+ vpdpwssd m31, m14, m9 ; i2l
+ pshufb m6, m21
+ vpdpwssd m29, m15, m6 ; h3l
+ pshufb m8, m21
+ vpdpwssd m31, m15, m8 ; i3l
+ mova m30, m10
+ vpdpwssd m30, m12, m7 ; h0h
+ movu m7, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vpermt2b m29, m22, m31 ; 78l
+ mova m31, m10
+ vpdpwssd m31, m12, m9 ; i0h
+ movu m9, [srcq+ssq*0+16]
+ vpdpwssd m30, m13, m6 ; h1h
+ pshufb m6, m7, m20
+ vpdpwssd m31, m13, m8 ; i1h
+ pshufb m8, m9, m20
+ vpdpwssd m30, m14, m6 ; h2h
+ pmaddwd m6, m16, m0 ; A0l
+ pshufb m7, m21
+ vpdpwssd m31, m14, m8 ; i2h
+ pmaddwd m8, m16, m23 ; A0h
+ pshufb m9, m21
+ vpdpwssd m30, m15, m7 ; h3h
+ pmaddwd m7, m16, m1 ; B0l
+ vpdpwssd m31, m15, m9 ; i3h
+ pmaddwd m9, m16, m24 ; B0h
+ mova m0, m2
+ vpdpwssd m6, m17, m2 ; A1l
+ mova m23, m25
+ vpdpwssd m8, m17, m25 ; A1h
+ mova m1, m3
+ vpdpwssd m7, m17, m3 ; B1l
+ mova m24, m26
+ vpdpwssd m9, m17, m26 ; B1h
+ vpermt2b m30, m22, m31 ; 78h
+ vpdpwssd m6, m18, m4 ; A2l
+ mova m2, m4
+ vpdpwssd m8, m18, m27 ; A2h
+ mova m25, m27
+ vpdpwssd m7, m18, m5 ; B2l
+ mova m3, m5
+ vpdpwssd m9, m18, m28 ; B2h
+ mova m26, m28
+ vpshrdd m4, m5, m29, 16 ; 67l
+ vpdpwssd m6, m19, m4 ; A3l
+ vpshrdd m27, m28, m30, 16 ; 67h
+ vpdpwssd m8, m19, m27 ; A3h
+ mova m5, m29
+ vpdpwssd m7, m19, m29 ; B3l
+ mova m28, m30
+ vpdpwssd m9, m19, m30 ; B3h
+ REPX {psrad x, 10}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+MC_8TAP_FN prep, sharp, SHARP, SHARP
+MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN prep, regular, REGULAR, REGULAR
+
+cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
+%define base r7-prep_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ mov wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [pw_8192]
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ shr r5d, 11
+ vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ vbroadcasti32x4 m4, [spel_h_shufA]
+ vbroadcasti32x4 m5, [spel_h_shufB]
+ shr r5d, 11
+ mova ym9, [prep_endA]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m6, [tmpq+4]
+ vpbroadcastd m7, [tmpq+8]
+.h_w4_loop:
+ movu xm2, [srcq+strideq*0]
+ vinserti32x4 ym2, [srcq+strideq*1], 1
+ vinserti32x4 m2, [srcq+strideq*2], 2
+ vinserti32x4 m2, [srcq+r6 ], 3
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ pshufb m1, m2, m4
+ vpdpwssd m0, m6, m1
+ pshufb m2, m5
+ vpdpwssd m0, m7, m2
+ vpermb m0, m9, m0
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m10, [prep_8tap_rnd]
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ sub srcq, 6
+ shr r5d, 11
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ cmp wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m6, [spel_h_shufA]
+ movu m7, [spel_h_shufB]
+ movu m8, [spel_h_shufC]
+ mova m9, [spel_h_shufD]
+ mova m11, [prep_endB]
+.h_w8_loop:
+ movu ym4, [srcq+strideq*0]
+ vinserti32x8 m4, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+r6 ], 1
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ mova m1, m10
+ vpermb m2, m6, m4
+ vpermb m3, m6, m5
+ vpdpwssd m0, m12, m2
+ vpdpwssd m1, m12, m3
+ vpermb m2, m7, m4
+ vpermb m3, m7, m5
+ vpdpwssd m0, m13, m2
+ vpdpwssd m1, m13, m3
+ vpermb m2, m8, m4
+ vpermb m3, m8, m5
+ vpdpwssd m0, m14, m2
+ vpdpwssd m1, m14, m3
+ vpermb m2, m9, m4
+ vpermb m3, m9, m5
+ vpdpwssd m0, m15, m2
+ vpdpwssd m1, m15, m3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ mova m11, [prep_endC]
+.h_w16_loop:
+ movu ym2, [srcq+strideq*0+ 0]
+ vinserti32x8 m2, [srcq+strideq*1+ 0], 1
+ movu ym3, [srcq+strideq*0+16]
+ vinserti32x8 m3, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m14, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m13, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m15, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m14, m4 ; a2
+ vpdpwssd m1, m12, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m15, m2 ; a3
+ vpdpwssd m1, m13, m2 ; b1
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+ mova m11, [prep_endC]
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b0
+ vpdpwssd m0, m14, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m13, m3 ; b1
+ vpdpwssd m0, m15, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m14, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m15, m4 ; b3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ vpbroadcastd m10, [prep_8tap_rnd]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r6d, wd
+ shr r5d, 11
+ movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ add r7, r6
+ lea r6, [strideq*3]
+ sub srcq, r6
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ jmp r7
+.v_w4:
+ movq xmm1, [srcq+strideq*0]
+ vpbroadcastq ymm0, [srcq+strideq*1]
+ vpbroadcastq ymm2, [srcq+strideq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+strideq*0]
+ vpbroadcastq ymm3, [srcq+strideq*1]
+ vpbroadcastq ymm5, [srcq+strideq*2]
+ mova xm11, [prep_endA]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ vpermb ymm4, ym11, ymm4
+ mova [tmpq], xmm4
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+strideq*2]
+ vinserti32x4 m1, m2, [srcq+strideq*0], 0
+ vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+strideq*0], 1
+ vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ mova ym11, [prep_endB]
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+strideq*1], 3
+ lea srcq, [srcq+strideq*2]
+ movu xm5, [srcq+strideq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ vpermb m4, m11, m4
+ mova [tmpq], ym4
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+strideq*1]
+ vinserti32x8 m0, m1, [srcq+strideq*0], 0
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+strideq*0]
+ vinserti32x8 m3, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ mova m11, [prep_endA]
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ vpermt2b m6, m11, m7
+ mova [tmpq], m6
+ add tmpq, 64
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ PUSH r8
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea r5, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, tmpq
+.v_w32_loop0:
+ movu m16, [srcq+strideq*0]
+ movu m17, [srcq+strideq*1]
+ movu m18, [srcq+strideq*2]
+ add srcq, r6
+ movu m19, [srcq+strideq*0]
+ movu m20, [srcq+strideq*1]
+ movu m21, [srcq+strideq*2]
+ add srcq, r6
+ movu m22, [srcq+strideq*0]
+ mova m11, [prep_endC]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+strideq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ vpermt2b m6, m11, m8
+ vpermt2b m7, m11, m9
+ mova [tmpq+wq*0], m6
+ mova [tmpq+wq*2], m7
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, r5b
+ mov srcq, r7
+ mov tmpq, r8
+ sub r5d, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+ POP r8
+%endif
+ vzeroupper
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2
+ shr r5d, 11
+ sub srcq, r6
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd ym11, [pd_128]
+ mova xm21, [prep_endA]
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m8, [tmpq+ 4]
+ vpbroadcastd m9, [tmpq+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [tmpq+20]
+ vpbroadcastd ym14, [tmpq+24]
+ vpbroadcastd ym15, [tmpq+28]
+ movu xm4, [srcq+strideq*0]
+ vinserti32x4 ym4, [srcq+strideq*1], 1
+ vinserti32x4 m4, [srcq+strideq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 ym18, [srcq+strideq*0], 1
+ mova ym16, ym11
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ vpdpwssd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ vpermb ym16, ym21, ym16
+ mova [tmpq], xm16
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 6
+ shr r5d, 11
+ sub srcq, r6
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd m11, [pd_128]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [tmpq+20]
+ vpbroadcastd m18, [tmpq+24]
+ vpbroadcastd m19, [tmpq+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ WIN64_SPILL_XMM 23
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym9, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3
+ movu ym20, [srcq+strideq*1]
+ vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+strideq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ mova ym22, [prep_endB]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*0], 1
+ mova m4, m10
+ mova m20, m11
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ vpdpwssd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ vpermb m20, m22, m20
+ mova [tmpq], ym20
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 27
+ vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
+ vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0
+ movu ym6, [srcq+strideq*1+ 0]
+ movu ym7, [srcq+strideq*1+16]
+ vinserti32x8 m6, [srcq+strideq*2+ 0], 1
+ vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+strideq*0+ 0]
+ movu ym23, [srcq+strideq*0+16]
+ vinserti32x8 m22, [srcq+strideq*1+ 0], 1
+ vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4
+ movu ym24, [srcq+strideq*2+ 0]
+ movu ym25, [srcq+strideq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ mova m26, [prep_endB]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+strideq*1+ 0]
+ movu ym25, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ mova m22, m11
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ mova m23, m11
+ vpdpwssd m22, m16, m1 ; A0
+ mova m1, m3
+ vpdpwssd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ vpermt2b m22, m26, m23
+ mova [tmpq], m22
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ PUSH r8
+ %assign regs_used regs_used + 1
+ WIN64_SPILL_XMM 32
+%endif
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m22, [spel_shuf32]
+ lea r5d, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, tmpq
+.hv_w32_loop0:
+ movu m6, [srcq+strideq*0+ 0]
+ movu m7, [srcq+strideq*0+ 8]
+ movu m8, [srcq+strideq*0+16]
+ mova m0, m10
+ mova m23, m10
+ pshufb m9, m6, m20
+ vpdpwssd m0, m12, m9 ; a0l
+ pshufb m9, m7, m20
+ vpdpwssd m23, m12, m9 ; a0h
+ vpdpwssd m0, m14, m9 ; a2l
+ pshufb m7, m21
+ vpdpwssd m23, m13, m7 ; a1h
+ vpdpwssd m0, m15, m7 ; a3l
+ pshufb m7, m8, m20
+ vpdpwssd m23, m14, m7 ; a2h
+ pshufb m6, m21
+ vpdpwssd m0, m13, m6 ; a1l
+ pshufb m8, m21
+ vpdpwssd m23, m15, m8 ; a3h
+ PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12
+ PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34
+ PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23l
+ vpshrdd m25, m24, m26, 16 ; 23h
+ vpshrdd m4, m3, m5, 16 ; 45l
+ vpshrdd m27, m26, m28, 16 ; 45h
+.hv_w32_loop:
+ movu m7, [srcq+strideq*1+ 0]
+ movu m9, [srcq+strideq*2+ 0]
+ movu m6, [srcq+strideq*1+ 8]
+ movu m8, [srcq+strideq*2+ 8]
+ mova m29, m10
+ mova m31, m10
+ pshufb m30, m7, m20
+ vpdpwssd m29, m12, m30 ; h0l
+ pshufb m30, m9, m20
+ vpdpwssd m31, m12, m30 ; i0l
+ pshufb m7, m21
+ vpdpwssd m29, m13, m7 ; h1l
+ pshufb m9, m21
+ vpdpwssd m31, m13, m9 ; i1l
+ pshufb m7, m6, m20
+ vpdpwssd m29, m14, m7 ; h2l
+ pshufb m9, m8, m20
+ vpdpwssd m31, m14, m9 ; i2l
+ pshufb m6, m21
+ vpdpwssd m29, m15, m6 ; h3l
+ pshufb m8, m21
+ vpdpwssd m31, m15, m8 ; i3l
+ mova m30, m10
+ vpdpwssd m30, m12, m7 ; h0h
+ movu m7, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*2]
+ vpermt2b m29, m22, m31 ; 78l
+ mova m31, m10
+ vpdpwssd m31, m12, m9 ; i0h
+ movu m9, [srcq+strideq*0+16]
+ vpdpwssd m30, m13, m6 ; h1h
+ pshufb m6, m7, m20
+ vpdpwssd m31, m13, m8 ; i1h
+ pshufb m8, m9, m20
+ vpdpwssd m30, m14, m6 ; h2h
+ mova m6, m11
+ vpdpwssd m6, m16, m0 ; A0l
+ pshufb m7, m21
+ vpdpwssd m31, m14, m8 ; i2h
+ mova m8, m11
+ vpdpwssd m8, m16, m23 ; A0h
+ pshufb m9, m21
+ vpdpwssd m30, m15, m7 ; h3h
+ mova m7, m11
+ vpdpwssd m7, m16, m1 ; B0l
+ vpdpwssd m31, m15, m9 ; i3h
+ mova m9, m11
+ vpdpwssd m9, m16, m24 ; B0h
+ mova m0, m2
+ vpdpwssd m6, m17, m2 ; A1l
+ mova m23, m25
+ vpdpwssd m8, m17, m25 ; A1h
+ mova m1, m3
+ vpdpwssd m7, m17, m3 ; B1l
+ mova m24, m26
+ vpdpwssd m9, m17, m26 ; B1h
+ vpermt2b m30, m22, m31 ; 78h
+ mova m31, [prep_endC]
+ vpdpwssd m6, m18, m4 ; A2l
+ mova m2, m4
+ vpdpwssd m8, m18, m27 ; A2h
+ mova m25, m27
+ vpdpwssd m7, m18, m5 ; B2l
+ mova m3, m5
+ vpdpwssd m9, m18, m28 ; B2h
+ mova m26, m28
+ vpshrdd m4, m5, m29, 16 ; 67l
+ vpdpwssd m6, m19, m4 ; A3l
+ vpshrdd m27, m28, m30, 16 ; 67h
+ vpdpwssd m8, m19, m27 ; A3h
+ mova m5, m29
+ vpdpwssd m7, m19, m29 ; B3l
+ mova m28, m30
+ vpdpwssd m9, m19, m30 ; B3h
+ vpermt2b m6, m31, m8
+ vpermt2b m7, m31, m9
+ mova [tmpq+wq*0], m6
+ mova [tmpq+wq*2], m7
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, r5b
+ mov srcq, r7
+ mov tmpq, r8
+ sub r5d, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
+%define base r6-pd_0to7
+ mov t0d, r7m
+ lea r6, [pd_0to7]
+ shr t0d, 11
+ vpbroadcastd m8, [base+warp_8x8t_rnd_v]
+ vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
+ psrad m14, m16, 15
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ psrad m16, 15
+ packssdw m14, m16
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ psrad m15, m16, 15
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
+ add tsq, tsq
+ psrad m16, 15
+ packssdw m15, m16
+ jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
+
+cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
+ mov t0d, r7m ; pixel_max
+ lea r6, [pd_0to7]
+ shr t0d, 11
+ vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
+ vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4]
+ call .main
+ psrad m14, m16, 13
+ call .main2
+ psrad m16, 13
+ packusdw m14, m16
+ call .main2
+ psrad m15, m16, 13
+ call .main2
+ vpbroadcastd m0, [base+bidir_shift+t0*4]
+ vpsrlvw m14, m0
+ psrad m16, 13
+ packusdw m15, m16
+ vpsrlvw m15, m0
+.end:
+ mova m0, [base+warp8x8_end]
+ vpermb m16, m0, m14
+ lea r2, [dsq*3]
+ mova [dstq+dsq*0], xm16
+ vextracti128 [dstq+dsq*1], ym16, 1
+ vextracti32x4 [dstq+dsq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ vpermb m16, m0, m15
+ lea dstq, [dstq+dsq*4]
+ mova [dstq+dsq*0], xm16
+ vextracti128 [dstq+dsq*1], ym16, 1
+ vextracti32x4 [dstq+dsq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ RET
+.main:
+ vpbroadcastd ym3, [base+pd_512]
+%if WIN64
+ mov abcdq, r5mp
+ vpaddd ym18, ym3, r6m {1to8} ; mx
+%else
+ add r5d, 512
+ vpbroadcastd ym18, r5d
+%endif
+ vpaddd ym20, ym3, r7m {1to8} ; my
+ mova ym16, [base+pd_0to7]
+ vpbroadcastd ym19, [abcdq+4*0] ; alpha
+ vpbroadcastd ym21, [abcdq+4*1] ; gamma
+ lea r4, [ssq*3+6]
+ vpdpwssd ym18, ym19, ym16 ; tmx
+ vpdpwssd ym20, ym21, ym16 ; tmy
+ sub srcq, r4
+ mova m10, [base+warp8x8_permA]
+ lea r4, [mc_warp_filter+64*8]
+ vbroadcasti32x4 m12, [base+warp8x8_permC]
+ kxnorb k1, k1, k1
+ vbroadcasti32x4 m13, [base+warp8x8_permD]
+ movu ym5, [srcq+0]
+ vinserti32x8 m5, [srcq+8], 1
+ psrad ym17, ym18, 10
+ mova m11, [base+warp8x8_permB]
+ kmovb k2, k1
+ vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0
+ psrad ym19, 16 ; beta
+ psrad ym21, 16 ; delta
+ paddd ym18, ym19
+ vpermb m4, m10, m5
+ vpbroadcastq m9, [base+warp_shift_h+t0*8]
+ pshufd m3, m3, q3120
+ paddd m7, m1, m1
+ pshufb m2, m3, m12
+ vpdpwssd m1, m4, m2
+ vpermb m5, m11, m5
+ vshufi32x4 m4, m5, q1021
+ pshufb m3, m13
+ vpdpwssd m1, m4, m3
+ call .h
+ psllq m2, m1, 32
+ paddd m1, m2
+ vpmultishiftqb m1, m9, m1
+ vpshrdq m1, m0, 48 ; 01 12
+ call .h
+ vpshrdq m2, m1, m0, 48 ; 23 34
+ call .h
+ vpshrdq m3, m2, m0, 48 ; 45 56
+.main2:
+ call .h
+ psrad ym6, ym20, 10
+ kmovb k1, k2
+ paddd ym17, ym20, ym21 ; my += delta
+ vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0
+ psrad ym16, ym17, 10
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1
+ shufps m5, m20, m6, q2020
+ mova m16, m8
+ pshufb m4, m5, m12
+ vpdpwssd m16, m1, m4 ; a0 b0
+ pshufb m5, m13
+ mova m1, m2
+ vpdpwssd m16, m2, m5 ; a1 b1
+ shufps m6, m20, m6, q3131
+ paddd ym20, ym17, ym21
+ pshufb m4, m6, m12
+ mova m2, m3
+ vpdpwssd m16, m3, m4 ; a2 b2
+ vpshrdq m3, m0, 48 ; 67 78
+ pshufb m6, m13
+ vpdpwssd m16, m3, m6 ; a3 b3
+ ret
+ALIGN function_align
+.h:
+ movu ym16, [srcq+ssq*1]
+ psrad ym6, ym18, 10
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m5, m16, [srcq+ssq*0], 1
+ kmovb k1, k2
+ paddd ym17, ym18, ym19 ; mx += beta
+ vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1
+ psrad ym16, ym17, 10
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2
+ vpermb m4, m10, m5
+ shufps m16, m18, m6, q2020
+ shufps m6, m18, m6, q3131
+ mova m0, m7
+ pshufb m18, m16, m12
+ vpdpwssd m0, m4, m18 ; a0 b0
+ vpermb m5, m11, m5
+ pshufb m18, m6, m13
+ vpdpwssd m0, m5, m18 ; a3 b3
+ paddd ym18, ym17, ym19
+ vshufi32x4 m17, m4, m5, q1021
+ pshufb m16, m13
+ vpdpwssd m0, m17, m16 ; a1 b1
+ vshufi32x4 m4, m5, q2132
+ pshufb m6, m12
+ vpdpwssd m0, m4, m6 ; a2 b2
+ vpmultishiftqb m0, m9, m0 ; a a b b
+ ret
+
+%macro BIDIR_FN 0
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm0, ym1, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ vextracti32x4 xm0, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 5
+%else
+DECLARE_REG_TMP 7
+%endif
+
+cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ vpbroadcastd m2, [base+avg_round+t0*4]
+ vpbroadcastd m3, [base+avg_shift+t0*4]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+64*0]
+ paddsw m0, [tmp2q+64*0]
+ mova m1, [tmp1q+64*1]
+ paddsw m1, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ pmaxsw m0, m2
+ pmaxsw m1, m2
+ psubsw m0, m2
+ psubsw m1, m2
+ vpsrlvw m0, m3
+ vpsrlvw m1, m3
+ ret
+
+cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ mov t0d, r7m ; pixel_max
+ shr t0d, 11
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m5, [base+w_avg_round+t0*4]
+ vpbroadcastd m7, [base+bidir_shift+t0*4]
+ add wq, r6
+ mov r6d, r6m ; weight
+ lea t0d, [r6-16]
+ shl r6d, 16
+ sub r6d, t0d ; 16-weight, weight
+ movifnidn hd, hm
+ vpbroadcastd m6, r6d
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m3, [tmp1q+64*0]
+ mova m1, [tmp2q+64*0]
+ mova m0, [tmp1q+64*1]
+ mova m4, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ mova m0, m5
+ vpdpwssd m0, m6, m2
+ mova m2, m5
+ vpdpwssd m2, m6, m1
+ mova m1, m5
+ vpdpwssd m1, m6, m3
+ mova m3, m5
+ vpdpwssd m3, m6, m4
+ REPX {psrad x, 2}, m0, m2, m1, m3
+ packusdw m0, m2
+ packusdw m1, m3
+ vpsrlvw m0, m7
+ vpsrlvw m1, m7
+ ret
+
+cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_64]
+ vpbroadcastd m9, [base+mask_round+r6*4]
+ vpbroadcastd m10, [base+bidir_shift+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ BIDIR_FN
+ALIGN function_align
+.main:
+ pmovzxbw m1, [maskq+32*0]
+ mova m4, [tmp1q+64*0]
+ mova m2, [tmp2q+64*0]
+ pmovzxbw m6, [maskq+32*1]
+ mova m5, [tmp1q+64*1]
+ mova m3, [tmp2q+64*1]
+ add maskq, 32*2
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ punpcklwd m7, m4, m2
+ punpckhwd m4, m2
+ psubw m0, m8, m1
+ punpcklwd m2, m1, m0 ; m, 64-m
+ punpckhwd m1, m0
+ mova m0, m9
+ vpdpwssd m0, m7, m2
+ mova m2, m9
+ vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
+ punpcklwd m7, m5, m3
+ punpckhwd m5, m3
+ psubw m1, m8, m6
+ punpcklwd m3, m6, m1
+ punpckhwd m6, m1
+ mova m1, m9
+ vpdpwssd m1, m7, m3
+ mova m3, m9
+ vpdpwssd m3, m5, m6
+ REPX {psrad x, 4}, m0, m2, m1, m3
+ packusdw m0, m2
+ packusdw m1, m3
+ vpsrlvw m0, m10
+ vpsrlvw m1, m10
+ ret
+
+cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m11, [base+pw_64]
+ vpbroadcastd m12, [base+mask_round+r6*4]
+ vpbroadcastd m13, [base+bidir_shift+r6*4]
+ mov r6d, r7m ; sign
+ vpbroadcastd m14, [base+w_mask_round+r6*4]
+ mova ym15, [w_mask_end42x]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m4, [w_mask_shuf4]
+ vpermt2b m2, m4, m3
+ mova m3, m14
+ vpdpbusd m3, m2, [pb_64] {1to16}
+ vpermb m3, m15, m3
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ mova [maskq], xm3
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8:
+ mova m8, [w_mask_shuf8]
+ vpbroadcastd m9, [pb_64]
+ jmp .w8_start
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8_start:
+ vpermt2b m2, m8, m3
+ mova m3, m14
+ vpdpbusd m3, m2, m9
+ vpermb m3, m15, m3
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ mova [maskq], xm3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16:
+ mova m8, [w_mask_shuf16]
+ vpbroadcastd m9, [pb_64]
+ jmp .w16_start
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16_start:
+ vpermt2b m2, m8, m3
+ mova m3, m14
+ vpdpbusd m3, m2, m9
+ vpermb m3, m15, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ mova [maskq], xm3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w32:
+ paddw m2, m3
+ mova m8, m14
+ vpdpwssd m8, m11, m2
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ call .main
+ paddw m2, m3
+ mova m3, m14
+ vpdpwssd m3, m11, m2
+ vpermt2b m8, m15, m3
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m1
+ mova [maskq], ym8
+ sub hd, 4
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w64:
+ mova m8, m2
+ mova m9, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ call .main
+ paddw m8, m2
+ paddw m9, m3
+ mova m2, m14
+ vpdpwssd m2, m11, m8
+ mova m3, m14
+ vpdpwssd m3, m11, m9
+ vpermt2b m2, m15, m3
+ mova [dstq+strideq*1+64*0], m0
+ mova [dstq+strideq*1+64*1], m1
+ mova [maskq], ym2
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 64
+.w128:
+ mova m16, m2
+ mova m8, m3
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*0+64*1], m1
+ call .main
+ mova m17, m2
+ mova m9, m3
+ mova [dstq+strideq*0+64*2], m0
+ mova [dstq+strideq*0+64*3], m1
+ call .main
+ paddw m2, m16
+ paddw m3, m8
+ mova m16, m14
+ vpdpwssd m16, m11, m2
+ mova m8, m14
+ vpdpwssd m8, m11, m3
+ mova [dstq+strideq*1+64*0], m0
+ mova [dstq+strideq*1+64*1], m1
+ call .main
+ paddw m2, m17
+ paddw m3, m9
+ mova m17, m14
+ vpdpwssd m17, m11, m2
+ mova m9, m14
+ vpdpwssd m9, m11, m3
+ vpermt2b m16, m15, m8
+ vpermt2b m17, m15, m9
+ mova [dstq+strideq*1+64*2], m0
+ mova [dstq+strideq*1+64*3], m1
+ mova [maskq+32*0], ym16
+ mova [maskq+32*1], ym17
+ sub hd, 2
+ jg .w128_loop
+ vzeroupper
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m10, m6
+ psrlw m6, 10 ; 64-m
+ psubw m2, m11, m6 ; m
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m12
+ vpdpwssd m0, m5, m1
+ mova m1, m12
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m10, m5
+ psrlw m5, 10
+ psubw m3, m11, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m12
+ vpdpwssd m1, m6, m4
+ mova m4, m12
+ vpdpwssd m4, m7, m5
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpsrlvw m0, m13
+ vpsrlvw m1, m13
+ ret
+
+cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m9, [base+pw_64]
+ vpbroadcastd m10, [base+mask_round+r6*4]
+ vpbroadcastd m11, [base+bidir_shift+r6*4]
+ mov r6d, r7m ; sign
+ vpbroadcastd m12, [base+w_mask_round+r6*4]
+ mova ym13, [w_mask_end42x]
+ mov maskq, maskmp
+ add wq, r7
+ paddw m14, m9, m9 ; pw_128
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m8, m6
+ psrlw m6, 10
+ psubw m2, m9, m6
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m10
+ vpdpwssd m0, m5, m1
+ mova m1, m10
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m8, m5
+ psrlw m5, 10
+ psubw m3, m9, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m10
+ vpdpwssd m1, m6, m4
+ mova m4, m10
+ vpdpwssd m4, m7, m5
+ mova m5, m12
+ vpdpwssd m5, m14, m2
+ mova m2, m12
+ vpdpwssd m2, m14, m3
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpermt2b m5, m13, m2
+ vpsrlvw m0, m11
+ vpsrlvw m1, m11
+ mova [maskq], ym5
+ add maskq, 32
+ ret
+
+cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movifnidn hd, hm
+ shr r6d, 11
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ vpbroadcastd m9, [base+pw_64]
+ vpbroadcastd m10, [base+mask_round+r6*4]
+ mova m11, [w_mask_end444]
+ vpbroadcastd m12, [base+bidir_shift+r6*4]
+ mov maskq, maskmp
+ add wq, r7
+ call .main
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ cmp hd, 8
+ jl .w4_end
+ vextracti32x4 xm2, m0, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ je .w4_end
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
+.w4_end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w8:
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 8
+ jl .w8_end
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm1
+ vextracti32x4 [dstq+strideq*1], ym1, 1
+ vextracti32x4 [dstq+strideq*2], m1, 2
+ vextracti32x4 [dstq+stride3q ], m1, 3
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*4]
+.w16:
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+stride3q ], m1, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ call .main
+ mova [dstq+64*2], m0
+ mova [dstq+64*3], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m1, [tmp1q+64*0]
+ mova m3, [tmp2q+64*0]
+ mova m4, [tmp1q+64*1]
+ mova m7, [tmp2q+64*1]
+ add tmp1q, 64*2
+ add tmp2q, 64*2
+ psubsw m6, m1, m3
+ punpcklwd m5, m3, m1
+ pabsw m6, m6
+ punpckhwd m3, m1
+ psubusw m6, m8, m6
+ psrlw m6, 10
+ psubw m2, m9, m6
+ punpcklwd m1, m6, m2
+ punpckhwd m6, m2
+ mova m0, m10
+ vpdpwssd m0, m5, m1
+ mova m1, m10
+ vpdpwssd m1, m3, m6
+ psubsw m5, m4, m7
+ punpcklwd m6, m7, m4
+ pabsw m5, m5
+ punpckhwd m7, m4
+ psubusw m5, m8, m5
+ psrlw m5, 10
+ psubw m3, m9, m5
+ punpcklwd m4, m5, m3
+ psrad m0, 4
+ punpckhwd m5, m3
+ psrad m1, 4
+ packusdw m0, m1
+ mova m1, m10
+ vpdpwssd m1, m6, m4
+ mova m4, m10
+ vpdpwssd m4, m7, m5
+ vpermt2b m2, m11, m3
+ psrad m1, 4
+ psrad m4, 4
+ packusdw m1, m4
+ vpsrlvw m0, m12
+ vpsrlvw m1, m12
+ mova [maskq], m2
+ add maskq, 64
+ ret
+
+cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx512icl_table
+ lea r6, [blend_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ vpbroadcastd m6, [base+pw_m512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ pmovzxbw ym19, [maskq]
+ movq xm16, [dstq+dsq*0]
+ movhps xm16, [dstq+dsq*1]
+ vpbroadcastq ym17, [dstq+dsq*2]
+ vpbroadcastq ym18, [dstq+r6 ]
+ pmullw ym19, ym6
+ vpblendd ym16, ym17, 0x30
+ vpblendd ym16, ym18, 0xc0
+ psubw ym17, ym16, [tmpq]
+ add maskq, 16
+ add tmpq, 32
+ pmulhrsw ym17, ym19
+ paddw ym16, ym17
+ vextracti128 xm17, ym16, 1
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ movq [dstq+dsq*2], xm17
+ movhps [dstq+r6 ], xm17
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ vzeroupper
+ RET
+.w8:
+ pmovzxbw m2, [maskq]
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ vinserti32x4 m0, [dstq+dsq*2], 2
+ vinserti32x4 m0, [dstq+r6 ], 3
+ pmullw m2, m6
+ psubw m1, m0, [tmpq]
+ add maskq, 32
+ add tmpq, 64
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ vextracti32x4 [dstq+dsq*2], m0, 2
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+.w16:
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ mova ym1, [dstq+dsq*2]
+ vinserti32x8 m1, [dstq+r6 ], 1
+ pmullw m4, m6
+ pmullw m5, m6
+ psubw m2, m0, [tmpq+64*0]
+ psubw m3, m1, [tmpq+64*1]
+ add maskq, 32*2
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ mova [dstq+dsq*2], ym1
+ vextracti32x8 [dstq+r6 ], m1, 1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ pmovzxbw m4, [maskq+32*0]
+ pmovzxbw m5, [maskq+32*1]
+ mova m0, [dstq+dsq*0]
+ mova m1, [dstq+dsq*1]
+ pmullw m4, m6
+ pmullw m5, m6
+ psubw m2, m0, [tmpq+ 64*0]
+ psubw m3, m1, [tmpq+ 64*1]
+ add maskq, 32*2
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32
+ RET
+
+cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
+ lea r5, [blend_v_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ vpbroadcastd xmm2, [obmc_masks_avx2+2*2]
+.w2_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ psubw xmm1, xmm0, xmm1
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ vpbroadcastq xmm2, [obmc_masks_avx2+4*2]
+.w4_loop:
+ movq xmm0, [dstq+dsq*0]
+ movhps xmm0, [dstq+dsq*1]
+ psubw xmm1, xmm0, [tmpq]
+ add tmpq, 8*2
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ psubw ym1, ym0, [tmpq]
+ add tmpq, 16*2
+ pmulhrsw ym1, ym2
+ paddw ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
+.w16_loop:
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ psubw m1, m0, [tmpq]
+ add tmpq, 32*2
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ mova m4, [obmc_masks_avx2+32*2]
+.w32_loop:
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 64*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+
+cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
+%define base r6-$$
+ lea r6, [$$]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [base+blend_h_avx512icl_table+wq*4]
+ lea maskq, [base+obmc_masks_avx2+hq*2]
+ lea hd, [hq*3]
+ lea wq, [base+blend_h_avx512icl_table+wq]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ punpcklwd xmm2, xmm2
+ psubw xmm1, xmm0, xmm1
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova xmm3, [blend_shuf]
+.w4_loop:
+ movq xmm0, [dstq+dsq*0]
+ movhps xmm0, [dstq+dsq*1]
+ movd xmm2, [maskq+hq*2]
+ psubw xmm1, xmm0, [tmpq]
+ add tmpq, 8*2
+ pshufb xmm2, xmm3
+ pmulhrsw xmm1, xmm2
+ paddw xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ vbroadcasti32x4 ym3, [blend_shuf]
+ shufpd ym3, ym3, 0x0c
+.w8_loop:
+ mova xm0, [dstq+dsq*0]
+ vinserti32x4 ym0, [dstq+dsq*1], 1
+ vpbroadcastd ym2, [maskq+hq*2]
+ psubw ym1, ym0, [tmpq]
+ add tmpq, 16*2
+ pshufb ym2, ym3
+ pmulhrsw ym1, ym2
+ paddw ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 m3, [blend_shuf]
+ shufpd m3, m3, 0xf0
+.w16_loop:
+ mova ym0, [dstq+dsq*0]
+ vinserti32x8 m0, [dstq+dsq*1], 1
+ vpbroadcastd m2, [maskq+hq*2]
+ psubw m1, m0, [tmpq]
+ add tmpq, 32*2
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+.w32:
+ vpbroadcastw m4, [maskq+hq*2]
+ vpbroadcastw m5, [maskq+hq*2+2]
+ mova m0, [dstq+dsq*0]
+ psubw m2, m0, [tmpq+ 64*0]
+ mova m1, [dstq+dsq*1]
+ psubw m3, m1, [tmpq+ 64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w32
+ RET
+.w64:
+ vpbroadcastw m4, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m2, m0, [tmpq+64*0]
+ mova m1, [dstq+64*1]
+ psubw m3, m1, [tmpq+64*1]
+ add tmpq, 64*2
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m8, [maskq+hq*2]
+ mova m0, [dstq+64*0]
+ psubw m4, m0, [tmpq+64*0]
+ mova m1, [dstq+64*1]
+ psubw m5, m1, [tmpq+64*1]
+ mova m2, [dstq+64*2]
+ psubw m6, m2, [tmpq+64*2]
+ mova m3, [dstq+64*3]
+ psubw m7, m3, [tmpq+64*3]
+ add tmpq, 64*4
+ REPX {pmulhrsw x, m8}, m4, m5, m6, m7
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ mova [dstq+64*2], m2
+ mova [dstq+64*3], m3
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ mov r6, ~0
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ kmovq k6, r6
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pd_16384]
+ vpbroadcastd m7, [base+pd_63]
+ mova m24, [base+resize_permA]
+ mova m25, [base+resize_permB]
+ mova m26, [base+resize_permC]
+ mova m27, [base+resize_permD]
+ vbroadcasti32x4 m28, [base+resize_shufA]
+ vbroadcasti32x4 m29, [base+resize_shufB]
+ mova m30, [base+resize_permE]
+ vpbroadcastw ym31, pxmaxm
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+ pslld m5, 4 ; dx*16
+ pslld m6, 14
+ pxor m2, m2
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k5, m1, m1
+ pand m9, m7 ; filter offset (masked)
+ ktestw k5, k5
+ jz .load
+ vpbroadcastq m14, [base+pd_0_4]
+ vpermq m10, m0, q1100
+ vpermq m11, m0, q3322
+ vpermq m20, m1, q1100
+ vpermq m21, m1, q3322
+ punpckldq m10, m10
+ punpckldq m11, m11
+ punpckldq m20, m20
+ punpckldq m21, m21
+ paddd m10, m14
+ paddd m11, m14
+ paddd m20, m14
+ paddd m21, m14
+ vextracti32x8 ym12, m10, 1
+ vextracti32x8 ym13, m11, 1
+ vextracti32x8 ym22, m20, 1
+ vextracti32x8 ym23, m21, 1
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
+ vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
+ vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
+ vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
+ vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
+ vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
+ vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
+ pshufb m16, m0
+ pshufb m17, m1
+ pshufb m18, m14
+ pshufb m19, m15
+ mova m20, m24
+ mova m22, m24
+ mova m21, m25
+ mova m23, m25
+ vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
+ vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
+ vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
+ vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
+ mova m15, m26
+ mova m17, m26
+ mova m16, m27
+ mova m18, m27
+ vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
+ vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
+ vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
+ vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
+ kmovq k1, k6
+ kmovq k2, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
+ pshufb m10, m11, m28
+ pshufb m11, m11, m29
+ pshufb m12, m13, m28
+ pshufb m13, m13, m29
+ jmp .filter
+.load:
+ kmovq k1, k6
+ kmovq k2, k6
+ kmovq k3, k6
+ kmovq k4, k6
+ vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
+ pshufb m10, m11, m28
+ pshufb m11, m11, m29
+ pshufb m12, m13, m28
+ pshufb m13, m13, m29
+ vpgatherdd m15{k3}, [srcq+m0*2+ 0]
+ vpgatherdd m16{k4}, [srcq+m0*2+ 4]
+ kmovq k1, k6
+ kmovq k2, k6
+ vpgatherdd m17{k1}, [srcq+m0*2+ 8]
+ vpgatherdd m18{k2}, [srcq+m0*2+12]
+.filter:
+ mova m14, m2
+ vpdpwssd m14, m15, m10
+ vpdpwssd m14, m16, m11
+ vpdpwssd m14, m17, m12
+ vpdpwssd m14, m18, m13
+ psubd m14, m3, m14
+ psrad m14, 15
+ packusdw m14, m14
+ vpermq m14, m30, m14
+ pminsw ym14, ym31
+ mova [dstq+xq*2], ym14
+ paddd m4, m5
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm
new file mode 100644
index 0000000000..fde8e372a3
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc16_sse.asm
@@ -0,0 +1,8731 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+; dav1d_obmc_masks[] << 9
+obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0
+ dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0
+ dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120
+ dw 4096, 3072, 2048, 1536, 0, 0, 0, 0
+ dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
+ dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608
+ dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024
+
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
+spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
+bdct_lb_q: times 8 db 0
+ times 8 db 4
+ times 8 db 8
+ times 8 db 12
+
+pw_2: times 8 dw 2
+pw_16: times 4 dw 16
+prep_mul: times 4 dw 16
+ times 8 dw 4
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_2048: times 4 dw 2048
+bidir_mul: times 4 dw 2048
+pw_8192: times 8 dw 8192
+pw_27615: times 8 dw 27615
+pw_32766: times 8 dw 32766
+pw_m512: times 8 dw -512
+pd_63: times 4 dd 63
+pd_64: times 4 dd 64
+pd_512: times 4 dd 512
+pd_m524256: times 4 dd -524256 ; (-8192 << 6) + 32
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000: times 4 dd 0x4000
+pq_0x400000: times 2 dq 0x400000
+pq_0x40000000: times 2 dq 0x40000000
+pd_65538: times 2 dd 65538
+
+put_bilin_h_rnd: times 4 dw 8
+ times 4 dw 10
+s_8tap_h_rnd: times 2 dd 2
+ times 2 dd 8
+put_s_8tap_v_rnd: times 2 dd 512
+ times 2 dd 128
+s_8tap_h_sh: dd 2, 4
+put_s_8tap_v_sh: dd 10, 8
+bidir_rnd: times 4 dw -16400
+ times 4 dw -16388
+put_8tap_h_rnd: dd 34, 34, 40, 40
+prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
+prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
+
+warp8x8_shift: dd 11, 13
+warp8x8_rnd1: dd 1024, 1024, 4096, 4096
+warp8x8_rnd2: times 4 dw 4096
+ times 4 dw 16384
+warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+cextern mc_warp_filter
+cextern resize_filter
+
+SECTION .text
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+INIT_XMM ssse3
+cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
+%define base t0-put_ssse3
+ mov mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn wd, wm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ add wq, t0
+ movifnidn hd, hm
+ jmp wq
+.put_w2:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ add srcq, 16*8
+ add dstq, 16*8
+.put_w128_loop:
+ movu m0, [srcq-16*8]
+ movu m1, [srcq-16*7]
+ movu m2, [srcq-16*6]
+ movu m3, [srcq-16*5]
+ mova [dstq-16*8], m0
+ mova [dstq-16*7], m1
+ mova [dstq-16*6], m2
+ mova [dstq-16*5], m3
+ movu m0, [srcq-16*4]
+ movu m1, [srcq-16*3]
+ movu m2, [srcq-16*2]
+ movu m3, [srcq-16*1]
+ mova [dstq-16*4], m0
+ mova [dstq-16*3], m1
+ mova [dstq-16*2], m2
+ mova [dstq-16*1], m3
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, ssq
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128_loop
+ RET
+.h:
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ mova m4, [base+pw_16]
+ pshufb m5, [base+pw_256]
+ psubw m4, m5
+ test mxyd, mxyd
+ jnz .hv
+ ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
+ mov r6d, r8m ; bitdepth_max
+ shr r6d, 11
+ movddup m3, [base+put_bilin_h_rnd+r6*8]
+ movifnidn hd, hm
+ sub wd, 8
+ jg .h_w16
+ je .h_w8
+ cmp wd, -4
+ je .h_w4
+.h_w2:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4, m1
+ psrlq m1, 16
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movd [dstq+dsq*0], m0
+ punpckhqdq m0, m0
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ movq m1, [srcq+ssq*0+2]
+ movhps m1, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ neg wq
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m0, 4
+ psrlw m1, 4
+ mova [dstq+r6*2+16*0], m0
+ mova [dstq+r6*2+16*1], m1
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ shl mxyd, 11
+ movd m5, mxyd
+ pshufb m5, [base+pw_256]
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .v_w8
+ je .v_w4
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movd m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq m0, [srcq+ssq*0]
+.v_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklqdq m2, m0, m1
+ movq m0, [srcq+ssq*0]
+ punpcklqdq m1, m0
+ psubw m1, m2
+ pmulhrsw m1, m5
+ paddw m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ mov r7, srcq
+ lea r6d, [wq+hq-256]
+ mov r4, dstq
+%else
+ mov r6, srcq
+%endif
+.v_w8_loop0:
+ movu m0, [srcq+ssq*0]
+.v_w8_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ psubw m1, m3, m0
+ pmulhrsw m1, m5
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ psubw m2, m0, m3
+ pmulhrsw m2, m5
+ paddw m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+%if ARCH_X86_64
+ add r7, 16
+ add r4, 16
+ movzx hd, r6b
+ mov srcq, r7
+ mov dstq, r4
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .v_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11
+ mova m3, [base+pw_2]
+ movd m6, mxyd
+ mova m7, [base+pw_8192]
+ pshufb m6, [base+pw_256]
+ test dword r8m, 0x800
+ jnz .hv_12bpc
+ psllw m4, 2
+ psllw m5, 2
+ mova m7, [base+pw_2048]
+.hv_12bpc:
+ movifnidn hd, hm
+ cmp wd, 4
+ jg .hv_w8
+ je .hv_w4
+.hv_w2:
+ movddup m0, [srcq+ssq*0]
+ pshufhw m1, m0, q0321
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w2_loop:
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m2, [srcq+ssq*0]
+ pmullw m1, m4, m2
+ psrlq m2, 16
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 _ 2 _
+ shufpd m2, m0, m1, 0x01 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movd [dstq+dsq*0], m1
+ punpckhqdq m1, m1
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ movddup m0, [srcq+ssq*0]
+ movddup m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ movq m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ movhps m2, [srcq+ssq*0+2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+%if ARCH_X86_64
+%if WIN64
+ push r7
+%endif
+ shl wd, 5
+ lea r6d, [wq+hq-256]
+ mov r4, srcq
+ mov r7, dstq
+%else
+ mov r6, srcq
+%endif
+.hv_w8_loop0:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m1, m5
+ paddw m0, m3
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m3
+ paddw m1, m2
+ psrlw m1, 2
+ psubw m2, m1, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ pmulhrsw m2, m7
+ mova [dstq+dsq*0], m2
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*0+2]
+ pmullw m0, m4
+ pmullw m2, m5
+ paddw m0, m3
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m1
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m1
+ pmulhrsw m2, m7
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+%else
+ mov dstq, dstmp
+ add r6, 16
+ mov hd, hm
+ add dstq, 16
+ mov srcq, r6
+ mov dstmp, dstq
+ sub wd, 8
+%endif
+ jg .hv_w8_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
+cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
+%define base r6-prep_ssse3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep_ssse3
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ tzcnt wd, wd
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mov r5d, r7m ; bitdepth_max
+ mova m5, [base+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ movddup m4, [base+prep_mul+r5*8]
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w32:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ dec hd
+ jg .prep_w32
+ RET
+.prep_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ movu m0, [srcq+16* 0]
+ movu m1, [srcq+16* 1]
+ movu m2, [srcq+16* 2]
+ movu m3, [srcq+16* 3]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ movu m0, [srcq+16* 4]
+ movu m1, [srcq+16* 5]
+ movu m2, [srcq+16* 6]
+ movu m3, [srcq+16* 7]
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+16*4], m0
+ mova [tmpq+16*5], m1
+ mova [tmpq+16*6], m2
+ mova [tmpq+16*7], m3
+ movu m0, [srcq+16* 8]
+ movu m1, [srcq+16* 9]
+ movu m2, [srcq+16*10]
+ movu m3, [srcq+16*11]
+ add tmpq, 16*16
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*8], m0
+ mova [tmpq-16*7], m1
+ mova [tmpq-16*6], m2
+ mova [tmpq-16*5], m3
+ movu m0, [srcq+16*12]
+ movu m1, [srcq+16*13]
+ movu m2, [srcq+16*14]
+ movu m3, [srcq+16*15]
+ add srcq, strideq
+ REPX {pmullw x, m4}, m0, m1, m2, m3
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq-16*4], m0
+ mova [tmpq-16*3], m1
+ mova [tmpq-16*2], m2
+ mova [tmpq-16*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ movd m4, mxyd
+ mov mxyd, r6m ; my
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ sub wd, 8
+ je .h_w8
+ jg .h_w16
+.h_w4:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*0+2]
+ movhps m1, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ lea srcq, [srcq+wq*2]
+ neg wq
+.h_w16_loop0:
+ mov r6, wq
+.h_w16_loop:
+ movu m0, [srcq+r6*2+ 0]
+ movu m1, [srcq+r6*2+ 2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ movu m1, [srcq+r6*2+16]
+ movu m2, [srcq+r6*2+18]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ add r6, 16
+ jl .h_w16_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w16_loop0
+ RET
+.v:
+ movd m4, mxyd
+ mova m3, [base+pw_16]
+ pshufb m4, [base+pw_256]
+ mova m5, [base+pw_32766]
+ psubw m3, m4
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+ psllw m4, 2
+.v_12bpc:
+ cmp wd, 8
+ je .v_w8
+ jg .v_w16
+.v_w4:
+ movq m0, [srcq+strideq*0]
+.v_w4_loop:
+ movq m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklqdq m1, m0, m2 ; 0 1
+ movq m0, [srcq+strideq*0]
+ punpcklqdq m2, m0 ; 1 2
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movu m0, [srcq+strideq*0]
+.v_w8_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+16*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+16*1], m1
+ add tmpq, 16*2
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.v_w16_loop0:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m3
+ pmullw m1, m4, m2
+ psubw m0, m5
+ paddw m1, m0
+ movu m0, [srcq+strideq*0]
+ psraw m1, 2
+ pmullw m2, m3
+ mova [tmpq+wq*0], m1
+ pmullw m1, m4, m0
+ psubw m2, m5
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq+wq*2], m1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .v_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+.hv:
+ WIN64_SPILL_XMM 7
+ shl mxyd, 11
+ movd m6, mxyd
+ pshufb m6, [base+pw_256]
+ cmp wd, 8
+ je .hv_w8
+ jg .hv_w16
+.hv_w4:
+ movddup m0, [srcq+strideq*0]
+ movddup m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ movhps m1, [srcq+strideq*0]
+ movhps m2, [srcq+strideq*0+2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ shufpd m2, m0, m1, 0x01 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+16*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+%if WIN64
+ push r7
+%endif
+ mov r5, srcq
+%if ARCH_X86_64
+ lea r6d, [wq*4-32]
+ mov wd, wd
+ lea r6d, [hq+r6*8]
+ mov r7, tmpq
+%else
+ mov r6d, wd
+%endif
+.hv_w16_loop0:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m1, m4
+ psubw m0, m5
+ paddw m0, m1
+ psraw m0, 2
+.hv_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m3
+ pmullw m2, m4
+ psubw m1, m5
+ paddw m1, m2
+ psraw m1, 2
+ psubw m2, m1, m0
+ pmulhrsw m2, m6
+ paddw m2, m0
+ mova [tmpq+wq*0], m2
+ movu m0, [srcq+strideq*0]
+ movu m2, [srcq+strideq*0+2]
+ pmullw m0, m3
+ pmullw m2, m4
+ psubw m0, m5
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w16_loop
+%if ARCH_X86_64
+ add r5, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+%else
+ mov tmpq, tmpmp
+ add r5, 16
+ mov hd, hm
+ add tmpq, 16
+ mov srcq, r5
+ mov tmpmp, tmpq
+ sub r6d, 8
+%endif
+ jg .hv_w16_loop0
+%if WIN64
+ pop r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2, 6
+%elif WIN64
+DECLARE_REG_TMP 4, 5, 8
+%else
+DECLARE_REG_TMP 7, 8, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r1b
+%define myd r1
+%define myq r1
+%define m8 [esp+16*0]
+%define m9 [esp+16*1]
+%define m10 [esp+16*2]
+%define m11 [esp+16*3]
+%define m12 [esp+16*4]
+%define m13 [esp+16*5]
+%define m14 [esp+16*6]
+%define m15 [esp+16*7]
+%else
+cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
+%endif
+%define base t2-put_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, put_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base+put_ssse3_table+wq*2]
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ add wq, t2
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov myd, r8m
+ movd m5, r8m
+ shr myd, 11
+ movddup m4, [base+put_8tap_h_rnd+myq*8]
+ movifnidn dsq, dsmp
+ pshufb m5, [base+pw_256]
+ cmp wd, 4
+ jg .h_w8
+ movzx mxd, mxb
+ lea srcq, [srcq-2]
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ je .h_w4
+.h_w2:
+ mova m2, [base+spel_h_shuf2]
+ pshufd m3, m3, q2121
+.h_w2_loop:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m2
+ pshufb m1, m2
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ phaddd m0, m1
+ paddd m0, m4
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movd [dstq+dsq*0], m0
+ pshuflw m0, m0, q3232
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ WIN64_SPILL_XMM 8
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+ pshufd m2, m3, q1111
+ pshufd m3, m3, q2222
+.h_w4_loop:
+ movu m1, [srcq]
+ add srcq, ssq
+ pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m2
+ pmaddwd m1, m3
+ paddd m0, m4
+ paddd m0, m1
+ psrad m0, 6
+ packssdw m0, m0
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ movq [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w4_loop
+ RET
+.h_w8:
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 12
+%endif
+ shr mxd, 16
+ movq m3, [base+subpel_filters+mxq*8]
+ movifnidn dstq, dstmp
+ mova m6, [base+spel_h_shufA]
+ mova m7, [base+spel_h_shufB]
+%if UNIX64
+ mov wd, wd
+%endif
+ lea srcq, [srcq+wq*2]
+ punpcklbw m3, m3
+ lea dstq, [dstq+wq*2]
+ psraw m3, 8
+ neg wq
+%if ARCH_X86_32
+ ALLOC_STACK -16*4
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6*2- 6]
+ movu m1, [srcq+r6*2+ 2]
+ pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m7 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m8 ; abcd0
+ pmaddwd m0, m9 ; abcd1
+ pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m7 ; 6 7 7 8 8 9 9 a
+ paddd m2, m4
+ paddd m0, m2
+ pmaddwd m2, m10, m3 ; abcd2
+ pmaddwd m3, m8 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m11, m1 ; abcd3
+ pmaddwd m1, m9 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6*2+10]
+ paddd m3, m4
+ paddd m1, m3
+ pshufb m3, m2, m6 ; 8 9 9 a a b b c
+ pshufb m2, m7 ; a b b c c d d e
+ pmaddwd m3, m10 ; efgh2
+ pmaddwd m2, m11 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+ pxor m1, m1
+ pminsw m0, m5
+ pmaxsw m0, m1
+ mova [dstq+r6*2], m0
+ add r6, 8
+ jl .h_w8_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w8_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if WIN64
+ WIN64_SPILL_XMM 15
+%endif
+ movd m7, r8m
+ movifnidn dstq, dstmp
+ movifnidn dsq, dsmp
+ punpcklbw m3, m3
+ pshufb m7, [base+pw_256]
+ psraw m3, 8 ; sign-extend
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ cmp wd, 2
+ jne .v_w4
+.v_w2:
+ movd m1, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ movd m2, [srcq+ssq*2]
+ add srcq, r6
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m6, [srcq+ssq*2]
+ add srcq, r6
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m4 ; 0 1
+ punpckldq m4, m2 ; 1 2
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m6 ; 4 5
+ punpckldq m6, m0 ; 5 6
+ punpcklwd m1, m4 ; 01 12
+ punpcklwd m2, m5 ; 23 34
+ punpcklwd m3, m6 ; 45 56
+ pxor m6, m6
+.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m8, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m9 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m10 ; a2 b2
+ paddd m5, m3
+ punpckldq m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpckldq m4, m0 ; 7 8
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m11, m3 ; a3 b3
+ paddd m5, m4
+ psrad m5, 5
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pavgw m5, m6
+ pminsw m5, m7
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcmp, srcq
+%endif
+ lea wd, [wq+hq-(1<<16)]
+%else
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+%endif
+.v_w4_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, r6
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, r6
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_w4_loop_start
+.v_w4_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_w4_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m3
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ pxor m2, m2
+ pmaxsw m1, m2
+ pavgw m1, m2
+ pminsw m1, m7
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*29]
+ mov dstq, [esp+4*30]
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*29], srcq
+ mov [esp+4*30], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ movzx hd, ww
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ sub wd, 1<<16
+%else
+.v_w4_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m14
+ psrad m12, 5
+ psrad m13, 5
+ packssdw m12, m13
+ pxor m13, m13
+ pmaxsw m12, m13
+ pavgw m12, m13
+ pminsw m12, m7
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .v_w4_loop0
+ RET
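+; combined horizontal+vertical (2-d) filtering path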
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+%if ARCH_X86_32
+ movd m4, r8m
+ mova m6, [base+pd_512]
+ pshufb m4, [base+pw_256]
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ movd m15, r8m
+ pshufb m15, [base+pw_256]
+%endif
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ je .hv_w4
+ movq m0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m5, [base+spel_h_shuf2]
+ ALLOC_STACK -16*8
+%else
+ mova m6, [base+pd_512]
+ mova m9, [base+spel_h_shuf2]
+%endif
+ pshuflw m0, m0, q2121
+ pxor m7, m7
+ punpcklbw m7, m0
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r8m, 0x800
+ jz .hv_w2_10bpc
+ psraw m7, 2
+ psllw m3, 2
+.hv_w2_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+%if ARCH_X86_32
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m9, m5
+ mova m11, m0
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+ mova m15, m4
+%else
+ pshufd m11, m3, q0000
+ pshufd m12, m3, q1111
+ pshufd m13, m3, q2222
+ pshufd m14, m3, q3333
+%endif
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ movu m1, [srcq+ssq*2]
+ add srcq, r6
+ movu m4, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m2, m3, m1, m4
+%else
+ REPX {pshufb x, m9}, m2, m3, m1, m4
+%endif
+ REPX {pmaddwd x, m7}, m2, m3, m1, m4
+ phaddd m2, m3 ; 0 1
+ phaddd m1, m4 ; 2 3
+ movu m3, [srcq+ssq*1]
+ movu m4, [srcq+ssq*2]
+ add srcq, r6
+ movu m0, [srcq+ssq*0]
+%if ARCH_X86_32
+ REPX {pshufb x, m5}, m3, m4, m0
+%else
+ REPX {pshufb x, m9}, m3, m4, m0
+%endif
+ REPX {pmaddwd x, m7}, m3, m4, m0
+ phaddd m3, m4 ; 4 5
+ phaddd m0, m0 ; 6 6
+ REPX {paddd x, m6}, m2, m1, m3, m0
+ REPX {psrad x, 10}, m2, m1, m3, m0
+ packssdw m2, m1 ; 0 1 2 3
+ packssdw m3, m0 ; 4 5 6 _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ pshufd m5, m3, q0321 ; 5 6 _ _
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ punpcklwd m3, m5 ; 45 56
+.hv_w2_loop:
+ movu m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu m5, [srcq+ssq*0]
+ pshufb m4, m9
+ pshufb m5, m9
+ pmaddwd m4, m7
+ pmaddwd m5, m7
+ phaddd m4, m5
+ pmaddwd m5, m11, m1 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m12 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m13 ; a2 b2
+ paddd m5, m3
+ paddd m4, m6
+ psrad m4, 10 ; 7 8
+ packssdw m0, m4
+ pshufd m3, m0, q2103
+ punpckhwd m3, m0 ; 67 78
+ mova m0, m4
+ pmaddwd m4, m14, m3 ; a3 b3
+ paddd m5, m6
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ pxor m4, m4
+ pminsw m5, m15
+ pmaxsw m5, m4
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q3232
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+.hv_w4:
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovb myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ mov dstq, dstmp
+ mov dsq, dsmp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ ALLOC_STACK -16*15
+ mova m8, m0
+ mova m9, m1
+ mova m14, m6
+%else
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m3, 8
+ test dword r8m, 0x800
+ jz .hv_w4_10bpc
+ psraw m0, 2
+ psllw m3, 2
+.hv_w4_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+%if ARCH_X86_32
+ %define tmp esp+16*8
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcmp, srcq
+%endif
+ mova [tmp+16*5], m4
+ lea wd, [wq+hq-(1<<16)]
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-104 ; red zone
+%endif
+ shl wd, 6
+ mov r7, srcq
+ mov r8, dstq
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ mova [tmp+16*5], m15
+%endif
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
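+; horizontal 8-tap pass shared by the hv paths: filters one row from two
+; overlapping loads (%1 = src+0, %2 = src+8) using the coefficient pairs in
+; m10-m13, adds the rounding constant %5 and arithmetic-shifts right by %4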
+%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
+ pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
+ pshufb m%1, m9 ; 2 3 3 4 4 5 5 6
+ pmaddwd m%3, m10
+ pmaddwd m%1, m11
+ paddd m%3, %5
+ paddd m%1, m%3
+ pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
+ pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
+ pmaddwd m%3, m12
+ pmaddwd m%2, m13
+ paddd m%1, m%3
+ paddd m%1, m%2
+ psrad m%1, %4
+%endmacro
+.hv_w4_loop0:
+%if ARCH_X86_64
+ mova m14, [pd_512]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ movu m6, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 4, 1, 0, 10
+ PUT_8TAP_HV_H 5, 2, 0, 10
+ PUT_8TAP_HV_H 6, 3, 0, 10
+ movu m7, [srcq+ssq*0+0]
+ movu m2, [srcq+ssq*0+8]
+ movu m1, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ PUT_8TAP_HV_H 7, 2, 0, 10
+ PUT_8TAP_HV_H 1, 3, 0, 10
+ movu m2, [srcq+ssq*2+0]
+ movu m3, [srcq+ssq*2+8]
+ add srcq, r6
+ PUT_8TAP_HV_H 2, 3, 0, 10
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 10
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_w4_loop_start
+.hv_w4_loop:
+ mova m1, [tmp+16*6]
+ mova m2, m15
+.hv_w4_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*6], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 9
+ psrad m2, 9
+ packssdw m1, m2
+ pxor m7, m7
+ pmaxsw m1, m7
+ pavgw m7, m1
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+%if STACK_ALIGNMENT < 16
+ mov srcq, [esp+4*61]
+ mov dstq, [esp+4*62]
+ add srcq, 8
+ add dstq, 8
+ mov [esp+4*61], srcq
+ mov [esp+4*62], dstq
+%else
+ mov srcq, srcmp
+ mov dstq, dstmp
+ add srcq, 8
+ add dstq, 8
+ mov srcmp, srcq
+ mov dstmp, dstq
+%endif
+ movzx hd, ww
+ sub wd, 1<<16
+%else
+.hv_w4_loop:
+ mova m15, [tmp+16*1]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 9
+ psrad m15, 9
+ packssdw m14, m15
+ pxor m7, m7
+ pmaxsw m14, m7
+ pavgw m7, m14
+ pminsw m7, [tmp+16*5]
+ movq [dstq+dsq*0], m7
+ movhps [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ add r7, 8
+ add r8, 8
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+%endif
+ jg .hv_w4_loop0
+ RET
+%undef tmp
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 2, 1, 6, 4
+%elif WIN64
+DECLARE_REG_TMP 6, 4, 7, 4
+%else
+DECLARE_REG_TMP 6, 7, 7, 8
+%endif
+
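+; prep_8tap: stores unclipped intermediate values to the tmp buffer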
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my
+%define mxb r0b
+%define mxd r0
+%define mxq r0
+%define myb r2b
+%define myd r2
+%define myq r2
+%else
+cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
+%endif
+%define base t2-prep_ssse3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ LEA t2, prep_ssse3
+ movifnidn wd, wm
+ movifnidn srcq, srcmp
+ test mxd, 0xf00
+ jnz .h
+ movifnidn hd, hm
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov myd, r7m ; bitdepth_max
+ movzx wd, word [base+prep_ssse3_table+wq*2]
+ mova m5, [base+pw_8192]
+ shr myd, 11
+ add wq, t2
+ movddup m4, [base+prep_mul+myq*8]
+ movifnidn ssq, ssmp
+ movifnidn tmpq, tmpmp
+ lea r6, [ssq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ movifnidn ssq, r2mp
+ movifnidn hd, r4m
+ movddup m5, [base+prep_8tap_1d_rnd]
+ cmp wd, 4
+ jne .h_w8
+ movzx mxd, mxb
+ movq m0, [base+subpel_filters+mxq*8]
+ mova m3, [base+spel_h_shufA]
+ mova m4, [base+spel_h_shufB]
+ movifnidn tmpq, tmpmp
+ sub srcq, 2
+ WIN64_SPILL_XMM 8
+ punpcklbw m0, m0
+ psraw m0, 8
+ test dword r7m, 0x800
+ jnz .h_w4_12bpc
+ psllw m0, 2
+.h_w4_12bpc:
+ pshufd m6, m0, q1111
+ pshufd m7, m0, q2222
+.h_w4_loop:
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
+ pshufb m1, m4 ; 2 3 3 4 4 5 5 6
+ pmaddwd m0, m6
+ pmaddwd m1, m7
+ paddd m0, m5
+ paddd m0, m1
+ pshufb m1, m2, m3
+ pshufb m2, m4
+ pmaddwd m1, m6
+ pmaddwd m2, m7
+ paddd m1, m5
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq], m0
+ add tmpq, 16
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ WIN64_SPILL_XMM 11
+ shr mxd, 16
+ movq m2, [base+subpel_filters+mxq*8]
+ mova m4, [base+spel_h_shufA]
+ mova m6, [base+spel_h_shufB]
+ movifnidn tmpq, r0mp
+ add wd, wd
+ punpcklbw m2, m2
+ add srcq, wq
+ psraw m2, 8
+ add tmpq, wq
+ neg wq
+ test dword r7m, 0x800
+ jnz .h_w8_12bpc
+ psllw m2, 2
+.h_w8_12bpc:
+ pshufd m7, m2, q0000
+%if ARCH_X86_32
+ ALLOC_STACK -16*3
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+%else
+ pshufd m8, m2, q1111
+ pshufd m9, m2, q2222
+ pshufd m10, m2, q3333
+%endif
+.h_w8_loop0:
+ mov r6, wq
+.h_w8_loop:
+ movu m0, [srcq+r6- 6]
+ movu m1, [srcq+r6+ 2]
+ pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4
+ pshufb m0, m6 ; 2 3 3 4 4 5 5 6
+ pmaddwd m2, m7 ; abcd0
+ pmaddwd m0, m8 ; abcd1
+ pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8
+ pshufb m1, m6 ; 6 7 7 8 8 9 9 a
+ paddd m2, m5
+ paddd m0, m2
+ pmaddwd m2, m9, m3 ; abcd2
+ pmaddwd m3, m7 ; efgh0
+ paddd m0, m2
+ pmaddwd m2, m10, m1 ; abcd3
+ pmaddwd m1, m8 ; efgh1
+ paddd m0, m2
+ movu m2, [srcq+r6+10]
+ paddd m3, m5
+ paddd m1, m3
+ pshufb m3, m2, m4 ; a b b c c d d e
+ pshufb m2, m6 ; 8 9 9 a a b b c
+ pmaddwd m3, m9 ; efgh2
+ pmaddwd m2, m10 ; efgh3
+ paddd m1, m3
+ paddd m1, m2
+ psrad m0, 4
+ psrad m1, 4
+ packssdw m0, m1
+ mova [tmpq+r6], m0
+ add r6, 16
+ jl .h_w8_loop
+ add srcq, ssq
+ sub tmpq, wq
+ dec hd
+ jg .h_w8_loop0
+ RET
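+; vertical-only filtering path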
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ WIN64_SPILL_XMM 15
+ movddup m7, [base+prep_8tap_1d_rnd]
+ movifnidn ssq, r2mp
+ movifnidn tmpq, r0mp
+ punpcklbw m3, m3
+ psraw m3, 8 ; sign-extend
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m3, 2
+.v_12bpc:
+%if ARCH_X86_32
+ ALLOC_STACK -16*7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m8, m0
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+%else
+ pshufd m8, m3, q0000
+ pshufd m9, m3, q1111
+ pshufd m10, m3, q2222
+ pshufd m11, m3, q3333
+%endif
+ lea r6, [ssq*3]
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_64
+ mov r7, tmpq
+%elif STACK_ALIGNMENT < 16
+ mov [esp+4*29], tmpq
+%endif
+ lea wd, [wq+hq-(1<<8)]
+.v_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m3, [srcq+ssq*0]
+ movq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m5, [srcq+ssq*0]
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq m0, [srcq+ssq*0]
+ punpcklwd m1, m2 ; 01
+ punpcklwd m2, m3 ; 12
+ punpcklwd m3, m4 ; 23
+ punpcklwd m4, m5 ; 34
+ punpcklwd m5, m6 ; 45
+ punpcklwd m6, m0 ; 56
+%if ARCH_X86_32
+ jmp .v_loop_start
+.v_loop:
+ mova m1, m12
+ mova m2, m13
+ mova m3, m14
+.v_loop_start:
+ pmaddwd m1, m8 ; a0
+ pmaddwd m2, m8 ; b0
+ mova m12, m3
+ mova m13, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m1, m3
+ paddd m2, m4
+ mova m14, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m3, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m1, m7
+ paddd m1, m3
+ pmaddwd m3, m11, m6 ; b3
+ paddd m2, m7
+ paddd m2, m3
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*29]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*29], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.v_loop:
+ pmaddwd m12, m8, m1 ; a0
+ pmaddwd m13, m8, m2 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m9 ; a1
+ pmaddwd m4, m9 ; b1
+ paddd m12, m3
+ paddd m13, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m10 ; a2
+ pmaddwd m6, m10 ; b2
+ paddd m12, m5
+ paddd m13, m6
+ movq m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m5, m0, m6 ; 67
+ movq m0, [srcq+ssq*0]
+ pmaddwd m14, m11, m5 ; a3
+ punpcklwd m6, m0 ; 78
+ paddd m12, m7
+ paddd m12, m14
+ pmaddwd m14, m11, m6 ; b3
+ paddd m13, m7
+ paddd m13, m14
+ psrad m12, 4
+ psrad m13, 4
+ packssdw m12, m13
+ movq [tmpq+r6*0], m12
+ movhps [tmpq+r6*2], m12
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .v_loop0
+ RET
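+; combined horizontal+vertical (2-d) filtering path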
+.hv:
+%if STACK_ALIGNMENT < 16
+ %xdefine rstk rsp
+%else
+ %assign stack_offset stack_offset - stack_size_padded
+%endif
+ movzx t3d, mxb
+ shr mxd, 16
+ cmp wd, 4
+ cmove mxd, t3d
+ movifnidn hd, r4m
+ movq m2, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ movq m3, [base+subpel_filters+myq*8]
+%if ARCH_X86_32
+ mov ssq, r2mp
+ mov tmpq, r0mp
+ mova m0, [base+spel_h_shufA]
+ mova m1, [base+spel_h_shufB]
+ mova m4, [base+prep_8tap_2d_rnd]
+ ALLOC_STACK -16*14
+ mova m8, m0
+ mova m9, m1
+ mova m14, m4
+%else
+%if WIN64
+ ALLOC_STACK 16*6, 16
+%endif
+ mova m8, [base+spel_h_shufA]
+ mova m9, [base+spel_h_shufB]
+%endif
+ pxor m0, m0
+ punpcklbw m0, m2
+ punpcklbw m3, m3
+ psraw m0, 4
+ psraw m3, 8
+ test dword r7m, 0x800
+ jz .hv_10bpc
+ psraw m0, 2
+.hv_10bpc:
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ mov r6d, wd
+ shl wd, 6
+ mov r5, srcq
+%if ARCH_X86_32
+ %define tmp esp+16*8
+%if STACK_ALIGNMENT < 16
+ mov [esp+4*61], tmpq
+%endif
+ pshufd m1, m0, q0000
+ pshufd m2, m0, q1111
+ pshufd m5, m0, q2222
+ pshufd m0, m0, q3333
+ mova m10, m1
+ mova m11, m2
+ mova m12, m5
+ mova m13, m0
+%else
+%if WIN64
+ %define tmp rsp
+%else
+ %define tmp rsp-88 ; red zone
+%endif
+ mov r7, tmpq
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ lea wd, [wq+hq-(1<<8)]
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [tmp+16*1], m0
+ mova [tmp+16*2], m1
+ mova [tmp+16*3], m2
+ mova [tmp+16*4], m3
+.hv_loop0:
+%if ARCH_X86_64
+ mova m14, [prep_8tap_2d_rnd]
+%endif
+ movu m4, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ movu m5, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m6, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 4, 1, 0, 6
+ PUT_8TAP_HV_H 5, 2, 0, 6
+ PUT_8TAP_HV_H 6, 3, 0, 6
+ movu m7, [srcq+ssq*1+0]
+ movu m2, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ movu m1, [srcq+ssq*0+0]
+ movu m3, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 7, 2, 0, 6
+ PUT_8TAP_HV_H 1, 3, 0, 6
+ movu m2, [srcq+ssq*1+0]
+ movu m3, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 2, 3, 0, 6
+ packssdw m4, m7 ; 0 3
+ packssdw m5, m1 ; 1 4
+ movu m0, [srcq+ssq*0+0]
+ movu m1, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 1, 3, 6
+ packssdw m6, m2 ; 2 5
+ packssdw m7, m0 ; 3 6
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+%if ARCH_X86_32
+ jmp .hv_loop_start
+.hv_loop:
+ mova m1, [tmp+16*5]
+ mova m2, m15
+.hv_loop_start:
+ mova m7, [tmp+16*1]
+ pmaddwd m1, m7 ; a0
+ pmaddwd m2, m7 ; b0
+ mova m7, [tmp+16*2]
+ mova [tmp+16*5], m3
+ pmaddwd m3, m7 ; a1
+ mova m15, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m1, m14
+ paddd m2, m14
+ paddd m1, m3
+ paddd m2, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m1, m5
+ paddd m2, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m1, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m2, m7 ; b3
+ psrad m1, 6
+ psrad m2, 6
+ packssdw m1, m2
+ movq [tmpq+r6*0], m1
+ movhps [tmpq+r6*2], m1
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+%if STACK_ALIGNMENT < 16
+ mov tmpq, [esp+4*61]
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov [esp+4*61], tmpq
+%else
+ mov tmpq, tmpmp
+ add r5, 8
+ add tmpq, 8
+ mov srcq, r5
+ mov tmpmp, tmpq
+%endif
+%else
+.hv_loop:
+ mova m15, [tmp+16*1]
+ mova m7, [prep_8tap_2d_rnd]
+ pmaddwd m14, m15, m1 ; a0
+ pmaddwd m15, m2 ; b0
+ paddd m14, m7
+ paddd m15, m7
+ mova m7, [tmp+16*2]
+ mova m1, m3
+ pmaddwd m3, m7 ; a1
+ mova m2, m4
+ pmaddwd m4, m7 ; b1
+ mova m7, [tmp+16*3]
+ paddd m14, m3
+ paddd m15, m4
+ mova m3, m5
+ pmaddwd m5, m7 ; a2
+ mova m4, m6
+ pmaddwd m6, m7 ; b2
+ paddd m14, m5
+ paddd m15, m6
+ movu m7, [srcq+ssq*1+0]
+ movu m5, [srcq+ssq*1+8]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd]
+ packssdw m0, m7 ; 6 7
+ mova [tmp+16*0], m0
+ movu m0, [srcq+ssq*0+0]
+ movu m5, [srcq+ssq*0+8]
+ PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd]
+ mova m6, [tmp+16*0]
+ packssdw m7, m0 ; 7 8
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, [tmp+16*4]
+ paddd m14, m7 ; a3
+ pmaddwd m7, m6, [tmp+16*4]
+ paddd m15, m7 ; b3
+ psrad m14, 6
+ psrad m15, 6
+ packssdw m14, m15
+ movq [tmpq+r6*0], m14
+ movhps [tmpq+r6*2], m14
+ lea tmpq, [tmpq+r6*4]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 8
+ add r7, 8
+ mov srcq, r5
+ mov tmpq, r7
+%endif
+ movzx hd, wb
+ sub wd, 1<<8
+ jg .hv_loop0
+ RET
+%undef tmp
+
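+; mov that is only emitted when assembling the prep variant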
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
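+; helpers for saving, restoring and re-aliasing numbered registers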
+%macro SAVE_REG 1
+ %xdefine r%1_save r%1
+ %xdefine r%1q_save r%1q
+ %xdefine r%1d_save r%1d
+ %if ARCH_X86_32
+ %define r%1m_save [rstk+stack_offset+(%1+1)*4]
+ %endif
+%endmacro
+
+%macro LOAD_REG 1
+ %xdefine r%1 r%1_save
+ %xdefine r%1q r%1q_save
+ %xdefine r%1d r%1d_save
+ %if ARCH_X86_32
+ %define r%1m r%1m_save
+ %endif
+ %undef r%1d_save
+ %undef r%1q_save
+ %undef r%1_save
+%endmacro
+
+%macro REMAP_REG 2-3
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+ %if ARCH_X86_32
+ %if %3 == 0
+ %xdefine r%1m r%2m
+ %else
+ %define r%1m [rstk+stack_offset+(%1+1)*4]
+ %endif
+ %endif
+%endmacro
+
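+; prep_8tap_scaled takes one pointer argument less than put_8tap_scaled, so
+; shift the register aliases down by one to let both variants share the code below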
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %if ARCH_X86_64
+ SAVE_REG 14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %else
+ SAVE_REG 5
+ %assign %%i 5
+ %rep 5
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j, 0
+ %assign %%i %%i-1
+ %endrep
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %if ARCH_X86_64
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 14
+ %else
+ %rep 4
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j, 1
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 5
+ %endif
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
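+; x86-32 w4 helper: filters two rows horizontally with the 4-tap filters and
+; stores the packed words at [stk+%1]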
+%if ARCH_X86_32
+ %macro MC_4TAP_SCALED_H 1 ; dst_mem
+ movu m7, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m5, [r4 +ssq*0]
+ movu m6, [r4 +ssq*1]
+ lea srcq, [srcq+ssq*2]
+ lea r4, [r4 +ssq*2]
+ REPX {pshufb x, m12}, m7, m2
+ REPX {pmaddwd x, m13}, m7, m2
+ REPX {pshufb x, m14}, m5, m6
+ REPX {pmaddwd x, m15}, m5, m6
+ phaddd m7, m5
+ phaddd m2, m6
+ mova m5, [esp+0x00]
+ movd m6, [esp+0x10]
+ paddd m7, m5
+ paddd m2, m5
+ psrad m7, m6
+ psrad m2, m6
+ packssdw m7, m2
+ mova [stk+%1], m7
+ %endmacro
+%endif
+
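+; horizontal pass for the w8+ scaled paths: gathers 8 pixels per column at the
+; precomputed offsets, applies the per-column 8-tap filters kept on the stack,
+; then rounds, shifts and packs the 8 results into one register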
+%if ARCH_X86_64
+ %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movu m%1, [srcq+ r4*2]
+ movu m%2, [srcq+ r6*2]
+ movu m%3, [srcq+ r7*2]
+ movu m%4, [srcq+ r9*2]
+ movu m%5, [srcq+r10*2]
+ movu m%6, [srcq+r11*2]
+ movu m%7, [srcq+r13*2]
+ movu m%8, [srcq+ rX*2]
+ add srcq, ssq
+ pmaddwd m%1, [stk+0x10]
+ pmaddwd m%2, [stk+0x20]
+ pmaddwd m%3, [stk+0x30]
+ pmaddwd m%4, [stk+0x40]
+ pmaddwd m%5, [stk+0x50]
+ pmaddwd m%6, [stk+0x60]
+ pmaddwd m%7, [stk+0x70]
+ pmaddwd m%8, [stk+0x80]
+ phaddd m%1, m%2
+ phaddd m%3, m%4
+ phaddd m%5, m%6
+ phaddd m%7, m%8
+ phaddd m%1, m%3
+ phaddd m%5, m%7
+ paddd m%1, hround
+ paddd m%5, hround
+ psrad m%1, m12
+ psrad m%5, m12
+ packssdw m%1, m%5
+ %endmacro
+%else
+ %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
+ %if %3 == 1
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ %endif
+ movu m0, [srcq+r0*2]
+ movu m1, [srcq+rX*2]
+ movu m2, [srcq+r4*2]
+ movu m3, [srcq+r5*2]
+ mov r0, [stk+16]
+ mov rX, [stk+20]
+ mov r4, [stk+24]
+ mov r5, [stk+28]
+ pmaddwd m0, [stk+%1+0x00]
+ pmaddwd m1, [stk+%1+0x10]
+ pmaddwd m2, [stk+%1+0x20]
+ pmaddwd m3, [stk+%1+0x30]
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m4, [srcq+r0*2]
+ movu m5, [srcq+rX*2]
+ movu m6, [srcq+r4*2]
+ movu m7, [srcq+r5*2]
+ add srcq, ssq
+ pmaddwd m4, [stk+%1+0xa0]
+ pmaddwd m5, [stk+%1+0xb0]
+ pmaddwd m6, [stk+%1+0xc0]
+ pmaddwd m7, [stk+%1+0xd0]
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m0, m2
+ phaddd m4, m6
+ paddd m0, hround
+ paddd m4, hround
+ psrad m0, m12
+ psrad m4, m12
+ packssdw m0, m4
+ %if %2 != 0
+ mova [stk+%2], m0
+ %endif
+ %endmacro
+%endif
+
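+; emits put_8tap_scaled_16bpc or prep_8tap_scaled_16bpc depending on %1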
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isput 1
+ %assign isprep 0
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %endif
+ %xdefine base_reg r12
+%else ; prep
+ %assign isput 0
+ %assign isprep 1
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %define tmp_stridem qword [stk+0x138]
+ %endif
+ %xdefine base_reg r11
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %else
+cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
+ %endif
+ %define tmp_stridem dword [stk+0x138]
+ %endif
+%endif
+%if ARCH_X86_32
+ mov [esp+0x1f0], t0d
+ mov [esp+0x1f4], t1d
+ %if isput && required_stack_alignment > STACK_ALIGNMENT
+ mov dstd, dstm
+ mov dsd, dsm
+ mov srcd, srcm
+ mov ssd, ssm
+ mov hd, hm
+ mov r4, mxm
+ %define r0m [esp+0x200]
+ %define dsm [esp+0x204]
+ %define dsmp dsm
+ %define r1m dsm
+ %define r2m [esp+0x208]
+ %define ssm [esp+0x20c]
+ %define r3m ssm
+ %define hm [esp+0x210]
+ %define mxm [esp+0x214]
+ mov r0m, dstd
+ mov dsm, dsd
+ mov r2m, srcd
+ mov ssm, ssd
+ mov hm, hd
+ mov r0, mym
+ mov r1, dxm
+ mov r2, dym
+ %define mym [esp+0x218]
+ %define dxm [esp+0x21c]
+ %define dym [esp+0x220]
+ mov mxm, r4
+ mov mym, r0
+ mov dxm, r1
+ mov dym, r2
+ tzcnt wd, wm
+ %endif
+ %if isput
+ mov r3, pxmaxm
+ %define pxmaxm r3
+ %else
+ mov r2, pxmaxm
+ %endif
+ %if isprep && required_stack_alignment > STACK_ALIGNMENT
+ %xdefine base_reg r5
+ %else
+ %xdefine base_reg r6
+ %endif
+%endif
+ LEA base_reg, %1_8tap_scaled_16bpc_ssse3
+%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
+%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
+ tzcnt wd, wm
+%endif
+%if ARCH_X86_64
+ %if isput
+ mov r7d, pxmaxm
+ %endif
+%else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+%endif
+ movd m8, dxm
+ movd m14, mxm
+%if isput
+ movd m15, pxmaxm
+%endif
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isput
+ pshuflw m15, m15, q0000
+ punpcklqdq m15, m15
+%endif
+%if isprep
+ %if UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+ %endif
+ %if ARCH_X86_64
+ mov r6d, pxmaxm
+ %endif
+%endif
+%if ARCH_X86_64
+ mov dyd, dym
+%endif
+%if isput
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %elif ARCH_X86_64
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %else
+ %endif
+ %if ARCH_X86_64
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+ %else
+ %define rX r1
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %elif ARCH_X86_64
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %xdefine hm r7m
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if ARCH_X86_64
+ %define rX r14
+ %define rXd r14d
+ %else
+ %define rX r3
+ %endif
+%endif
+%if ARCH_X86_64
+ shr r7d, 11
+ mova m10, [base+pd_0x3ff]
+ movddup m11, [base+s_8tap_h_rnd+r7*8]
+ movd m12, [base+s_8tap_h_sh+r7*4]
+ %if isput
+ movddup m13, [base+put_s_8tap_v_rnd+r7*8]
+ movd m7, [base+put_s_8tap_v_sh+r7*4]
+ %define pxmaxm [rsp]
+ mova pxmaxm, m15
+ punpcklqdq m12, m7
+ %endif
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ shr r3, 11
+ movddup m1, [base+s_8tap_h_rnd+r3*8]
+ movd m2, [base+s_8tap_h_sh+r3*4]
+ %if isput
+ %define m13 [esp+0x20]
+ %define pxmaxm [esp+0x30]
+ %define stk esp+0x40
+ movddup m5, [base+put_s_8tap_v_rnd+r3*8]
+ movd m6, [base+put_s_8tap_v_sh+r3*4]
+ mova pxmaxm, m15
+ punpcklqdq m2, m6
+ mova m13, m5
+ %else
+ %define m13 [base+pd_m524256]
+ %endif
+ mov ssd, ssm
+ mova m11, m1
+ mova m12, m2
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ mov r1, [esp+0x1f4]
+ lea r0, [ssd*3]
+ movzx r2, r1b
+ shr r1, 16
+ cmp dword hm, 6
+ cmovs r1, r2
+ mov [esp+0x1f4], r1
+ %if isprep
+ mov r1, r1m
+ %endif
+ mov r2, r2m
+ sub srcq, r0
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define ss3q r0
+ %define myd r4
+ %define dyd dword dym
+ %define hd dword hm
+%endif
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6, m7
+ REPX {pmaddwd x, m15}, m4, m5, m6, m7
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m7
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6 7
+ SWAP m1, m4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m14}, m1, m7, m6, m3
+ REPX {pmaddwd x, m15}, m1, m7, m6, m3
+ phaddd m1, m7
+ phaddd m6, m3
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+.w2_loop:
+ and myd, 0x3ff
+ %if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pmaddwd m5, m3, m7
+ pmaddwd m6, m0, m8
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m7, m2, m9
+ pmaddwd m8, m4, m10
+ paddd m5, m6
+ paddd m7, m8
+ %else
+ mov r1, [esp+0x1f4]
+ xor r3, r3
+ mov r5, myd
+ shr r5, 6
+ lea r1, [r1+r5]
+ mov r5, 64 << 24
+ cmovnz r3, [base+subpel_filters+r1*8+4]
+ cmovnz r5, [base+subpel_filters+r1*8+0]
+ movd m6, r3
+ movd m7, r5
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m5, m7, q0000
+ pshufd m6, m7, q1111
+ pmaddwd m3, m5
+ pmaddwd m0, m6
+ pshufd m5, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m2, m5
+ pmaddwd m4, m7
+ paddd m3, m0
+ paddd m2, m4
+ SWAP m5, m3
+ SWAP m7, m2
+ %define m8 m3
+ %endif
+ paddd m5, m13
+ pshufd m6, m12, q1032
+ pxor m8, m8
+ paddd m5, m7
+ psrad m5, m6
+ packssdw m5, m5
+ pmaxsw m5, m8
+ pminsw m5, pxmaxm
+ movd [dstq], m5
+ add dstq, dsmp
+ dec hd
+ jz .ret
+ %if ARCH_X86_64
+ add myd, dyd
+ %else
+ add myd, dym
+ %endif
+ test myd, ~0x3ff
+ %if ARCH_X86_32
+ SWAP m3, m5
+ SWAP m2, m7
+ mova m3, [stk+0x20]
+ mova m0, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m4, [stk+0x50]
+ %endif
+ jz .w2_loop
+ %if ARCH_X86_32
+ mov r3, r3m
+ %endif
+ movu m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddwd m5, m15
+ phaddd m5, m5
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+.w2_skip_line:
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pshufb m6, m14
+ pmaddwd m5, m15
+ pmaddwd m6, m15
+ phaddd m5, m6
+ paddd m5, m11
+ psrad m5, m12
+ packssdw m5, m5 ; 6 7 6 7
+ punpckhqdq m1, m5 ; 4 5 6 7
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ mova [stk+0x40], m2
+ mova [stk+0x50], m4
+ %endif
+ jmp .w2_loop
+%endif
+INIT_XMM ssse3
+.w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ movzx r4, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%else
+ %define m9 [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ movifprep r3, r3m
+ SWAP m4, m7
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x20], m13
+ mova [stk+0x30], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m2, m3, m4
+ REPX {pmaddwd x, m15}, m1, m2, m3, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m4, [srcq+ss3q ]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ movu m11, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m2, m3, m4
+ REPX {pmaddwd x, m13}, m1, m2, m3, m4
+ REPX {pshufb x, m14}, m0, m9, m10, m11
+ REPX {pmaddwd x, m15}, m0, m9, m10, m11
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ phaddd m4, m11
+ REPX {paddd x, m5}, m1, m2, m3, m4
+ REPX {psrad x, xm6}, m1, m2, m3, m4
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m4 ; 6 7
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ pshufd m10, m3, q1032 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m10 ; 67
+ mova [rsp+0x40], m7
+ mova [rsp+0x50], m8
+ mova [rsp+0x60], m9
+%else
+ mova [stk+0x00], m12
+ mova [stk+0x10], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x40 ; 0 1
+ MC_4TAP_SCALED_H 0x50 ; 2 3
+ MC_4TAP_SCALED_H 0x60 ; 4 5
+ MC_4TAP_SCALED_H 0x70 ; 6 7
+ mova m4, [stk+0x40]
+ mova m5, [stk+0x50]
+ mova m6, [stk+0x60]
+ mova m7, [stk+0x70]
+ mov [stk+0xc0], r4
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ pshufd m0, m7, q1032 ; 7 _
+ mova [stk+0xb0], m0
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ punpcklwd m3, m7, [stk+0xb0] ; 67
+ mov myd, mym
+ mov r0, r0m
+ mova [stk+0x40], m0 ; 01
+ mova [stk+0x50], m1 ; 23
+ mova [stk+0x60], m2 ; 45
+ mova [stk+0x70], m3 ; 67
+ mova [stk+0x80], m4 ; 12
+ mova [stk+0x90], m5 ; 34
+ mova [stk+0xa0], m6 ; 56
+ %define m12 [stk+0x00]
+ %define m14 [stk+0x10]
+ %define m13 [stk+0x20]
+ %define m15 [stk+0x30]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%endif
+.w4_loop:
+ and myd, 0x3ff
+%if ARCH_X86_64
+ mov r11d, 64 << 24
+ mov r13d, myd
+ shr r13d, 6
+ lea r13d, [t1+r13]
+ cmovnz r11q, [base+subpel_filters+r13*8]
+ movq m9, r11q
+ punpcklbw m9, m9
+ psraw m9, 8
+ pshufd m7, m9, q0000
+ pshufd m8, m9, q1111
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pshufd m7, m9, q2222
+ pshufd m9, m9, q3333
+ pmaddwd m6, m2, m7
+ pmaddwd m8, m3, m9
+ %if isput
+ movd m9, [rsp+0x28]
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ paddd m4, m5
+ paddd m6, m8
+ paddd m4, m6
+ paddd m4, vrnd_mem
+%else
+ mov mym, myd
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ %if isput
+ movd m4, [esp+0x18]
+ %endif
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, vrnd_mem
+ paddd m0, m2
+ SWAP m4, m0
+ %define m9 m0
+%endif
+%if isput
+ pxor m5, m5
+ psrad m4, m9
+ packssdw m4, m4
+ pmaxsw m4, m5
+ pminsw m4, pxmaxm
+ movq [dstq], m4
+ add dstq, dsmp
+%else
+ psrad m4, 6
+ packssdw m4, m4
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ mova m8, [rsp+0x10]
+ movd m9, [rsp+0x20]
+ movu m4, [srcq]
+ movu m5, [srcq+r4]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova m0, [rsp+0x40]
+ mova [rsp+0x40], m1
+ mova m1, [rsp+0x50]
+ mova [rsp+0x50], m2
+ mova m2, [rsp+0x60]
+ mova [rsp+0x60], m3
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, m8
+ psrad m4, m9
+ packssdw m4, m4
+ punpcklwd m3, m10, m4
+ mova m10, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [srcq+r6]
+ mova m0, [rsp+0x50]
+ mova m11, [rsp+0x60]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [rsp+0x40], m0
+ mova [rsp+0x50], m11
+ phaddd m4, m5
+ phaddd m6, m7
+ paddd m4, m8
+ paddd m6, m8
+ psrad m4, m9
+ psrad m6, m9
+ packssdw m4, m6
+ punpcklwd m9, m10, m4
+ mova [rsp+0x60], m9
+ pshufd m10, m4, q1032
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m10
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%else
+ SWAP m0, m4
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ jnz .w4_next_line
+ mova m0, [stk+0x40]
+ mova m1, [stk+0x50]
+ mova m2, [stk+0x60]
+ mova m3, [stk+0x70]
+ jmp .w4_loop
+.w4_next_line:
+ mov r5, [stk+0xc0]
+ movu m4, [srcq]
+ movu m5, [r5]
+ test myd, 0x400
+ jz .w4_skip_line
+ add [stk+0xc0], ssq
+ mova m0, [stk+0x80]
+ mova m3, [stk+0x50]
+ mova [stk+0x40], m0
+ mova [stk+0x80], m3
+ mova m1, [stk+0x90]
+ mova m6, [stk+0x60]
+ mova [stk+0x50], m1
+ mova [stk+0x90], m6
+ mova m2, [stk+0xa0]
+ mova m7, [stk+0x70]
+ mova [stk+0x60], m2
+ mova [stk+0xa0], m7
+ pshufb m4, m12
+ pshufb m5, m14
+ pmaddwd m4, m13
+ pmaddwd m5, m15
+ phaddd m4, m5
+ paddd m4, hrnd_mem
+ psrad m4, hsh_mem
+ packssdw m4, m4
+ punpcklwd m3, [stk+0xb0], m4
+ mova [stk+0xb0], m4
+ mova [stk+0x70], m3
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu m6, [srcq+ssq*1]
+ movu m7, [r5 +ssq*1]
+ lea r5, [r5 +ssq*2]
+ mov [stk+0xc0], r5
+ mova m0, [stk+0x50]
+ mova m1, [stk+0x60]
+ mova m2, [stk+0x70]
+ mova m3, [stk+0x90]
+ pshufb m4, m12
+ pshufb m6, m12
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddwd m4, m13
+ pmaddwd m6, m13
+ pmaddwd m5, m15
+ pmaddwd m7, m15
+ mova [stk+0x40], m0
+ mova [stk+0x50], m1
+ mova [stk+0x60], m2
+ mova [stk+0x80], m3
+ phaddd m4, m5
+ phaddd m6, m7
+ mova m5, [stk+0xa0]
+ mova m7, [stk+0xb0]
+ paddd m4, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m4, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m4, m6
+ punpcklwd m7, m4
+ pshufd m6, m4, q1032
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m6
+ punpcklwd m3, m4, m6
+ mova [stk+0x70], m3
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+%endif
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
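+; w >= 8 is processed in strips of 8 columns; [stk+0xf0] counts remaining strips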
+.w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ %define hround m11
+ shr t0d, 16
+ movd m15, t0d
+ %if isprep
+ mova m13, [base+pd_m524256]
+ %endif
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r4, [esp+0x1f0]
+ shr r4, 16
+ movd m15, r4
+ mov r0, r0m
+ mov myd, mym
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ mov r5, hm
+ mov [stk+0x0f4], myd
+ mov [stk+0x134], r5
+%endif
+ jmp .hloop
+.hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov myd, [stk+0x0f4]
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov mym, myd
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m10
+ pmaddwd m7, [stk+0xa0], m10
+ pmaddwd m8, [stk+0xb0], m11
+ pmaddwd m9, [stk+0xc0], m11
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov myd, mym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.vloop:
+ mov r0, r0m
+ mov r5, [esp+0x1f4]
+ and myd, 0x3ff
+ mov mym, myd
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [stk+0x140], myd
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m10, [srcq+r13*2]
+ movu m11, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m10, [stk+0x70]
+ pmaddwd m11, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m10, m11
+ mova m11, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m10
+ phaddd m4, m6
+ paddd m4, m11
+ paddd m8, m11
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m5, [stk+0x90], m14 ; 4a 5a
+ pshufb m6, [stk+0xa0], m14 ; 4b 5b
+ pshufb m7, [stk+0xb0], m15 ; 7a 6a
+ pshufb m8, [stk+0xc0], m15 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m8
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
+ mov myd, [stk+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ mov mym, myd
+ jnz .next_line
+ mova m0, [stk+0x20]
+ mova m1, [stk+0x30]
+ mova m2, [stk+0x40]
+ mova m3, [stk+0x50]
+ jmp .vloop
+.next_line:
+ test myd, 0x400
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ jz .skip_line
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ punpcklwd m5, m6
+ mov myd, mym
+ mova [stk+0x90], m5
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mov myd, mym
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+%endif
+ jmp .vloop
+INIT_XMM ssse3
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy1_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m13 [esp+0x20]
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ mov r1, r1m
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2, m3
+ REPX {pmaddwd x, m15}, m0, m1, m2, m3
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m2, m3
+ phaddd m4, m5
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m4, m6
+ REPX {psrad x, m12}, m0, m2, m4, m6
+ packssdw m0, m2 ; 0 1 2 3
+ packssdw m4, m6 ; 4 5 6
+ SWAP m1, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m2, m3
+ movu m1, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ REPX {pshufb x, m14}, m1, m7, m6
+ REPX {pmaddwd x, m15}, m1, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m1, m7
+ phaddd m6, m6
+ REPX {paddd x, m11}, m0, m2, m1, m6
+ REPX {psrad x, m12}, m0, m2, m1, m6
+ packssdw m0, m2
+ packssdw m1, m6
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m4, m1, q2121 ; 5 6 5 6
+ punpcklwd m2, m1, m4 ; 45 56
+ %if ARCH_X86_32
+ mov r0, r0m
+ %endif
+.dy1_w2_loop:
+ movu m1, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m7
+ mova m3, m0
+ pmaddwd m0, m8
+ pshufb m1, m14
+ pshufb m6, m14
+ pmaddwd m1, m15
+ pmaddwd m6, m15
+ phaddd m1, m6
+ paddd m1, m11
+ psrad m1, m12
+ packssdw m1, m1
+ paddd m5, m0
+ mova m0, m2
+ pmaddwd m2, m9
+ paddd m5, m2
+ palignr m2, m1, m4, 12
+ punpcklwd m2, m1 ; 67 78
+ pmaddwd m4, m2, m10
+ paddd m5, m13
+ paddd m5, m4
+ pxor m6, m6
+ mova m4, m1
+ pshufd m1, m12, q1032
+ psrad m5, m1
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy1_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd rX, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ %if isprep
+ mov r3, r3m
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ movu m1, [srcq+r4 ]
+ movu m3, [srcq+r6 ]
+ movu m2, [srcq+r11 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m7, m9, m8, m10
+ REPX {pmaddwd x, m13}, m7, m9, m8, m10
+ REPX {pshufb x, m14}, m1, m3, m2, m4
+ REPX {pmaddwd x, m15}, m1, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m7, m1
+ phaddd m9, m3
+ phaddd m8, m2
+ phaddd m10, m4
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ REPX {paddd x, m5}, m7, m9, m8, m10
+ REPX {psrad x, xm6}, m7, m9, m8, m10
+ packssdw m7, m9 ; 0 1
+ packssdw m8, m10 ; 2 3
+ movu m0, [srcq+r4 ]
+ movu m9, [srcq+r6 ]
+ movu m10, [srcq+r11 ]
+ add srcq, ss3q
+ REPX {pshufb x, m12}, m1, m2, m3
+ REPX {pmaddwd x, m13}, m1, m2, m3
+ REPX {pshufb x, m14}, m0, m9, m10
+ REPX {pmaddwd x, m15}, m0, m9, m10
+ phaddd m1, m0
+ phaddd m2, m9
+ phaddd m3, m10
+ shr myd, 6
+ mov r13d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r13q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m1, m2, m3
+ REPX {psrad x, xm6}, m1, m2, m3
+ packssdw m1, m2 ; 4 5
+ packssdw m3, m3 ; 6 6
+ SWAP m9, m1
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ movq m10, r13
+ mova [stk+0x00], m1
+ mova [stk+0x10], m8
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+ mova [stk+0x40], m3
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ movu m7, [srcq]
+ movu m2, [r4]
+ add srcq, ssq
+ add r4, ssq
+ mov [stk+0xb0], r4
+ pshufb m7, m12
+ pshufb m2, m14
+ pmaddwd m7, m13
+ pmaddwd m2, m15
+ phaddd m7, m2
+ paddd m7, [esp+0x00]
+ psrad m7, [esp+0x10]
+ packssdw m7, m7 ; 6 6
+ mova m4, [stk+0x60]
+ mova m5, [stk+0x70]
+ mova m6, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ shufps m1, m4, m5, q1032 ; 1 2
+ shufps m2, m5, m6, q1032 ; 3 4
+ shufps m3, m6, m7, q1032 ; 5 6
+ mova [stk+0xa0], m7
+ punpcklwd m0, m4, m1 ; 01
+ punpckhwd m4, m1 ; 12
+ punpcklwd m1, m5, m2 ; 23
+ punpckhwd m5, m2 ; 34
+ punpcklwd m2, m6, m3 ; 45
+ punpckhwd m6, m3 ; 56
+ movd m7, r4
+ movd m3, r5
+ mov r0, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xb0]
+ mova [stk+0xc0], m4 ; 12
+ mova [stk+0x60], m1 ; 23
+ mova [stk+0x70], m2 ; 45
+ mova [stk+0x80], m5 ; 34
+ mova [stk+0x90], m6 ; 56
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m3
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+ mova m7, [stk+0xc0]
+ mova m8, [stk+0x80]
+%endif
+.dy1_w4_loop:
+ movu m11, [srcq+ssq*0]
+ movu m6, [srcq+ssq*1]
+ pmaddwd m0, m3
+ pmaddwd m7, m3
+ pmaddwd m1, m4
+ pmaddwd m8, m4
+ pmaddwd m2, m5
+ pmaddwd m9, m5
+ paddd m1, m0
+ paddd m8, m7
+%if ARCH_X86_64
+ movu m0, [srcq+r4]
+ movu m7, [srcq+r6]
+%else
+ movu m0, [r4+ssq*0]
+ movu m7, [r4+ssq*1]
+ lea r4, [r4+ssq*2]
+%endif
+ lea srcq, [srcq+ssq*2]
+ paddd m1, m2
+ paddd m8, m9
+ pshufb m11, m12
+ pshufb m6, m12
+ pmaddwd m11, m13
+ pmaddwd m6, m13
+ pshufb m0, m14
+ pshufb m7, m14
+ pmaddwd m0, m15
+ pmaddwd m7, m15
+ phaddd m11, m0
+ phaddd m6, m7
+ paddd m11, hrnd_mem
+ paddd m6, hrnd_mem
+ psrad m11, hsh_mem
+ psrad m6, hsh_mem
+ packssdw m11, m6 ; 7 8
+%if ARCH_X86_64
+ shufps m9, [stk+0x40], m11, q1032 ; 6 7
+ mova m0, [stk+0x00]
+ mova [stk+0x40], m11
+%else
+ shufps m9, [stk+0xa0], m11, q1032 ; 6 7
+ mova m0, [stk+0x60]
+ mova [stk+0xa0], m11
+%endif
+ punpcklwd m2, m9, m11 ; 67
+ punpckhwd m9, m11 ; 78
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m9, m10
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m1, vrnd_mem
+ paddd m8, vrnd_mem
+ paddd m1, m6
+ paddd m8, m7
+%if ARCH_X86_64
+ mova m7, [stk+0x10]
+%else
+ mova m7, [stk+0x80]
+%endif
+%if isput
+ psrad m1, m11
+ psrad m8, m11
+%else
+ psrad m1, 6
+ psrad m8, 6
+%endif
+ packssdw m1, m8
+%if ARCH_X86_64
+ mova m8, [stk+0x30]
+%else
+ mova m8, [stk+0x90]
+%endif
+%if isput
+ pxor m6, m6
+ pmaxsw m1, m6
+ pminsw m1, pxmaxm
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m1
+ add tmpq, 16
+%endif
+%if ARCH_X86_64
+ mova m1, [stk+0x20]
+ mova [stk+0x10], m8
+ mova [stk+0x00], m1
+ mova [stk+0x20], m2
+ mova [stk+0x30], m9
+%else
+ mova m1, [stk+0x70]
+ mova [stk+0x80], m8
+ mova [stk+0x60], m1
+ mova [stk+0x70], m2
+ mova [stk+0x90], m9
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET ; why not jz .ret?
+INIT_XMM ssse3
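+; Widths 8..128 share .dy1_w_start: the block is processed in 8-pixel-wide
+; columns, with [stk+0xf0] counting the remaining columns and .dy1_hloop_prep
+; stepping to the next one.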
+.dy1_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy1_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy1_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy1_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+%if ARCH_X86_64
+ movu m8, [srcq+r10*2]
+ movu m9, [srcq+r11*2]
+ movu m12, [srcq+r13*2]
+ movu m13, [srcq+ rX*2]
+ movu m4, [srcq+ r4*2]
+ movu m5, [srcq+ r6*2]
+ movu m6, [srcq+ r7*2]
+ movu m7, [srcq+ r9*2]
+ add srcq, ssq
+ pmaddwd m8, [stk+0x50]
+ pmaddwd m9, [stk+0x60]
+ pmaddwd m12, [stk+0x70]
+ pmaddwd m13, [stk+0x80]
+ pmaddwd m4, [stk+0x10]
+ pmaddwd m5, [stk+0x20]
+ pmaddwd m6, [stk+0x30]
+ pmaddwd m7, [stk+0x40]
+ phaddd m8, m9
+ phaddd m12, m13
+ mova m9, [base+unpckw]
+ mova m13, hround
+ phaddd m4, m5
+ phaddd m6, m7
+ phaddd m8, m12
+ phaddd m4, m6
+ pshufd m5, m9, q1032
+ pshufb m0, m9 ; 0a 1a
+ pshufb m1, m9 ; 0b 1b
+ pshufb m2, m5 ; 3a 2a
+ pshufb m3, m5 ; 3b 2b
+ mova m12, shift
+ paddd m4, m13
+ paddd m8, m13
+ psrad m4, m12
+ psrad m8, m12
+ packssdw m4, m8
+ pshufb m6, [stk+0x90], m9 ; 4a 5a
+ pshufb m7, [stk+0xa0], m9 ; 4b 5b
+ pshufb m8, [stk+0xb0], m5 ; 7a 6a
+ pshufb m13, [stk+0xc0], m5 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m7 ; 34b
+ punpckhwd m6, m8 ; 56a
+ punpckhwd m7, m13 ; 56b
+ punpcklwd m8, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m13, m4 ; 78b
+ mova [stk+0x90], m6
+ mova [stk+0xa0], m7
+ mova [stk+0xb0], m8
+ mova [stk+0xc0], m13
+ mova m13, vround
+%else
+ mov r0m, r0
+ mov r3, r3m
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
+ mova m7, [base+unpckw]
+ pshufd m4, m7, q1032
+ pshufb m0, [stk+0x20], m7 ; 0a 1a
+ pshufb m1, [stk+0x30], m7 ; 0b 1b
+ pshufb m2, [stk+0x40], m4 ; 3a 2a
+ pshufb m3, [stk+0x50], m4 ; 3b 2b
+ pshufb m5, [stk+0x60], m7 ; 4a 5a
+ pshufb m6, [stk+0x70], m7 ; 4b 5b
+ pshufb m7, [stk+0x80], m4 ; 7a 6a
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ punpckhwd m5, m7 ; 56a
+ mova [stk+0x60], m5
+ pshufb m5, [stk+0x90], m4 ; 7b 6b
+ punpcklwd m7, [stk+0xe0] ; 78a
+ mova m4, [stk+0x180]
+ punpckhwd m6, m5 ; 56b
+ mova [stk+0x70], m6
+ movq m6, [stk+0xe8]
+ mova [stk+0x80], m7
+ mova m7, [stk+0x1b0]
+ punpcklwd m5, m6
+ mova m6, [stk+0x1a0]
+ mova [stk+0x90], m5
+ mova m5, [stk+0x190]
+ mov r0, r0m
+%endif
+ jmp .dy1_vloop
+INIT_XMM ssse3
+%if ARCH_X86_64
+ %define stk rsp+0x20
+%endif
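+; .dy2: specialized path for dy == 2 << 10 (the vertical position advances by
+; two source rows per output row), so two new rows are filtered per output row.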
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%if isput
+.dy2_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m13
+ %define vrnd_mem [rsp+0x10]
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ %define m11 [esp+0x00]
+ %define m12 [esp+0x10]
+ %define vrnd_mem [esp+0x20]
+ mov r1, r1m
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+ %endif
+ pxor m9, m9
+ punpckldq m9, m8
+ paddd m14, m9 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ pshufd m15, m15, q0321
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_q]
+ mova m6, [base+spel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m2, m2
+ pcmpeqd m8, m2
+ psrld m14, 10
+ paddd m14, m14
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [stk], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m15 m6
+ %endif
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+ movu m2, [srcq+ssq*4]
+ punpckldq m15, m7
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ pand m9, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m9
+ movu m4, [srcq+ssq*1]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ %else
+ pand m7, m5, [base+pd_0x4000]
+ pandn m5, m15
+ por m5, m7
+ %define m15 m5
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ mov [stk+0x20], r3
+ mov r3, r3m
+ %endif
+ punpcklbw m15, m15
+ psraw m15, 8
+ REPX {pshufb x, m14}, m0, m1, m2
+ REPX {pmaddwd x, m15}, m0, m1, m2
+ %if ARCH_X86_64
+ REPX {pshufb x, m14}, m4, m5, m6
+ REPX {pmaddwd x, m15}, m4, m5, m6
+ phaddd m0, m1
+ phaddd m1, m2
+ phaddd m4, m5
+ phaddd m5, m6
+ REPX {paddd x, m11}, m0, m1, m4, m5
+ REPX {psrad x, m12}, m0, m1, m4, m5
+ packssdw m0, m1 ; 0 2 2 4
+ packssdw m4, m5 ; 1 3 3 5
+ SWAP m2, m4
+ movq m10, r4
+ %else
+ mova [stk+0x10], m15
+ phaddd m0, m1
+ phaddd m1, m2
+ movu m2, [srcq+ssq*1]
+ movu m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m6, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m14}, m2, m7, m6
+ REPX {pmaddwd x, m15}, m2, m7, m6
+ %define m14 [stk+0x00]
+ %define m15 [stk+0x10]
+ phaddd m2, m7
+ phaddd m7, m6
+ REPX {paddd x, m11}, m0, m1, m2, m7
+ REPX {psrad x, m12}, m0, m1, m2, m7
+ packssdw m0, m1
+ packssdw m2, m7
+ %define m8 m6
+ %define m9 m4
+ %define m10 m5
+ movd m10, r4
+ movd m9, [stk+0x20]
+ punpckldq m10, m9
+ %endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ %if ARCH_X86_32
+ mova [stk+0x50], m7
+ mova [stk+0x60], m8
+ mova [stk+0x70], m9
+ mova [stk+0x80], m10
+ %xdefine m13 m7
+ %define m7 [stk+0x50]
+ %define m8 [stk+0x60]
+ %define m9 [stk+0x70]
+ %define m10 [stk+0x80]
+ %endif
+ punpcklwd m1, m0, m2 ; 01 23
+ punpckhwd m3, m0, m2 ; 23 45
+ %if ARCH_X86_32
+ mov r4, r0m
+ %define dstq r4
+ mova [stk+0x20], m3
+ mova [stk+0x30], m0
+ %endif
+.dy2_w2_loop:
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ movu m13, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m3, m8
+ REPX {pshufb x, m14}, m4, m5, m6, m13
+ REPX {pmaddwd x, m15}, m4, m5, m6, m13
+ phaddd m4, m5
+ phaddd m6, m13
+ pmaddwd m5, m1, m7
+ paddd m4, m11
+ paddd m6, m11
+ psrad m4, m12
+ psrad m6, m12
+ packssdw m4, m6 ; 6 7 8 9
+ paddd m5, m3
+ pshufd m3, m4, q2200
+ pshufd m4, m4, q3311
+ palignr m3, m0, 12 ; 4 6 6 8
+ palignr m4, m2, 12 ; 5 7 7 9
+ mova m0, m3
+ mova m2, m4
+ punpcklwd m1, m3, m4
+ punpckhwd m3, m4
+ pmaddwd m6, m1, m9
+ pmaddwd m4, m3, m10
+ paddd m5, vrnd_mem
+ paddd m6, m4
+ paddd m5, m6
+ pshufd m4, m12, q1032
+ pxor m6, m6
+ psrad m5, m4
+ packssdw m5, m5
+ pmaxsw m5, m6
+ pminsw m5, pxmaxm
+ movd [dstq+dsq*0], m5
+ pshuflw m5, m5, q1032
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy2_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %if isput
+ mova [rsp+0x30], m13
+ %define vrnd_mem [rsp+0x30]
+ %define stk rsp+0x40
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %define stk rsp+0x30
+ %endif
+ movzx t0d, t0b
+ sub srcq, 2
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m9 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r5, byte [esp+0x1f0]
+ sub srcq, 2
+ movd m15, r5
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m9, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ pshufd m7, m15, q1032
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r6d, m15
+ movd r13d, m7
+ mova m10, [base+bdct_lb_q+ 0]
+ mova m11, [base+bdct_lb_q+16]
+ movd m13, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+ r6*8+2]
+ movd m15, [base+subpel_filters+r11*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r1, m15
+ movd r4, m7
+ pshufd m15, m15, q0321
+ pshufd m7, m7, q0321
+ movd r3, m15
+ movd r5, m7
+ mova m5, [base+bdct_lb_q+ 0]
+ mova m6, [base+bdct_lb_q+16]
+ movd m1, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ SWAP m4, m7
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+ %define m10 m5
+ %define m11 m6
+ %define m12 m1
+ %define m13 m1
+%endif
+ psrld m14, 10
+ paddd m14, m14
+ punpckldq m13, m2
+ punpckldq m15, m4
+ punpcklqdq m13, m15
+ pxor m2, m2
+ pcmpeqd m0, m2
+%if ARCH_X86_64
+ pand m9, m0
+%else
+ pand m2, m9, m0
+ %define m9 m2
+ SWAP m7, m4
+%endif
+ pandn m0, m13
+%if ARCH_X86_64
+ SWAP m13, m0
+%else
+ %define m13 m0
+%endif
+ por m13, m9
+ punpckhbw m15, m13, m13
+ punpcklbw m13, m13
+ psraw m15, 8
+ psraw m13, 8
+ pshufb m12, m14, m10
+ pshufb m14, m11
+ mova m10, [base+spel_s_shuf2]
+ movd r4d, m14
+ shr r4d, 24
+%if ARCH_X86_32
+ mova [stk+0x40], m13
+ mova [stk+0x50], m15
+ pxor m2, m2
+%endif
+ pshufb m7, m14, m2
+ psubb m14, m7
+ paddb m12, m10
+ paddb m14, m10
+%if ARCH_X86_64
+ lea r6, [r4+ssq*1]
+ lea r11, [r4+ssq*2]
+ lea r13, [r4+ss3q ]
+ movu m1, [srcq+ssq*0]
+ movu m8, [srcq+ssq*2]
+ movu m9, [srcq+ssq*1]
+ movu m10, [srcq+ss3q ]
+ movu m7, [srcq+r4 ]
+ movu m2, [srcq+r11 ]
+ movu m3, [srcq+r6 ]
+ movu m4, [srcq+r13 ]
+ lea srcq, [srcq+ssq*4]
+ REPX {pshufb x, m12}, m1, m9, m8, m10
+ REPX {pmaddwd x, m13}, m1, m9, m8, m10
+ REPX {pshufb x, m14}, m7, m3, m2, m4
+ REPX {pmaddwd x, m15}, m7, m3, m2, m4
+ mova m5, [rsp+0x10]
+ movd xm6, [rsp+0x20]
+ phaddd m1, m7
+ phaddd m8, m2
+ phaddd m9, m3
+ phaddd m10, m4
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ REPX {paddd x, m5}, m1, m9, m8, m10
+ REPX {psrad x, xm6}, m1, m9, m8, m10
+ packssdw m1, m8 ; 0 2
+ packssdw m9, m10 ; 1 3
+ movu m0, [srcq+r4 ]
+ movu m8, [srcq+r6 ]
+ lea srcq, [srcq+ssq*2]
+ REPX {pshufb x, m12}, m2, m3
+ REPX {pmaddwd x, m13}, m2, m3
+ REPX {pshufb x, m14}, m0, m8
+ REPX {pmaddwd x, m15}, m0, m8
+ phaddd m2, m0
+ phaddd m3, m8
+ shr myd, 6
+ mov r9d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r9q, [base+subpel_filters+myq*8]
+ REPX {paddd x, m5}, m2, m3
+ REPX {psrad x, xm6}, m2, m3
+ packssdw m2, m3 ; 4 5
+ pshufd m3, m2, q1032 ; 5 _
+ punpcklwd m0, m1, m9 ; 01
+ punpckhwd m1, m9 ; 23
+ punpcklwd m2, m3 ; 45
+ movq m10, r9
+ %define hrnd_mem [rsp+0x10]
+ %define hsh_mem [rsp+0x20]
+ %define vsh_mem [rsp+0x28]
+ %if isput
+ %define vrnd_mem [rsp+0x30]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+%else
+ mova [stk+0x20], m12
+ mova [stk+0x30], m14
+ add r4, srcq
+ MC_4TAP_SCALED_H 0x60 ; 0 1
+ MC_4TAP_SCALED_H 0x70 ; 2 3
+ MC_4TAP_SCALED_H 0x80 ; 4 5
+ mov [stk+0xe0], r4
+ mova m3, [base+spel_s_shuf8]
+ mova m0, [stk+0x60]
+ mova m1, [stk+0x70]
+ mova m2, [stk+0x80]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ pshufb m0, m3 ; 01
+ pshufb m1, m3 ; 23
+ pshufb m2, m3 ; 45
+ movd m7, r4
+ movd m4, r5
+ mov r5, r0m
+ %if isput
+ mov r1, r1m
+ %endif
+ mov r4, [stk+0xe0]
+ %define dstq r5
+ %define tmpq r5
+ %define m12 [stk+0x20]
+ %define m14 [stk+0x30]
+ %define m13 [stk+0x40]
+ %define m15 [stk+0x50]
+ %define hrnd_mem [esp+0x00]
+ %define hsh_mem [esp+0x10]
+ %define vsh_mem [esp+0x18]
+ %if isput
+ %define vrnd_mem [esp+0x20]
+ %else
+ %define vrnd_mem [base+pd_m524256]
+ %endif
+ %define m10 m7
+ punpckldq m10, m4
+%endif
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m3, m10, q0000
+ pshufd m4, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_32
+ %xdefine m8 m3
+ %xdefine m9 m6
+ %xdefine m11 m5
+ %xdefine m6 m4
+ mova [stk+0x100], m3
+ mova [stk+0x110], m4
+ mova [stk+0x120], m5
+ mova [stk+0x130], m10
+ %define m3 [stk+0x100]
+ %define m4 [stk+0x110]
+ %define m5 [stk+0x120]
+ %define m10 [stk+0x130]
+%endif
+.dy2_w4_loop:
+ pmaddwd m8, m0, m3
+ pmaddwd m9, m1, m3
+ mova m0, m2
+ pmaddwd m1, m4
+ pmaddwd m11, m2, m4
+ paddd m8, vrnd_mem
+ paddd m9, vrnd_mem
+ pmaddwd m2, m5
+ paddd m8, m1
+ paddd m9, m11
+ paddd m8, m2
+ movu m6, [srcq+ssq*0]
+ movu m1, [srcq+ssq*2]
+%if ARCH_X86_64
+ movu m11, [srcq+r4 ]
+ movu m2, [srcq+r11]
+%else
+ movu m11, [r4+ssq*0]
+ movu m2, [r4+ssq*2]
+%endif
+ pshufb m6, m12
+ pshufb m1, m12
+ pmaddwd m6, m13
+ pmaddwd m1, m13
+ pshufb m11, m14
+ pshufb m2, m14
+ pmaddwd m11, m15
+ pmaddwd m2, m15
+ phaddd m6, m11
+ phaddd m1, m2
+ paddd m6, hrnd_mem
+ paddd m1, hrnd_mem
+ psrad m6, hsh_mem
+ psrad m1, hsh_mem
+ movu m7, [srcq+ssq*1]
+ movu m11, [srcq+ss3q ]
+ packssdw m6, m1 ; 6 8
+%if ARCH_X86_64
+ movu m2, [srcq+r6 ]
+ movu m1, [srcq+r13]
+%else
+ movu m2, [r4+ssq*1]
+ movu m1, [r4+ss3q ]
+%endif
+ pshufb m7, m12
+ pshufb m11, m12
+ pmaddwd m7, m13
+ pmaddwd m11, m13
+ pshufb m2, m14
+ pshufb m1, m14
+ pmaddwd m2, m15
+ pmaddwd m1, m15
+ phaddd m7, m2
+ phaddd m11, m1
+ paddd m7, hrnd_mem
+ paddd m11, hrnd_mem
+ psrad m7, hsh_mem
+ psrad m11, hsh_mem
+ packssdw m7, m11 ; 7 9
+%if ARCH_X86_32
+ lea r4, [r4+ssq*4]
+%endif
+ lea srcq, [srcq+ssq*4]
+ punpcklwd m1, m6, m7 ; 67
+ punpckhwd m6, m7 ; 89
+ mova m2, m6
+ pmaddwd m11, m1, m5
+ pmaddwd m7, m1, m10
+ pmaddwd m6, m10
+ paddd m9, m11
+%if isput
+ movd m11, vsh_mem
+%endif
+ paddd m8, m7
+ paddd m9, m6
+%if isput
+ psrad m8, m11
+ psrad m9, m11
+ packssdw m8, m9
+ pxor m7, m7
+ pmaxsw m8, m7
+ pminsw m8, pxmaxm
+ movq [dstq+dsq*0], m8
+ movhps [dstq+dsq*1], m8
+ lea dstq, [dstq+dsq*2]
+%else
+ psrad m8, 6
+ psrad m9, 6
+ packssdw m8, m9
+ mova [tmpq], m8
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET ; why not jz .ret?
+INIT_XMM ssse3
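+; As with dy1, widths 8..128 are handled as 8-pixel-wide columns via
+; .dy2_w_start, with [stk+0xf0] holding the column count.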
+.dy2_w8:
+ mov dword [stk+0xf0], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [stk+0xf0], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [stk+0xf0], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [stk+0xf0], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [stk+0xf0], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%if ARCH_X86_64
+ %ifidn %1, put
+ movifnidn dsm, dsq
+ %endif
+ mova [rsp+0x10], m11
+ mova [rsp+0x20], m12
+ %define hround m11
+ %if isput
+ mova [rsp+0x30], m13
+ %else
+ mova m13, [base+pd_m524256]
+ %endif
+ shr t0d, 16
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define hround [esp+0x00]
+ %define m12 [esp+0x10]
+ %define m10 [base+pd_0x3ff]
+ %define m8 m0
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isput
+ %define dstq r0
+ %else
+ %define tmpq r0
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ sub srcq, 6
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ SWAP m3, m5
+%endif
+ punpcklbw m3, m3
+ psraw m3, 8
+ mova [stk+0x100], m7
+ mova [stk+0x120], m15
+ mov [stk+0x0f8], srcq
+ mov [stk+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+%if ARCH_X86_64
+ mova [stk+0x140], m0
+ mova [stk+0x150], m1
+ mova [stk+0x160], m2
+ mova [stk+0x170], m3
+ %if UNIX64
+ mov hm, hd
+ %endif
+%else
+ mova [stk+0x180], m0
+ mova [stk+0x190], m1
+ mova [stk+0x1a0], m2
+ mova [stk+0x1b0], m3
+ SWAP m5, m3
+ mov r5, hm
+ mov [stk+0x134], r5
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [stk+0x0f0]
+ jz .ret
+%if ARCH_X86_64
+ add qword [stk+0x130], 16
+ mov hd, hm
+%else
+ add dword [stk+0x130], 16
+ mov r5, [stk+0x134]
+ mov r0, [stk+0x130]
+%endif
+ mova m7, [stk+0x100]
+ mova m14, [stk+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m11, [rsp+0x10]
+%endif
+ mova m15, [stk+0x120]
+ mov srcq, [stk+0x0f8]
+%if ARCH_X86_64
+ mov r0q, [stk+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy2_hloop:
+%if ARCH_X86_64
+ mova m9, [base+pq_0x40000000]
+%else
+ %define m9 [base+pq_0x40000000]
+%endif
+ pxor m1, m1
+ psrld m2, m14, 10
+ mova [stk], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m1
+ pshufd m2, m5, q1032
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ pshufd m5, m5, q0321
+ pshufd m2, m2, q0321
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pxor m2, m2
+ pcmpeqd m5, m2
+ mova [stk+0x110], m14
+ pshufd m4, m15, q1032
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ movq r11, m14
+ punpckhqdq m14, m14
+ movq rX, m14
+ mov r10d, r11d
+ shr r11, 32
+ mov r13d, rXd
+ shr rX, 32
+ mov r4d, [stk+ 0]
+ mov r6d, [stk+ 4]
+ mov r7d, [stk+ 8]
+ mov r9d, [stk+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m9, m4
+ pand m8, m9, m6
+ pand m15, m9, m14
+ pand m9, m9, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m9, m5
+ punpcklbw m0, m7, m7
+ punpckhbw m7, m7
+ punpcklbw m1, m8, m8
+ punpckhbw m8, m8
+ psraw m0, 8
+ psraw m7, 8
+ psraw m1, 8
+ psraw m8, 8
+ punpcklbw m2, m15, m15
+ punpckhbw m15, m15
+ punpcklbw m3, m9, m9
+ punpckhbw m9, m9
+ psraw m2, 8
+ psraw m15, 8
+ psraw m3, 8
+ psraw m9, 8
+ mova [stk+0x10], m0
+ mova [stk+0x20], m7
+ mova [stk+0x30], m1
+ mova [stk+0x40], m8
+ mova [stk+0x50], m2
+ mova [stk+0x60], m15
+ mova [stk+0x70], m3
+ mova [stk+0x80], m9
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
+ mova [stk+0x90], m1
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
+ mova [stk+0xa0], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
+ mova [stk+0xb0], m3
+ MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
+ mova [stk+0xc0], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
+ mova [stk+0xd0], m5
+ MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
+ MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
+ MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
+ mova m5, [stk+0xd0]
+ mova m1, [stk+0x90]
+ mova m2, [stk+0xa0]
+ mova m3, [stk+0xb0]
+ mova m9, [stk+0xc0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova m10, [stk+0x140]
+ mova m11, [stk+0x150]
+ mova m14, [stk+0x160]
+ mova m15, [stk+0x170]
+ mova [stk+0x90], m4
+ mova [stk+0xa0], m5
+ mova [stk+0xb0], m6
+ mova [stk+0xc0], m7
+ %define hround [rsp+0x10]
+ %define shift [rsp+0x20]
+ %if isput
+ %define vround [rsp+0x30]
+ %else
+ %define vround [base+pd_m524256]
+ %endif
+.dy2_vloop:
+ pmaddwd m4, m0, m10
+ pmaddwd m5, m1, m10
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m11
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [stk+0x90], m14
+ pmaddwd m7, [stk+0xa0], m14
+ pmaddwd m8, [stk+0xb0], m15
+ pmaddwd m9, [stk+0xc0], m15
+ paddd m4, m6
+ paddd m5, m7
+ %if isput
+ pshufd m6, m12, q1032
+ %endif
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ pshufd m15, m15, q0321
+ pshufd m4, m4, q0321
+ movd r4, m15
+ movd r5, m4
+ mova m14, [stk+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [stk+16], m14
+ mov r0, [stk+ 0]
+ mov rX, [stk+ 4]
+ mov r4, [stk+ 8]
+ mov r5, [stk+12]
+ mova [stk+0x20], m0
+ mova [stk+0x30], m1
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m9, m4
+ pand m1, m9, m6
+ pand m2, m9, m7
+ pand m3, m9, m5
+ pandn m4, [stk+0x20]
+ pandn m6, [stk+0x30]
+ pandn m7, [stk+0x40]
+ pandn m5, [stk+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ punpcklbw m4, m0, m0
+ punpckhbw m0, m0
+ punpcklbw m5, m1, m1
+ punpckhbw m1, m1
+ psraw m4, 8
+ psraw m0, 8
+ psraw m5, 8
+ psraw m1, 8
+ punpcklbw m6, m2, m2
+ punpckhbw m2, m2
+ punpcklbw m7, m3, m3
+ punpckhbw m3, m3
+ psraw m6, 8
+ psraw m2, 8
+ psraw m7, 8
+ psraw m3, 8
+ mova [stk+0x0a0], m4
+ mova [stk+0x0b0], m0
+ mova [stk+0x0c0], m5
+ mova [stk+0x0d0], m1
+ mova [stk+0x140], m6
+ mova [stk+0x150], m2
+ mova [stk+0x160], m7
+ mova [stk+0x170], m3
+ MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0
+ MC_8TAP_SCALED_H 0xa0, 0x30 ; 1
+ MC_8TAP_SCALED_H 0xa0, 0x40 ; 2
+ MC_8TAP_SCALED_H 0xa0, 0x50 ; 3
+ MC_8TAP_SCALED_H 0xa0, 0x60 ; 4
+ MC_8TAP_SCALED_H 0xa0, 0x70 ; 5
+ MC_8TAP_SCALED_H 0xa0, 0x80 ; 6
+ MC_8TAP_SCALED_H 0xa0, 0x90 ; 7
+ mova m5, [stk+0x60]
+ mova m6, [stk+0x70]
+ mova m7, [stk+0x80]
+ mova m0, [stk+0x90]
+ mov r0, r0m
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m1, [stk+0x20]
+ mova m2, [stk+0x30]
+ mova m3, [stk+0x40]
+ mova m4, [stk+0x50]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+.dy2_vloop:
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [stk+0x60], m6
+ pmaddwd m3, [stk+0x70], m6
+ pmaddwd m4, [stk+0x80], m7
+ pmaddwd m5, [stk+0x90], m7
+ %if isput
+ movd m6, [esp+0x18]
+ %endif
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, vrnd_mem
+ paddd m1, vrnd_mem
+ paddd m4, m0
+ paddd m5, m1
+%endif
+%ifidn %1, put
+ psrad m4, m6
+ psrad m5, m6
+ packssdw m4, m5
+ pxor m7, m7
+ pmaxsw m4, m7
+ pminsw m4, pxmaxm
+ mova [dstq], m4
+ add dstq, dsm
+%else
+ psrad m4, 6
+ psrad m5, 6
+ packssdw m4, m5
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+%if ARCH_X86_64
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
+ mova [stk+0xd0], m4
+ MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
+ mova m4, [stk+0xd0]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [stk+0x90] ; 23a
+ mova m3, [stk+0xa0] ; 23b
+ mova m5, [stk+0xb0] ; 45a
+ mova m6, [stk+0xc0] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [stk+0x90], m5
+ mova [stk+0xa0], m6
+ mova [stk+0xb0], m7
+ mova [stk+0xc0], m4
+%else
+ mov r0m, r0
+ mov r3, r3m
+ MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
+ MC_8TAP_SCALED_H 0xa0, 0 ; 9
+ mova m7, [stk+0xe0]
+ mova m2, [stk+0x60] ; 23a
+ mova m3, [stk+0x70] ; 23b
+ mova m4, [stk+0x80] ; 45a
+ mova m5, [stk+0x90] ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova m0, [stk+0x40] ; 01a
+ mova m1, [stk+0x50] ; 01b
+ mova [stk+0x40], m2
+ mova [stk+0x50], m3
+ mova [stk+0x60], m4
+ mova [stk+0x70], m5
+ mova m4, [stk+0x180]
+ mova m5, [stk+0x190]
+ mova [stk+0x80], m6
+ mova [stk+0x90], m7
+ mova m6, [stk+0x1a0]
+ mova m7, [stk+0x1b0]
+ mov r0, r0m
+%endif
+ jmp .dy2_vloop
+INIT_XMM ssse3
+.ret:
+ MC_8TAP_SCALED_RET 0
+%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
+ %define r0m [rstk+stack_offset+ 4]
+ %define r1m [rstk+stack_offset+ 8]
+ %define r2m [rstk+stack_offset+12]
+ %define r3m [rstk+stack_offset+16]
+%endif
+%undef isput
+%undef isprep
+%endmacro
+
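+; Scaled bilinear MC reuses the 8tap scaled code: t0d/t1d select the bilinear
+; entry of the subpel filter table for both the horizontal and vertical filter.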
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_16bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 8
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 7
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 6
+%else
+DECLARE_REG_TMP 2
+%endif
+
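+; warp_affine_8x8t writes 8x8 blocks of intermediate (compound) output, while
+; warp_affine_8x8 below writes final, clipped pixels; both share .main/.h.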
+%if ARCH_X86_64
+; warp8x8t spills one less xmm register than warp8x8 on WIN64; compensate for
+; that by allocating 16 bytes more stack space so that stack offsets match up.
+%if WIN64 && STACK_ALIGNMENT == 16
+%assign stksz 16*14
+%else
+%assign stksz 16*13
+%endif
+cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+%assign stack_size_padded_8x8t stack_size_padded
+%else
+cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%define m8 [esp+16*13]
+%define m9 [esp+16*14]
+%define cntd dword [esp+4*63]
+%define dstq tmpq
+%define dsq 0
+%if STACK_ALIGNMENT < 16
+%define dstm [esp+4*65]
+%define dsm [esp+4*66]
+%else
+%define dstm r0m
+%define dsm r1m
+%endif
+%endif
+%define base filterq-$$
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8t_rnd]
+%else
+ movddup m1, [base+warp8x8t_rnd]
+ mov r1, r1m
+ add r1, r1
+ mova m8, m1
+ mov r1m, r1 ; ds *= 2
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*4]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*0], m1
+ call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ paddd m1, m8
+ paddd m2, m8
+ psrad m1, 15
+ psrad m2, 15
+ packssdw m1, m2
+ mova [dstq+dsq*2], m1
+ dec cntd
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
+ mx, tmp, alpha, beta, \
+ filter, my, gamma, cnt
+ASSERT stack_size_padded == stack_size_padded_8x8t
+%else
+cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
+ filter, mx, my
+%endif
+ mov t0d, r7m
+ LEA filterq, $$
+ shr t0d, 11
+%if ARCH_X86_64
+ movddup m8, [base+warp8x8_rnd2+t0*8]
+ movd m9, r7m ; pixel_max
+ pshufb m9, [base+pw_256]
+%else
+ movddup m1, [base+warp8x8_rnd2+t0*8]
+ movd m2, r7m ; pixel_max
+ pshufb m2, [base+pw_256]
+ mova m8, m1
+ mova m9, m2
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_64
+ lea dstq, [dstq+dsq*2]
+%else
+ add dstq, dsm
+ mov dstm, dstq
+%endif
+ call .main2
+.start:
+%if ARCH_X86_32
+ mov dstq, dstm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*0], m1
+ call .main3
+%if ARCH_X86_32
+ mov dstq, dstm
+ add dstq, dsm
+%endif
+ psrad m1, 16
+ psrad m2, 16
+ packssdw m1, m2
+ pmaxsw m1, m6
+ pmulhrsw m1, m8
+ pminsw m1, m9
+ mova [dstq+dsq*1], m1
+ dec cntd
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov deltaq, r5m
+ mov mxd, r6m
+%endif
+ movd m0, [base+warp8x8_shift+t0*4]
+ movddup m7, [base+warp8x8_rnd1+t0*8]
+ add filterq, mc_warp_filter-$$
+%if ARCH_X86_64
+ movsx alphad, word [deltaq+2*0]
+ movsx betad, word [deltaq+2*1]
+ movsx gammad, word [deltaq+2*2]
+ movsx deltad, word [deltaq+2*3]
+ lea tmpq, [ssq*3]
+ add mxd, 512+(64<<10)
+ sub srcq, tmpq ; src -= ss*3
+ imul tmpd, alphad, -7
+ mov myd, r7m
+ add betad, tmpd ; beta -= alpha*7
+ imul tmpd, gammad, -7
+ add myd, 512+(64<<10)
+ mov cntd, 4
+ add deltad, tmpd ; delta -= gamma*7
+%else
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset - gprsize
+%endif
+ mov r3d, r5m ; abcd
+%if STACK_ALIGNMENT < 16
+ mov r0, r1m ; dst
+ mov r1, r2m ; ds
+ mov [esp+gprsize+4*65], r0
+ mov [esp+gprsize+4*66], r1
+%endif
+ movsx alphad, word [r3+2*0]
+ movsx r2d, word [r3+2*1]
+ movsx gammad, word [r3+2*2]
+ movsx r3d, word [r3+2*3]
+ imul r5d, alphad, -7
+ add r2d, r5d ; beta -= alpha*7
+ imul r5d, gammad, -7
+ mov [esp+gprsize+4*60], r2d
+ add r3d, r5d ; delta -= gamma*7
+ mov [esp+gprsize+4*61], r3d
+ mov r3d, r4m ; ss
+ mov srcq, r3m
+ mov mxd, r6m
+ mov myd, r7m
+ mov dword [esp+gprsize+4*63], 4 ; cnt
+ mov [esp+gprsize+4*62], r3
+ lea r3, [r3*3]
+ add mxd, 512+(64<<10)
+ add myd, 512+(64<<10)
+ sub srcq, r3 ; src -= ss*3
+%if STACK_ALIGNMENT < 16
+ %assign stack_offset stack_offset + gprsize
+%endif
+%endif
+ mova [rsp+gprsize], m0
+ pxor m6, m6
+ call .h
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 01
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 1], m1
+ mova [rsp+gprsize+16* 4], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 12
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 7], m1
+ mova [rsp+gprsize+16*10], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 23
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 2], m1
+ mova [rsp+gprsize+16* 5], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 34
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 8], m1
+ mova [rsp+gprsize+16*11], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 45
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 3], m1
+ mova [rsp+gprsize+16* 6], m5
+ mova m5, m0
+ call .h
+ punpcklwd m1, m5, m0 ; 56
+ punpckhwd m5, m0
+ mova [rsp+gprsize+16* 9], m1
+ mova [rsp+gprsize+16*12], m5
+ mova m5, m0
+.main2:
+ call .h
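+; Vertical pass of the warp filter: each of the 8 columns gets its own 8-tap
+; filter selected from my, which advances by gamma per column and by delta
+; per row.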
+%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m4, [filterq+myq*8] ; a
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m2, [filterq+tmpq*8] ; b
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m3, [filterq+myq*8] ; c
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m1, [filterq+tmpq*8] ; d
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ punpcklwd m4, m2
+ punpcklwd m3, m1
+ punpckldq m2, m4, m3
+ punpckhdq m4, m3
+ punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
+ pmaddwd m1, [rsp+gprsize+16*%1]
+ punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
+ mova m2, [rsp+gprsize+16*%2]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%1], m2
+ paddd m1, m3
+ punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
+ mova m2, [rsp+gprsize+16*%3]
+ pmaddwd m3, m2
+ mova [rsp+gprsize+16*%2], m2
+ paddd m1, m3
+ punpcklwd m3, m5, m0 ; 67
+ punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
+ pmaddwd m2, m3
+ mova [rsp+gprsize+16*%3], m3
+ paddd m1, m2
+ movq m4, [filterq+myq*8] ; e
+ lea myd, [tmpq+gammaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8] ; f
+ lea tmpd, [myq+gammaq]
+ shr myd, 10
+ movq m2, [filterq+myq*8] ; g
+%if ARCH_X86_64
+ lea myd, [tmpq+deltaq] ; my += delta
+%else
+ mov myd, [esp+gprsize+4*61]
+ add myd, tmpd
+%endif
+ shr tmpd, 10
+ punpcklwd m4, m3
+ movq m3, [filterq+tmpq*8] ; h
+ punpcklwd m2, m3
+ punpckldq m3, m4, m2
+ punpckhdq m4, m2
+ punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
+ pmaddwd m2, [rsp+gprsize+16*%4]
+ punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
+ mova m3, [rsp+gprsize+16*%5]
+ pmaddwd m6, m3
+ mova [rsp+gprsize+16*%4], m3
+ pxor m3, m3
+ paddd m2, m6
+ punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
+ mova m6, [rsp+gprsize+16*%6]
+ pmaddwd m3, m6
+ mova [rsp+gprsize+16*%5], m6
+ punpckhwd m5, m0
+ pxor m6, m6
+ paddd m2, m3
+ punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
+ pmaddwd m3, m5
+ mova [rsp+gprsize+16*%6], m5
+ mova m5, m0
+ paddd m2, m3
+%endmacro
+ WARP_V 1, 2, 3, 4, 5, 6
+ ret
+.main3:
+ call .h
+ WARP_V 7, 8, 9, 10, 11, 12
+ ret
+ALIGN function_align
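+; Horizontal pass: filters 8 pixels of one source row, each with an 8-tap
+; filter chosen from mx; mx advances by alpha per pixel and by beta per row.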
+.h:
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ punpcklbw m0, m6, m3
+ movu m3, [srcq-6]
+ pmaddwd m0, m3 ; 0
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-4]
+ pmaddwd m2, m3 ; 1
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m0, m2 ; 0 1
+ punpcklbw m2, m6, m3
+ movu m3, [srcq-2]
+ pmaddwd m2, m3 ; 2
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+0]
+ pmaddwd m1, m3 ; 3
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m2, m1 ; 2 3
+ punpcklbw m1, m6, m3
+ movu m3, [srcq+2]
+ pmaddwd m1, m3 ; 4
+ lea mxd, [tmpq+alphaq]
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ phaddd m0, m2 ; 0 1 2 3
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+4]
+ pmaddwd m2, m3 ; 5
+ lea tmpd, [mxq+alphaq]
+ shr mxd, 10
+ movq m3, [filterq+mxq*8]
+ phaddd m1, m2 ; 4 5
+ punpcklbw m2, m6, m3
+ movu m3, [srcq+6]
+ pmaddwd m2, m3 ; 6
+%if ARCH_X86_64
+ lea mxd, [tmpq+betaq] ; mx += beta
+%else
+ mov mxd, [esp+gprsize*2+4*60]
+ add mxd, tmpd
+%endif
+ shr tmpd, 10
+ movq m3, [filterq+tmpq*8]
+ punpcklbw m4, m6, m3
+ movu m3, [srcq+8]
+%if ARCH_X86_64
+ add srcq, ssq
+%else
+ add srcq, [esp+gprsize*2+4*62]
+%endif
+ pmaddwd m3, m4 ; 7
+ phaddd m2, m3 ; 6 7
+ phaddd m1, m2 ; 4 5 6 7
+ paddd m0, m7
+ paddd m1, m7
+ psrad m0, [rsp+gprsize*2]
+ psrad m1, [rsp+gprsize*2]
+ packssdw m0, m1
+ ret
+
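+; BIDIR_FN: shared store loop for the compound functions below; each .main call
+; produces two registers (16 output pixels) which the per-width cases write out.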
+%macro BIDIR_FN 0
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.ret:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jne .w8_loop
+ RET
+.w16_loop:
+ call .main
+ add dstq, strideq
+.w16:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
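+; avg: dst = clip((tmp1 + tmp2) scaled back to pixel range); the bidir_rnd/
+; bidir_mul constants fold the rounding, intermediate offset and bitdepth shift.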
+cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
+%define base r6-avg_ssse3_table
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; pixel_max
+ movsxd wq, [r6+wq*4]
+ shr t0d, 11
+ movddup m2, [base+bidir_rnd+t0*8]
+ movddup m3, [base+bidir_mul+t0*8]
+ movifnidn hd, hm
+ add wq, r6
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m0, [tmp1q+16*0]
+ paddsw m0, [tmp2q+16*0]
+ mova m1, [tmp1q+16*1]
+ paddsw m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaxsw m0, m2
+ pmaxsw m1, m2
+ psubsw m0, m2
+ psubsw m1, m2
+ pmulhw m0, m3
+ pmulhw m1, m3
+ ret
+
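+; w_avg: per-pixel weighted blend, roughly dst = clip((tmp1*w + tmp2*(16-w))/16),
+; with the weight w taken from r6m.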
+cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
+%define base r6-w_avg_ssse3_table
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r6m ; weight
+ movd m6, r7m ; pixel_max
+ movddup m5, [base+pd_65538]
+ movsxd wq, [r6+wq*4]
+ pshufb m6, [base+pw_256]
+ add wq, r6
+ lea r6d, [t0-16]
+ shl t0d, 16
+ sub t0d, r6d ; 16-weight, weight
+ paddw m5, m6
+ mov r6d, t0d
+ shl t0d, 2
+ test dword r7m, 0x800
+ cmovnz r6d, t0d
+ movifnidn hd, hm
+ movd m4, r6d
+ pslld m5, 7
+ pxor m7, m7
+ pshufd m4, m4, q0000
+ BIDIR_FN
+ALIGN function_align
+.main:
+ mova m2, [tmp1q+16*0]
+ mova m0, [tmp2q+16*0]
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ mova m2, [tmp1q+16*1]
+ mova m1, [tmp2q+16*1]
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ pmaddwd m3, m4
+ pmaddwd m0, m4
+ paddd m3, m5
+ paddd m0, m5
+ psrad m3, 8
+ psrad m0, 8
+ packssdw m0, m3
+ punpckhwd m3, m1, m2
+ punpcklwd m1, m2
+ pmaddwd m3, m4
+ pmaddwd m1, m4
+ paddd m3, m5
+ paddd m1, m5
+ psrad m3, 8
+ psrad m1, 8
+ packssdw m1, m3
+ pminsw m0, m6
+ pminsw m1, m6
+ pmaxsw m0, m7
+ pmaxsw m1, m7
+ ret
+
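+; mask: blend with an explicit per-pixel 6-bit mask m,
+; dst = clip of (tmp1*m + tmp2*(64-m)) scaled back to pixel range.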
+%if ARCH_X86_64
+cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%else
+cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+%define hd dword r5m
+%define m8 [base+pw_64]
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ mov t0d, r7m ; pixel_max
+ shr t0d, 11
+ movsxd wq, [r6+wq*4]
+ movddup m6, [base+bidir_rnd+t0*8]
+ movddup m7, [base+bidir_mul+t0*8]
+%if ARCH_X86_64
+ mova m8, [base+pw_64]
+ movifnidn hd, hm
+%endif
+ add wq, r6
+ mov maskq, r6mp
+ BIDIR_FN
+ALIGN function_align
+.main:
+ movq m3, [maskq+8*0]
+ mova m0, [tmp1q+16*0]
+ mova m4, [tmp2q+16*0]
+ pxor m5, m5
+ punpcklbw m3, m5
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ psubw m1, m8, m3
+ punpckhwd m4, m3, m1 ; m, 64-m
+ punpcklwd m3, m1
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m0, m3
+ movq m3, [maskq+8*1]
+ mova m1, [tmp1q+16*1]
+ mova m4, [tmp2q+16*1]
+ add maskq, 8*2
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ psrad m2, 5
+ psrad m0, 5
+ packssdw m0, m2
+ punpcklbw m3, m5
+ punpckhwd m2, m1, m4
+ punpcklwd m1, m4
+ psubw m5, m8, m3
+ punpckhwd m4, m3, m5 ; m, 64-m
+ punpcklwd m3, m5
+ pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
+ pmaddwd m1, m3
+ psrad m2, 5
+ psrad m1, 5
+ packssdw m1, m2
+ pmaxsw m0, m6
+ pmaxsw m1, m6
+ psubsw m0, m6
+ psubsw m1, m6
+ pmulhw m0, m7
+ pmulhw m1, m7
+ ret
+
+cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_420_ssse3_table
+ LEA t0, w_mask_420_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m0, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+ %define m8 [rsp+gprsize+16*0]
+ %define m9 [rsp+gprsize+16*1]
+ %define m10 [rsp+gprsize+16*2]
+ %define m11 [rsp+gprsize+16*3]
+%endif
+ movd m7, [base+pw_2]
+ psubw m7, m0
+ pshufb m7, [base+pw_256]
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w4:
+ movq [dstq+strideq*0], m0
+ phaddw m2, m3
+ movhps [dstq+strideq*1], m0
+ phaddd m2, m2
+ lea dstq, [dstq+strideq*2]
+ paddw m2, m7
+ movq [dstq+strideq*0], m1
+ psrlw m2, 2
+ movhps [dstq+strideq*1], m1
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w8:
+ mova [dstq+strideq*0], m0
+ paddw m2, m3
+ phaddw m2, m2
+ mova [dstq+strideq*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movd [maskq], m2
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 8
+.w16:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*1], m1
+ paddw m2, m7
+ psrlw m2, 2
+ packuswb m2, m2
+ movq [maskq], m2
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ mova [dstq+strideq*1+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*1], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*0+16*2], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*0]
+ paddw m3, [dstq+strideq*1+16*1]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ paddw m2, [dstq+strideq*1+16*3]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq], m3
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*2
+.w64:
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*1+16*2], m3
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*2], m0
+ mova [dstq+strideq*1+16*4], m3
+ mova [dstq+strideq*0+16*3], m1
+ call .main
+ mova [dstq+strideq*1+16*5], m2
+ mova [dstq+strideq*0+16*4], m0
+ mova [dstq+strideq*1+16*6], m3
+ mova [dstq+strideq*0+16*5], m1
+ call .main
+ mova [dstq+strideq*0+16*6], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*7], m2
+ mova [dstq+strideq*0+16*7], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*1]
+ paddw m3, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*2], m2
+ mova [dstq+strideq*1+16*1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*3]
+ paddw m3, [dstq+strideq*1+16*4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*2]
+ mova [dstq+strideq*1+16*2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*5]
+ paddw m3, [dstq+strideq*1+16*6]
+ mova [dstq+strideq*1+16*4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*6], m2
+ mova [dstq+strideq*1+16*5], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*6]
+ paddw m2, [dstq+strideq*1+16*7]
+ mova [dstq+strideq*1+16*6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16*4
+.w128:
+ mova [dstq+strideq*1+16* 1], m2
+ mova [dstq+strideq*0+16* 0], m0
+ mova [dstq+strideq*1+16* 2], m3
+ mova [dstq+strideq*0+16* 1], m1
+ call .main
+ mova [dstq+strideq*1+16* 3], m2
+ mova [dstq+strideq*0+16* 2], m0
+ mova [dstq+strideq*1+16* 4], m3
+ mova [dstq+strideq*0+16* 3], m1
+ call .main
+ mova [dstq+strideq*1+16* 5], m2
+ mova [dstq+strideq*0+16* 4], m0
+ mova [dstq+strideq*1+16* 6], m3
+ mova [dstq+strideq*0+16* 5], m1
+ call .main
+ mova [dstq+strideq*1+16* 7], m2
+ mova [dstq+strideq*0+16* 6], m0
+ mova [dstq+strideq*1+16* 8], m3
+ mova [dstq+strideq*0+16* 7], m1
+ call .main
+ mova [dstq+strideq*1+16* 9], m2
+ mova [dstq+strideq*0+16* 8], m0
+ mova [dstq+strideq*1+16*10], m3
+ mova [dstq+strideq*0+16* 9], m1
+ call .main
+ mova [dstq+strideq*1+16*11], m2
+ mova [dstq+strideq*0+16*10], m0
+ mova [dstq+strideq*1+16*12], m3
+ mova [dstq+strideq*0+16*11], m1
+ call .main
+ mova [dstq+strideq*1+16*13], m2
+ mova [dstq+strideq*0+16*12], m0
+ mova [dstq+strideq*1+16*14], m3
+ mova [dstq+strideq*0+16*13], m1
+ call .main
+ mova [dstq+strideq*0+16*14], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*15], m2
+ mova [dstq+strideq*0+16*15], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 1]
+ paddw m3, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 0], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 2], m2
+ mova [dstq+strideq*1+16* 1], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 3]
+ paddw m3, [dstq+strideq*1+16* 4]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 2]
+ mova [dstq+strideq*1+16* 2], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 3], m1
+ packuswb m3, m2
+ mova [maskq+16*0], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 5]
+ paddw m3, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 4], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16* 6], m2
+ mova [dstq+strideq*1+16* 5], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16* 7]
+ paddw m3, [dstq+strideq*1+16* 8]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16* 6]
+ mova [dstq+strideq*1+16* 6], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16* 7], m1
+ packuswb m3, m2
+ mova [maskq+16*1], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16* 9]
+ paddw m3, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16* 8], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*10], m2
+ mova [dstq+strideq*1+16* 9], m1
+ call .main
+ paddw m2, [dstq+strideq*1+16*11]
+ paddw m3, [dstq+strideq*1+16*12]
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*10]
+ mova [dstq+strideq*1+16*10], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*11], m1
+ packuswb m3, m2
+ mova [maskq+16*2], m3
+ call .main
+ paddw m2, [dstq+strideq*1+16*13]
+ paddw m3, [dstq+strideq*1+16*14]
+ mova [dstq+strideq*1+16*12], m0
+ phaddw m2, m3
+ mova [dstq+strideq*1+16*14], m2
+ mova [dstq+strideq*1+16*13], m1
+ call .main
+ phaddw m2, m3
+ paddw m3, m7, [dstq+strideq*1+16*14]
+ paddw m2, [dstq+strideq*1+16*15]
+ mova [dstq+strideq*1+16*14], m0
+ paddw m2, m7
+ psrlw m3, 2
+ psrlw m2, 2
+ mova [dstq+strideq*1+16*15], m1
+ packuswb m3, m2
+ mova [maskq+16*3], m3
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+%macro W_MASK 2 ; dst/tmp_offset, mask
+ mova m%1, [tmp1q+16*%1]
+ mova m%2, [tmp2q+16*%1]
+ punpcklwd m4, m%2, m%1
+ punpckhwd m5, m%2, m%1
+ psubsw m%1, m%2
+ pabsw m%1, m%1
+ psubusw m6, m8, m%1
+ psrlw m6, 10 ; 64-m
+ psubw m%2, m9, m6 ; m
+ punpcklwd m%1, m6, m%2
+ punpckhwd m6, m%2
+ pmaddwd m%1, m4
+ pmaddwd m6, m5
+ psrad m%1, 5
+ psrad m6, 5
+ packssdw m%1, m6
+ pmaxsw m%1, m10
+ psubsw m%1, m10
+ pmulhw m%1, m11
+%endmacro
+ W_MASK 0, 2
+ W_MASK 1, 3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ ret
+
+cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_422_ssse3_table
+ LEA t0, w_mask_422_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ movd m7, r7m ; sign
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m4, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*4
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ mova [rsp+16*3], m4
+%endif
+ pxor m0, m0
+ add wq, t0
+ pshufb m7, m0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ phaddw m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ packuswb m2, m2
+ pxor m3, m3
+ psubb m2, m7
+ pavgb m2, m3
+ movq [maskq], m2
+ add maskq, 8
+ ret
+
+cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_444_ssse3_table
+ LEA t0, w_mask_444_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r8m ; pixel_max
+ shr r6d, 11
+ movsxd wq, [t0+wq*4]
+%if ARCH_X86_64
+ mova m8, [base+pw_27615]
+ mova m9, [base+pw_64]
+ movddup m10, [base+bidir_rnd+r6*8]
+ movddup m11, [base+bidir_mul+r6*8]
+%else
+ mova m1, [base+pw_27615]
+ mova m2, [base+pw_64]
+ movddup m3, [base+bidir_rnd+r6*8]
+ movddup m7, [base+bidir_mul+r6*8]
+ ALLOC_STACK -16*3
+ mova [rsp+16*0], m1
+ mova [rsp+16*1], m2
+ mova [rsp+16*2], m3
+ %define m11 m7
+%endif
+ add wq, t0
+ movifnidn hd, r5m
+ mov maskq, r6mp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+.end:
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ sub hd, 2
+ jg .w8_loop
+.w8_end:
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0+16*0], m0
+ mova [dstq+strideq*0+16*1], m1
+ call .main
+ mova [dstq+strideq*1+16*0], m0
+ mova [dstq+strideq*1+16*1], m1
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ call .main
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ call .main
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ call .main
+ mova [dstq+16*6], m0
+ mova [dstq+16*7], m1
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16* 0], m0
+ mova [dstq+16* 1], m1
+ call .main
+ mova [dstq+16* 2], m0
+ mova [dstq+16* 3], m1
+ call .main
+ mova [dstq+16* 4], m0
+ mova [dstq+16* 5], m1
+ call .main
+ mova [dstq+16* 6], m0
+ mova [dstq+16* 7], m1
+ call .main
+ mova [dstq+16* 8], m0
+ mova [dstq+16* 9], m1
+ call .main
+ mova [dstq+16*10], m0
+ mova [dstq+16*11], m1
+ call .main
+ mova [dstq+16*12], m0
+ mova [dstq+16*13], m1
+ call .main
+ mova [dstq+16*14], m0
+ mova [dstq+16*15], m1
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ W_MASK 0, 2
+ W_MASK 1, 3
+ packuswb m2, m3
+ add tmp1q, 16*2
+ add tmp2q, 16*2
+ mova [maskq], m2
+ add maskq, 16
+ ret
+
+; (a * (64 - m) + b * m + 32) >> 6
+; = (((b - a) * m + 32) >> 6) + a
+; = (((b - a) * (m << 9) + 16384) >> 15) + a
+; except m << 9 overflows int16_t when m == 64 (which is possible),
+; but if we negate m it works out (-64 << 9 == -32768).
+; = (((a - b) * (m * -512) + 16384) >> 15) + a
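+; worked example (illustrative): a = 100, b = 200, m = 16 -- all three forms agree:
+;   (100*48 + 200*16 + 32) >> 6                   = 8032 >> 6   = 125
+;   (((200 - 100)*16 + 32) >> 6) + 100            = 25 + 100    = 125
+;   (((100 - 200)*(16*-512) + 16384) >> 15) + 100 = 25 + 100    = 125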
+cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ movifnidn maskq, maskmp
+ mova m7, [base+pw_m512]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ pxor m6, m6
+ jmp wq
+.w4:
+ mova m5, [maskq]
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ movq m1, [dstq+strideq*2]
+ movhps m1, [dstq+stride3q ]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+stride3q ], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ mova m5, [maskq]
+ mova m0, [dstq+strideq*0]
+ mova m1, [dstq+strideq*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m5, [maskq]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ add maskq, 16
+ add tmpq, 32
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16
+ RET
+.w32:
+ mova m5, [maskq+16*0]
+ mova m0, [dstq+16*0]
+ mova m1, [dstq+16*1]
+ psubw m2, m0, [tmpq+16*0]
+ psubw m3, m1, [tmpq+16*1]
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova m5, [maskq+16*1]
+ mova m0, [dstq+16*2]
+ mova m1, [dstq+16*3]
+ psubw m2, m0, [tmpq+16*2]
+ psubw m3, m1, [tmpq+16*3]
+ add maskq, 32
+ add tmpq, 64
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ pmullw m4, m7
+ pmullw m5, m7
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*2], m0
+ mova [dstq+16*3], m1
+ add dstq, strideq
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp wq
+.w2:
+ movd m4, [base+obmc_masks+2*2]
+.w2_loop:
+ movd m0, [dstq+strideq*0]
+ movd m2, [tmpq+4*0]
+ movd m1, [dstq+strideq*1]
+ movd m3, [tmpq+4*1]
+ add tmpq, 4*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m2, [base+obmc_masks+4*2]
+.w4_loop:
+ movq m0, [dstq+strideq*0]
+ movhps m0, [dstq+strideq*1]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m4, [base+obmc_masks+8*2]
+.w8_loop:
+ mova m0, [dstq+strideq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+strideq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ mova m4, [base+obmc_masks+16*2]
+ movq m5, [base+obmc_masks+16*3]
+.w16_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ movaps [rsp+8], m6
+%endif
+ mova m4, [base+obmc_masks+16*4]
+ mova m5, [base+obmc_masks+16*5]
+ mova m6, [base+obmc_masks+16*6]
+.w32_loop:
+ mova m0, [dstq+16*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+16*1]
+ mova m3, [tmpq+16*1]
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m5
+ paddw m0, m2
+ mova m2, [dstq+16*2]
+ paddw m1, m3
+ mova m3, [tmpq+16*2]
+ add tmpq, 16*4
+ psubw m3, m2
+ pmulhrsw m3, m6
+ paddw m2, m3
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ add dstq, strideq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ movaps m6, [rsp+8]
+%endif
+ RET
+
+%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
+ mova m0, [dstq+16*(%1+0)]
+ mova m2, [tmpq+16*(%2+0)]
+ mova m1, [dstq+16*(%1+1)]
+ mova m3, [tmpq+16*(%2+1)]
+%if %3
+ add tmpq, 16*%3
+%endif
+ psubw m2, m0
+ psubw m3, m1
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+16*(%1+0)], m0
+ mova [dstq+16*(%1+1)], m1
+%endmacro
+
+cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_ssse3_table
+ LEA r6, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ movddup m4, [base+blend_shuf]
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ movd m3, [maskq+hq*2]
+ movq m1, [tmpq]
+ add tmpq, 4*2
+ punpckldq m0, m2
+ punpcklwd m3, m3
+ psubw m1, m0
+ pmulhrsw m1, m3
+ paddw m0, m1
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova m3, [base+blend_shuf]
+.w4_loop:
+ movq m0, [dstq+dsq*0]
+ movhps m0, [dstq+dsq*1]
+ movd m2, [maskq+hq*2]
+ mova m1, [tmpq]
+ add tmpq, 8*2
+ psubw m1, m0
+ pshufb m2, m3
+ pmulhrsw m1, m2
+ paddw m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movddup m5, [base+blend_shuf+8]
+%if WIN64
+ movaps [rsp+ 8], m6
+ movaps [rsp+24], m7
+%endif
+.w8_loop:
+ movd m7, [maskq+hq*2]
+ mova m0, [dstq+dsq*0]
+ mova m2, [tmpq+16*0]
+ mova m1, [dstq+dsq*1]
+ mova m3, [tmpq+16*1]
+ add tmpq, 16*2
+ pshufb m6, m7, m4
+ psubw m2, m0
+ pshufb m7, m5
+ psubw m3, m1
+ pmulhrsw m2, m6
+ pmulhrsw m3, m7
+ paddw m0, m2
+ paddw m1, m3
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+%if WIN64
+ movaps m6, [rsp+ 8]
+ movaps m7, [rsp+24]
+%endif
+ RET
+.w16:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0, 2
+ add dstq, dsq
+ inc hq
+ jl .w16
+ RET
+.w32:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2, 4
+ add dstq, dsq
+ inc hq
+ jl .w32
+ RET
+.w64:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 8
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ movd m5, [maskq+hq*2]
+ pshufb m5, m4
+ BLEND_H_ROW 0, 0
+ BLEND_H_ROW 2, 2
+ BLEND_H_ROW 4, 4
+ BLEND_H_ROW 6, 6, 16
+ BLEND_H_ROW 8, -8
+ BLEND_H_ROW 10, -6
+ BLEND_H_ROW 12, -4
+ BLEND_H_ROW 14, -2
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh:  total filled size
+; iw, ih:  copied block -> fill bottom, right
+; x, y:    offset in bw/bh -> fill top, left
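+; e.g. (illustrative) x = -3, y = 0 with iw and ih covering the block:
+; only left_ext = 3 is nonzero, so each output row starts with three copies
+; of its leftmost source pixel and the top/bottom extension loops are skipped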
+cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ lea reg_src, [reg_src+reg_tmp*2]
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; vloop Macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3*2], m0
+ add r3, mmsize/2
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq*2]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3*2]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3*2]
+ %endif
+%if %1
+ movu [reg_tmp+r3*2], m0
+%else
+ movu [dstq+r3*2], m0
+%endif
+ add r3, mmsize/2
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ lea reg_tmp, [reg_tmp+centerwq*2]
+%else
+ lea reg_tmp, [dstq+centerwq*2]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq*2-2]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq*2-2]
+ %endif
+ pshuflw m0, m0, q0000
+ punpcklqdq m0, m0
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3*2], m0
+ add r3, mmsize/2
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; vloop MACRO
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+ cmp leftextq, r3m ; leftextq == 0
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+ ; left/right extensions
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; r0 ; bw
+; r1 ;; x loop
+; r4 ;; y loop
+; r5 ; topextq
+; r6 ; dstq
+; r7 ; dstrideq
+; r8 ; srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1*2]
+ lea r3, [dstq+r1*2]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1*2]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1*2]
+%endif
+ lea r3, [dstq+r1*2]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize/2
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
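+; SCRATCH m%1 -> m%2: on x86-32 the register is spilled to stack slot %3 and
+; m%2 becomes a memory operand; on x86-64 it is simply renamed via SWAP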
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%else
+cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0, pxmax
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m4, pxmaxm
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ punpcklwd m4, m4
+ pshufd m4, m4, q0000
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+ mova [rsp+16*3*ARCH_X86_32], m4
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+ %define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+ %define hd dword r5m
+ %if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+ %define base r6-$$
+ %else
+ LEA r4, $$
+ %define base r4-$$
+ %endif
+%endif
+%if ARCH_X86_64
+ mova m12, [base+pd_64]
+ mova m11, [base+pd_63]
+%else
+ %define m12 [base+pd_64]
+ %define m11 [base+pd_63]
+%endif
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 15, 0
+ SCRATCH 6, 14, 1
+ SCRATCH 5, 13, 2
+ pxor m1, m1
+.loop_y:
+ xor xd, xd
+ mova m0, m14 ; per-line working version of mx
+.loop_x:
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m13, m1
+ pand m1, m3
+ pandn m3, m13
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m11 ; filter offset (masked)
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, m1
+ pshuflw m1, m1, q3232
+ movd r9d, m1
+ punpckhqdq m1, m1
+ movd r10d, m1
+ psrlq m1, 32
+ movd r11d, m1
+ movu m4, [srcq+r8*2]
+ movu m5, [srcq+r9*2]
+ movu m6, [srcq+r10*2]
+ movu m7, [srcq+r11*2]
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ packssdw m3, m3
+ movq r11, m3
+ test r11, r11
+ jz .filter
+ movsx r8, r11w
+ sar r11, 16
+ movsx r9, r11w
+ sar r11, 16
+ movsx r10, r11w
+ sar r11, 16
+ movu m1, [base+resize_shuf+8+r8*2]
+ movu m3, [base+resize_shuf+8+r9*2]
+ movu m8, [base+resize_shuf+8+r10*2]
+ movu m9, [base+resize_shuf+8+r11*2]
+ pshufb m4, m1
+ pshufb m5, m3
+ pshufb m6, m8
+ pshufb m7, m9
+.filter:
+ movd r8d, m2
+ pshuflw m2, m2, q3232
+ movd r9d, m2
+ punpckhqdq m2, m2
+ movd r10d, m2
+ psrlq m2, 32
+ movd r11d, m2
+ movq m8, [base+resize_filter+r8*8]
+ movq m2, [base+resize_filter+r9*8]
+ pxor m9, m9
+ punpcklbw m1, m9, m8
+ punpcklbw m3, m9, m2
+ psraw m1, 8
+ psraw m3, 8
+ movq m10, [base+resize_filter+r10*8]
+ movq m2, [base+resize_filter+r11*8]
+ punpcklbw m8, m9, m10
+ punpcklbw m9, m2
+ psraw m8, 8
+ psraw m9, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ pmaddwd m6, m8
+ pmaddwd m7, m9
+ phaddd m4, m5
+%else
+ movd r3, m1
+ pshuflw m1, m1, q3232
+ movd r1, m1
+ punpckhqdq m1, m1
+ movu m4, [srcq+r3*2]
+ movu m5, [srcq+r1*2]
+ movd r3, m1
+ psrlq m1, 32
+ movd r1, m1
+ movu m6, [srcq+r3*2]
+ movu m7, [srcq+r1*2]
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ pxor m1, m1
+ pcmpeqb m1, m3
+ pmovmskb r3d, m1
+ cmp r3d, 0xffff
+ je .filter
+ movd r3, m3
+ movu m1, [base+resize_shuf+8+r3*2]
+ pshuflw m3, m3, q3232
+ movd r1, m3
+ pshufb m4, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ punpckhqdq m3, m3
+ movd r3, m3
+ pshufb m5, m1
+ movu m1, [base+resize_shuf+8+r3*2]
+ psrlq m3, 32
+ movd r1, m3
+ pshufb m6, m1
+ movu m1, [base+resize_shuf+8+r1*2]
+ pshufb m7, m1
+.filter:
+ mova [esp+4*16], m6
+ mova [esp+5*16], m7
+ movd r3, m2
+ pshuflw m2, m2, q3232
+ movd r1, m2
+ movq m6, [base+resize_filter+r3*8]
+ movq m7, [base+resize_filter+r1*8]
+ pxor m3, m3
+ punpcklbw m1, m3, m6
+ punpcklbw m3, m7
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m4, m1
+ pmaddwd m5, m3
+ punpckhqdq m2, m2
+ movd r3, m2
+ psrlq m2, 32
+ movd r1, m2
+ phaddd m4, m5
+ movq m2, [base+resize_filter+r3*8]
+ movq m5, [base+resize_filter+r1*8]
+ mova m6, [esp+4*16]
+ mova m7, [esp+5*16]
+ pxor m3, m3
+ punpcklbw m1, m3, m2
+ punpcklbw m3, m5
+ psraw m1, 8
+ psraw m3, 8
+ pmaddwd m6, m1
+ pmaddwd m7, m3
+%endif
+ phaddd m6, m7
+ phaddd m4, m6
+ pxor m1, m1
+ psubd m2, m12, m4
+ psrad m2, 7
+ packssdw m2, m2
+ pmaxsw m2, m1
+ pminsw m2, [rsp+16*3*ARCH_X86_32]
+ movq [dstq+xq*2], m2
+ paddd m0, m15
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+ add dstq, dst_stridemp
+ add srcq, src_stridemp
+ dec hd
+ jg .loop_y
+ RET
diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm
new file mode 100644
index 0000000000..3b208033bd
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
@@ -0,0 +1,5669 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018-2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+ db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+ db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
+bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+
+wm_420_sign: dd 0x01020102, 0x01010101
+wm_422_sign: dd 0x80808080, 0x7f7f7f7f
+
+pb_64: times 4 db 64
+pw_m256: times 2 dw -256
+pw_15: times 2 dw 15
+pw_32: times 2 dw 32
+pw_34: times 2 dw 34
+pw_258: times 2 dw 258
+pw_512: times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+pd_0x3ff: dd 0x3ff
+pd_0x4000: dd 0x4000
+pq_0x40000000: dq 0x40000000
+
+cextern mc_subpel_filters
+cextern mc_warp_filter2
+cextern resize_filter
+
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
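+; The jump-table macros below store per-width offsets (dw entries for the
+; put/prep and scaled tables, dd entries for the bidir tables) relative to a
+; per-table base; callers index them by tzcnt(w), add the base back and jmp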
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SCALED_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32
+
+SECTION .text
+
+INIT_XMM avx2
+cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx2]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+INIT_YMM avx2
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
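+ ; for the nonzero 4-bit mx used on this path, mxyd*255 + 16 = (mx << 8) | (16 - mx),
+ ; so pmaddubsw below computes (16-mx)*src[x] + mx*src[x+1] per word, and
+ ; pmulhrsw with pw_2048 performs the rounding (+8) >> 4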
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, r7
+ jmp wq
+.h_w2:
+ movd xm0, [srcq+ssq*0]
+ pinsrd xm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pshufb xm1, xm4
+ pmaddubsw xm0, xm5
+ pmaddubsw xm1, xm5
+ pmulhrsw xm0, xm3
+ pmulhrsw xm1, xm3
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+8*4]
+ movu m2, [srcq+8*5]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -32*3
+.h_w128_loop:
+ movu m0, [srcq+r6+32*3+8*0]
+ movu m1, [srcq+r6+32*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+32*3], m0
+ add r6, 32
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 255
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16
+ add wq, r7
+ movd xm4, mxyd
+ vpbroadcastw m4, xm4
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xm1, xm1, q2301 ; 1 0
+ punpcklbw xm1, xm0
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 1
+ pextrw [dstq+dsq*1], xm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm1, xm2, xm0, 0x01 ; 0 1
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm2, xm0, 0x02 ; 1 2
+ punpcklbw xm1, xm2
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xm1, xm0, xm2
+ movq xm0, [srcq+ssq*0]
+ punpcklbw xm2, xm0
+ pmaddubsw xm1, xm4
+ pmaddubsw xm2, xm4
+ pmulhrsw xm1, xm5
+ pmulhrsw xm2, xm5
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m2, m3, m0, 0x0f ; 0 1
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m3, m0, 0xf0 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+%macro PUT_BILIN_V_W32 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+ PUT_BILIN_V_W32
+ RET
+.v_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+.v_w64_loop:
+ add srcq, ssq
+ movu m3, [srcq+32*0]
+ punpcklbw m2, m0, m3
+ punpckhbw m0, m3
+ pmaddubsw m2, m4
+ pmaddubsw m0, m4
+ pmulhrsw m2, m5
+ pmulhrsw m0, m5
+ packuswb m2, m0
+ mova m0, m3
+ movu m3, [srcq+32*1]
+ mova [dstq+32*0], m2
+ punpcklbw m2, m1, m3
+ punpckhbw m1, m3
+ pmaddubsw m2, m4
+ pmaddubsw m1, m4
+ pmulhrsw m2, m5
+ pmulhrsw m1, m5
+ packuswb m2, m1
+ mova m1, m3
+ mova [dstq+32*1], m2
+ add dstq, dsq
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ lea r6d, [hq+(3<<8)]
+ mov r4, srcq
+ mov r7, dstq
+.v_w128_loop:
+ PUT_BILIN_V_W32
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
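+ ; e.g. (illustrative) with intermediates src[x] = 1000, src[x+src_stride] = 1400
+ ; and my = 8, both forms give (16000 + 3200 + 128) >> 8 = (1000 + 200 + 8) >> 4 = 75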
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
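+ ; (my is at most 15; 15 << 12 = 61440 would not fit in a signed word,
+ ; while 15 << 11 = 30720 does)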
+ vpbroadcastd m7, [pw_15]
+ movd xm6, mxyd
+ add wq, r7
+ paddb m5, m5
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w2:
+ vpbroadcastd xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xm1, [srcq+ssq*0], 1
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 _ 2 _
+ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ pmulhw xm1, xm6
+ pavgw xm2, xm7
+ paddw xm1, xm2
+ psrlw xm1, 4
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 0
+ pextrw [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xm4, [bilin_h_shuf4]
+ movddup xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 2
+ shufps xm2, xm0, xm1, q1032 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ pmulhw xm1, xm6
+ pavgw xm2, xm7
+ paddw xm1, xm2
+ psrlw xm1, 4
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhw m1, m6
+ pavgw m2, m7
+ paddw m1, m2
+ psrlw m1, 4
+ vextracti128 xm2, m1, 1
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm2, [srcq+ssq*1+8*0]
+ vinserti128 m2, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ movu xm3, [srcq+ssq*0+8*0]
+ vinserti128 m3, [srcq+ssq*0+8*1], 1
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ pmulhw m1, m6
+ pavgw m0, m7
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ pmulhw m3, m6
+ pavgw m2, m7
+ paddw m3, m2
+ psrlw m1, 4
+ psrlw m3, 4
+ packuswb m1, m3
+ vpermq m1, m1, q3120
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w128:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w32_start
+.hv_w64:
+ lea r6d, [hq+(1<<16)]
+.hv_w32_start:
+ mov r4, srcq
+ mov r7, dstq
+.hv_w32:
+%if WIN64
+ movaps r4m, xmm8
+%endif
+.hv_w32_loop0:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ pmulhw m8, m6
+ pavgw m0, m7
+ paddw m8, m0
+ mova m0, m2
+ psubw m2, m3, m1
+ pmulhw m2, m6
+ pavgw m1, m7
+ paddw m2, m1
+ mova m1, m3
+ psrlw m8, 4
+ psrlw m2, 4
+ packuswb m8, m2
+ mova [dstq], m8
+ add dstq, dsq
+ dec hd
+ jg .hv_w32_loop
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<16
+ jg .hv_w32_loop0
+%if WIN64
+ movaps xmm8, r4m
+%endif
+ RET
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep%+SUFFIX]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xm0, [srcq+strideq*0]
+ pinsrd xm0, [srcq+strideq*1], 1
+ pinsrd xm0, [srcq+strideq*2], 2
+ pinsrd xm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ pmovzxbw m1, xm1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0+16*0]
+ pmovzxbw m1, [srcq+strideq*0+16*1]
+ pmovzxbw m2, [srcq+strideq*1+16*0]
+ pmovzxbw m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmovzxbw m0, [srcq+16*4]
+ pmovzxbw m1, [srcq+16*5]
+ pmovzxbw m2, [srcq+16*6]
+ pmovzxbw m3, [srcq+16*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
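+ ; i.e. prep keeps 4 fractional bits without rounding, matching the
+ ; psllw by 4 used in the unfiltered .prep copy path above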
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, xm1, 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+.h_w8_loop:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+.h_w16_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ movu xm2, [srcq+strideq*2+8*0]
+ vinserti128 m2, [srcq+strideq*2+8*1], 1
+ movu xm3, [srcq+stride3q +8*0]
+ vinserti128 m3, [srcq+stride3q +8*1], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+.h_w32_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*0+8*2]
+ vinserti128 m1, [srcq+strideq*0+8*3], 1
+ movu xm2, [srcq+strideq*1+8*0]
+ vinserti128 m2, [srcq+strideq*1+8*1], 1
+ movu xm3, [srcq+strideq*1+8*2]
+ vinserti128 m3, [srcq+strideq*1+8*3], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ movu xm0, [srcq+8* 8]
+ vinserti128 m0, [srcq+8* 9], 1
+ movu xm1, [srcq+8*10]
+ vinserti128 m1, [srcq+8*11], 1
+ movu xm2, [srcq+8*12]
+ vinserti128 m2, [srcq+8*13], 1
+ movu xm3, [srcq+8*14]
+ vinserti128 m3, [srcq+8*15], 1
+ add tmpq, 32*8
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 255
+ add mxyd, 16
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ jmp wq
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x05 ; 0 2 2 2
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpblendd m3, m2, 0x0f ; 1 1 3 3
+ vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
+ vpblendd m1, m3, 0xaa ; 0 1 2 3
+ vpblendd m2, m3, 0x55 ; 1 2 3 4
+ punpcklbw m1, m2
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+strideq*1]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m2, m3, 0xcc ; 1 3 1 3
+ vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2
+ vpblendd m2, m1, 0x0f ; 0 2 1 3
+ vpblendd m3, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti128 m0, [srcq+strideq*0]
+.v_w16_loop:
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ shufpd m4, m0, m2, 0x0c ; 0 2
+ vbroadcasti128 m0, [srcq+strideq*0]
+ shufpd m1, m3, 0x0c ; 1 3
+ shufpd m2, m0, 0x0c ; 2 4
+ punpcklbw m3, m4, m1
+ punpcklbw m5, m1, m2
+ punpckhbw m4, m1
+ punpckhbw m1, m2
+ pmaddubsw m3, m6
+ pmaddubsw m5, m6
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m5
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w32_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ vpermq m2, [srcq+strideq*2], q3120
+ vpermq m3, [srcq+stride3q ], q3120
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ punpcklbw m5, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*8
+ punpcklbw m1, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m5
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m1
+ mova [tmpq-32*1], m3
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+ vpermq m2, [srcq+strideq*1+32*0], q3120
+ vpermq m3, [srcq+strideq*1+32*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m0
+ punpcklbw m4, m1, m3
+ punpckhbw m5, m1, m3
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m5, m3, m1
+ punpckhbw m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m5, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m5
+ mova [tmpq-32*1], m3
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ lea r6d, [hq+(3<<8)]
+ mov r3, srcq
+ mov r5, tmpq
+.v_w128_loop0:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m2, m0, m1
+ punpckhbw m3, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ punpcklbw m4, m1, m0
+ punpckhbw m1, m0
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m2
+ mova [tmpq+32*1], m3
+ mova [tmpq+32*8], m4
+ mova [tmpq+32*9], m1
+ add tmpq, 32*16
+ sub hd, 2
+ jg .v_w128_loop
+ add r3, 32
+ add r5, 64
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .v_w128_loop0
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
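+    ; the horizontal pass leaves values scaled by 16; with mxyd << 11 in m6,
+    ; pmulhrsw(diff, m6) computes (diff*my + 8) >> 4, and adding the previous
+    ; row back keeps the output at the same x16 intermediate scale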
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+ vpbroadcastq m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq xm1, [srcq+strideq*1]
+ movhps xm1, [srcq+strideq*2]
+ movq xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movhps xm2, [srcq+strideq*0]
+ vinserti128 m1, xm2, 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ vpblendd m2, m1, m0, 0xc0
+ vpermq m2, m2, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m3, m0, m1, 0x21 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vperm2i128 m2, m1, m0, 0x21 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, strideq
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, [srcq+8*1], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ movu xm2, [srcq+8*2]
+ vinserti128 m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ mova [tmpq+32*0], m3
+ psubw m3, m2, m1
+ pmulhrsw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ dec hd
+ jg .hv_w32_loop
+ RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r6d, 256
+ jmp .hv_w64_start
+.hv_w64:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 128
+.hv_w64_start:
+%if WIN64
+ PUSH r7
+%endif
+ mov r5, srcq
+ mov r7, tmpq
+.hv_w64_loop0:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w64_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r6*0], m3
+ mova [tmpq+r6*1], m2
+ lea tmpq, [tmpq+r6*2]
+ sub hd, 2
+ jg .hv_w64_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r3b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r3d, 1<<8
+ jg .hv_w64_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
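+; Each constant packs two filter-table bases: bits 16-31 hold the 8-tap set
+; (regular/smooth/sharp) and the low bits the matching 4-tap set (sharp has
+; no 4-tap variant and reuses the regular one). The entry points add mx and
+; my replicated into three bytes (*0x010101), so afterwards bits 8-11 hold
+; the raw subpel offset (tested with 0xf00), the low byte the 4-tap filter
+; index (movzx mxb/myb in the small-width paths) and bits 16+ the 8-tap
+; index (shr mxd/myd, 16 in the wide paths).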
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xm4, [subpel_h_shuf4]
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm3
+ phaddw xm0, xm0
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm6
+ pshufb xm1, xm6
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ phaddw xm0, xm1
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
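+    ; The 8-tap row filter is applied as two 4-tap halves: m9 holds taps 0-3
+    ; and m10 taps 4-7, while subpel_h_shufA/B/C provide the overlapping byte
+    ; pairs each half needs; paddw/phaddw combine the pmaddubsw partial sums
+    ; before rounding with pw_34 and psraw 6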
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ pshufb m%2, m%1, m7
+ pshufb m%3, m%1, m8
+ pshufb m%1, m6
+ pmaddubsw m%4, m%2, m9
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m10
+ pmaddubsw m%1, m9
+ paddw m%3, m%4
+ paddw m%1, m%2
+ phaddw m%1, m%3
+ paddw m%1, m5
+ psraw m%1, 6
+%endmacro
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ PUT_8TAP_H 0, 2, 3, 4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 32
+ jle .h_loop
+ add srcq, ssq
+ add dstq, dsq
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters-put_avx2]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
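+    ; m8-m11 each hold one broadcast coefficient pair (taps 0-1, 2-3, 4-5,
+    ; 6-7); the rows are interleaved two at a time with punpcklbw below, so a
+    ; single pmaddubsw applies one pair of taps to two adjacent rows at once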
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrw xm2, [srcq+ssq*1], 2
+ pinsrw xm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w4_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m9 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, m10 ; a2 b2
+ paddw m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, m11 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ vextracti128 xm4, m5, 1
+ packuswb xm5, xm4
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-128]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*2]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ vbroadcasti128 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m3, [srcq+ssq*0]
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m13, [srcq+ssq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ vpermq m14, m14, q3120
+ mova [dstq+dsq*0], xm14
+ vextracti128 [dstq+dsq*1], m14, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
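+    ; the vertical taps are widened to words because the second pass operates
+    ; on the 16-bit output of the horizontal pass (pw_8192 rounds it to a x16
+    ; intermediate scale); pmaddwd accumulates in 32 bits and pd_512 with
+    ; psrad 10 rounds back to pixel range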
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m6, [subpel_h_shuf4]
+ movq xm2, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*1]
+ movq xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ movhps xm0, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m2, m3, 0x30
+ vpblendd m0, m1, 0x30
+ vpblendd m2, m4, 0xc0
+ pshufb m2, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ phaddw m2, m0
+ pmulhrsw m2, m8
+ vextracti128 xm3, m2, 1
+ palignr xm4, xm3, xm2, 4
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+ pshufd xm0, xm3, q2121
+ punpcklwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
+ pmaddwd xm5, xm1, xm10 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm11 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm12 ; a2 b2
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm8
+ paddd xm5, xm3
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm3, xm13 ; a3 b3
+ paddd xm5, xm9
+ paddd xm5, xm4
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m4, 0xcc ; 4 5
+ pshufb m2, m6
+ pshufb m0, m6
+ pshufb m3, m6
+ pshufb m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ pmaddubsw m3, m7
+ pmaddubsw m1, m7
+ phaddw m2, m0
+ phaddw m3, m1
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ palignr m4, m3, m2, 4
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m1, m10 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m11 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m12 ; a2 b2
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpblendd m4, m3, 0xcc ; 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ phaddw m4, m4
+ pmulhrsw m4, m8
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; 67 78
+ pmaddwd m4, m3, m13 ; a3 b3
+ paddd m5, m9
+ paddd m5, m4
+ psrad m5, 10
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ packuswb xm5, xm5
+ pshuflw xm5, xm5, q3120
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+ssq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+ssq*1]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
+ vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
+ add srcq, ss3q
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ pshufb %3, %1, %6
+ pshufb %4, %1, %7
+ pshufb %1, %5
+ pmaddubsw %2, %3, m10
+ pmaddubsw %4, m11
+ pmaddubsw %3, m11
+ pmaddubsw %1, m10
+ paddw %2, %4
+ paddw %1, %3
+ phaddw %1, %2
+%endmacro
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 r6m, m0, 1 ; not enough registers
+ movu xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_512]
+ vbroadcasti128 m6, r6m
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 10
+ psrad m7, 10
+ packssdw m8, m7
+ vextracti128 xm7, m8, 1
+ packuswb xm8, xm7
+ pshufd xm7, xm8, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r4, 8
+ add r7, 8
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%macro PREP_8TAP_H 0
+ pshufb m1, m0, m5
+ pshufb m2, m0, m6
+ pshufb m3, m0, m7
+ pmaddubsw m1, m8
+ pmaddubsw m0, m2, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m0, m3
+ phaddw m0, m1, m0
+ pmulhrsw m0, m4
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep%+SUFFIX]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pw_8192]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq m2, [srcq+strideq*2]
+ movq xm1, [srcq+strideq*1]
+ vpblendd m0, m2, 0xf0
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m2, 0xf0
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m6
+ pmaddubsw m1, m6
+ phaddw m0, m1
+ pmulhrsw m0, m4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu xm0, [srcq+r6+8*0]
+ vinserti128 m0, [srcq+r6+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+r6+8*2]
+ vinserti128 m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ add r6, 32
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only, having
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
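+    ; (the 4-tap filter sets are stored zero-padded to 8 coefficients, which
+    ; is what lets the 8-tap kernels be reused for them unchanged)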
+ lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ add srcq, stride3q
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*1]
+ vpbroadcastd m2, [srcq+strideq*2]
+ vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ lea srcq, [srcq+strideq*4]
+ pinsrd xm0, [srcq+strideq*0], 1
+ vpbroadcastd m3, [srcq+strideq*1]
+ vpbroadcastd m4, [srcq+strideq*2]
+ vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _
+ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m5 ; 67 78 89 9a
+ pmaddubsw m4, m1, m8
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m9
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m11
+ paddw m3, m4
+ pmaddubsw m4, m1, m10
+ paddw m3, m4
+ pmulhrsw m3, m7
+ mova [tmpq], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m5, m2, m9 ; a1
+ pmaddubsw m6, m2, m8 ; b0
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 67 78
+ pmaddubsw m1, m8 ; a0
+ pmaddubsw m4, m3, m9 ; b1
+ paddw m5, m1
+ mova m1, m3
+ pmaddubsw m3, m10 ; a2
+ paddw m6, m4
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 89 9a
+ pmaddubsw m4, m2, m11 ; a3
+ paddw m5, m4
+ pmaddubsw m4, m2, m10 ; b2
+ paddw m6, m4
+ pmaddubsw m4, m3, m11 ; b3
+ paddw m6, m4
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ mova [tmpq+32*0], m5
+ mova [tmpq+32*1], m6
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ add wd, wd
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+wq*8-256]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*0]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+strideq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m13, [srcq+strideq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova [tmpq+wq*0], m14
+ mova [tmpq+wq*1], m15
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp .hv_w8
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mova m7, [subpel_h_shuf4]
+ pmovzxbd m9, [deint_shuf4]
+ vpbroadcastd m10, [pw_8192]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ vpbroadcastq m2, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m6, 0xcc ; 4 5
+ pshufb m2, m7 ; 00 01 10 11 02 03 12 13
+ pshufb m0, m7 ; 20 21 30 31 22 23 32 33
+ pshufb m3, m7 ; 40 41 50 51 42 43 52 53
+ pshufb m1, m7 ; 60 61 60 61 62 63 62 63
+ pmaddubsw m2, m8
+ pmaddubsw m0, m8
+ pmaddubsw m3, m8
+ pmaddubsw m1, m8
+ phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b
+ phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m12 ; a0 b0
+ pmaddwd m6, m2, m12 ; c0 d0
+ pmaddwd m2, m13 ; a1 b1
+ pmaddwd m4, m3, m13 ; c1 d1
+ mova m1, m3
+ pmaddwd m3, m14 ; a2 b2
+ paddd m5, m2
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ paddd m6, m4
+ vpbroadcastq m4, [srcq+strideq*0]
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+strideq*1]
+ vpblendd m2, m4, 0xcc
+ vpbroadcastq m4, [srcq+strideq*2]
+ vpblendd m3, m4, 0xcc
+ pshufb m2, m7
+ pshufb m3, m7
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ phaddw m2, m3
+ pmulhrsw m2, m10
+ palignr m3, m2, m0, 12
+ mova m0, m2
+ punpcklwd m2, m3, m0 ; 67 78
+ punpckhwd m3, m0 ; 89 9a
+ pmaddwd m4, m2, m14 ; c2 d2
+ paddd m6, m11
+ paddd m5, m11
+ paddd m6, m4
+ pmaddwd m4, m2, m15 ; a3 b3
+ paddd m5, m4
+ pmaddwd m4, m3, m15 ; c3 d3
+ paddd m6, m4
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermd m5, m9, m5
+ mova [tmpq], m5
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+strideq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+strideq*0]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+strideq*0], 1 ; 1 4
+ vinserti128 m6, [srcq+strideq*1], 1 ; 2 5
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 6
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 [tmpq], m0, 1 ; not enough registers
+ movu xm0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_32]
+ vbroadcasti128 m6, [tmpq]
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 6
+ psrad m7, 6
+ packssdw m8, m7
+ vpermq m7, m8, q3120
+ mova [tmpq+wq*0], xm7
+ vextracti128 [tmpq+wq*2], m7, 1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 8
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
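+; While assembling the prep variant, each numbered GPR rN is temporarily
+; aliased one position down (rN -> r(N-1), the original r14 saved/restored),
+; presumably because prep_8tap_scaled takes one pointer argument fewer than
+; put_8tap_scaled; MC_8TAP_SCALED_RET switches back to the default mapping
+; around the actual RET.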
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
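+; Horizontally filter two source rows for the scaled path: the per-column
+; pixel offsets live in r4/r6/r7/r9/r10/r11/r13/rX and the per-column 8-tap
+; filters in m15/m10; the result holds both rows, already rounded by pw_8192.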
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movq xm%1, [srcq+ r4]
+ movq xm%2, [srcq+ r6]
+ movhps xm%1, [srcq+ r7]
+ movhps xm%2, [srcq+ r9]
+ vinserti128 m%1, [srcq+r10], 1
+ vinserti128 m%2, [srcq+r11], 1
+ vpbroadcastq m%5, [srcq+r13]
+ vpbroadcastq m%6, [srcq+ rX]
+ add srcq, ssq
+ movq xm%3, [srcq+ r4]
+ movq xm%4, [srcq+ r6]
+ movhps xm%3, [srcq+ r7]
+ movhps xm%4, [srcq+ r9]
+ vinserti128 m%3, [srcq+r10], 1
+ vinserti128 m%4, [srcq+r11], 1
+ vpbroadcastq m%7, [srcq+r13]
+ vpbroadcastq m%8, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m%1, m%5, 0xc0
+ vpblendd m%2, m%6, 0xc0
+ vpblendd m%3, m%7, 0xc0
+ vpblendd m%4, m%8, 0xc0
+ pmaddubsw m%1, m15
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m15
+ pmaddubsw m%4, m10
+ phaddw m%1, m%2
+ phaddw m%3, m%4
+ phaddw m%1, m%3
+ pmulhrsw m%1, m12
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+120]
+ %xdefine base_reg r11
+ %define rndshift 6
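+    ; put rounds the final 32-bit sums to pixel range ((x+512)>>10) while
+    ; prep keeps 4 extra bits of intermediate precision ((x+32)>>6) for the
+    ; compound averaging stage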
+%endif
+ lea base_reg, [%1_8tap_scaled_8bpc_avx2]
+%define base base_reg-%1_8tap_scaled_8bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm14, mxd
+ vpbroadcastd m14, xm14
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+ vpbroadcastd m14, mxm
+%endif
+ mov dyd, dym
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+112]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+112]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ vpbroadcastd m10, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+pw_8192]
+%ifidn %1, put
+ vpbroadcastd m13, [base+pd_512]
+%else
+ vpbroadcastd m13, [base+pd_32]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
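+    ; dy == 1024 steps exactly one source row per output row and dy == 2048
+    ; exactly two, so those cases get specialized loops with a fixed vertical
+    ; filter instead of recomputing it for every output row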
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0,1]
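+    ; each dword of m14 is a source x position with 10 fractional bits: bits
+    ; 6-9 of the fraction select the subpel filter, columns whose 4-bit index
+    ; is 0 get a unit filter (single tap of 64) blended in from pd_0x4000,
+    ; and psrld by 10 later extracts the integer pixel offsets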
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m15, m7, 0xaa
+ vpblendd m0, m2, 0xc0 ; 0 1 4 5
+ vpblendd m1, m3, 0xc0 ; 2 3 6 7
+ pblendvb m15, m11, m8
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1 ; 4 5 6 7
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ pmovsxbw xm11, xm11
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ pmaddwd xm8, xm4, xm11
+ paddd xm5, xm6
+ paddd xm7, xm8
+ paddd xm5, xm13
+ paddd xm5, xm7
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq], xm5, 0
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movq xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movhps xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd xm15, xm0
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m0, m9
+ psrld m14, 10
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m15, xm15, 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pblendvb m15, m11, m0
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ pmulhrsw m7, m12 ; 0 1 4 5
+ pmulhrsw m8, m12 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm11, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm11 ; 67
+ mova [rsp+0x00], xm7
+ mova [rsp+0x10], xm8
+ mova [rsp+0x20], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm10, r6q
+ pmovsxbw xm10, xm10
+ pshufd xm7, xm10, q0000
+ pshufd xm8, xm10, q1111
+ pshufd xm9, xm10, q2222
+ pshufd xm10, xm10, q3333
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pmaddwd xm6, xm2, xm9
+ pmaddwd xm7, xm3, xm10
+ paddd xm4, xm5
+ paddd xm6, xm7
+ paddd xm4, xm13
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq], xm4
+ add dstq, dsq
+%else
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ movu xm4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x00]
+ mova [rsp+0x00], xm1
+ mova xm1, [rsp+0x10]
+ mova [rsp+0x10], xm2
+ mova xm2, [rsp+0x20]
+ mova [rsp+0x20], xm3
+ pshufb xm4, xm14
+ pmaddubsw xm4, xm15
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm12
+ punpcklwd xm3, xm11, xm4
+ mova xm11, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm5, [srcq+ssq*1]
+ movu m6, [rsp+0x10]
+ pshufb xm4, xm14
+ pshufb xm5, xm14
+ pmaddubsw xm4, xm15
+ pmaddubsw xm5, xm15
+ movu [rsp+0x00], m6
+ phaddw xm4, xm5
+ pmulhrsw xm4, xm12
+ punpcklwd xm9, xm11, xm4
+ mova [rsp+0x20], xm9
+ psrldq xm11, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm11
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+.w8:
+ mov dword [rsp+48], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+48], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+48], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+48], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+48], 16
+ movifprep tmp_stridem, 256
+.w_start:
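+    ; [rsp+48] counts the remaining 8-column blocks; each pass through .hloop
+    ; filters one 8-pixel-wide strip over the full height before stepping mx
+    ; by dx*8 and moving dstq/tmpq to the next strip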
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+72], t0d
+ mov [rsp+56], srcq
+ mov [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+48]
+ jz .ret
+ add qword [rsp+64], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+16]
+ vpbroadcastd m15, [rsp+72]
+ pxor m9, m9
+ mov srcq, [rsp+56]
+ mov r0q, [rsp+64] ; dstq / tmpq
+.hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+16], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ mova [rsp], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklqdq xm11, xm11
+ pmovsxbw m11, xm11
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+52], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .vloop
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ add srcq, ss3q
+ movq xm10, r4q
+ pmovsxbw xm10, xm10
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ pshufd xm8, xm10, q0000
+ pshufd xm9, xm10, q1111
+ pshufd xm11, xm10, q3333
+ pshufd xm10, xm10, q2222
+ vpblendd m0, m2, 0xc0
+ pshufb m1, m14
+ pshufb m0, m14
+ pmaddubsw m1, m15
+ pmaddubsw m0, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ mova xm3, xm0
+ mova xm0, xm2
+ paddd xm5, xm13
+ paddd xm6, xm7
+ pshufb xm1, xm14
+ pmaddubsw xm1, xm15
+ phaddw xm1, xm1
+ pmulhrsw xm1, xm12
+ palignr xm7, xm1, xm4, 12
+ punpcklwd xm2, xm7, xm1 ; 67 78
+ pmaddwd xm7, xm2, xm11
+ mova xm4, xm1
+ paddd xm5, xm6
+ paddd xm5, xm7
+ psrad xm5, rndshift
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ vpermq m8, m8, q3120
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r11d, xm15, 1
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ movu xm2, [srcq+ssq*0]
+ movu xm3, [srcq+ssq*2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 1
+ vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20
+ vinserti128 m2, [srcq+ssq*1], 1
+ vinserti128 m3, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m4, [srcq+ssq*1], 1
+ add srcq, ss3q
+ vpblendd m15, m7, 0x30
+ punpcklqdq m15, m15
+ pblendvb m15, m11, m8
+ movq xm10, r4q
+ punpcklqdq xm10, xm10
+ pmovsxbw m10, xm10
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb xm5, xm14
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m2, m3
+ phaddw m4, m5
+ pmulhrsw m2, m12
+ pmulhrsw m4, m12
+ palignr m5, m4, m2, 4
+ pshufd m3, m4, q2121
+ punpcklwd m0, m2, m5 ; 01 12
+ punpckhwd m1, m2, m5 ; 23 34
+ punpcklwd m2, m4, m3 ; 45 56
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ vinserti128 m11, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ mova m0, m1
+ mova m1, m2
+ paddd m4, m13
+ paddd m5, m6
+ pshufb m11, m14
+ vpermq m11, m11, q3120
+ pmaddubsw m11, m15
+ phaddw m11, m11
+ pmulhrsw m11, m12
+ palignr m6, m11, m3, 12
+ punpcklwd m2, m6, m11 ; 67 78
+ mova m3, m11
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ pshuflw xm4, xm4, q3120
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ pshufd xm4, xm4, q3120
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+.dy1_w8:
+ mov dword [rsp+72], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+72], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+72], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+72], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+72], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+76], t0d
+ mov [rsp+80], srcq
+ mov [rsp+88], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+96], xm0
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+72]
+ jz .ret
+ add qword [rsp+88], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+32]
+ vpbroadcastd m15, [rsp+76]
+ pxor m9, m9
+ mov srcq, [rsp+80]
+ mov r0q, [rsp+88] ; dstq / tmpq
+.dy1_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+32], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movq [rsp+64], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ movu [rsp], m10
+ vpbroadcastd m8, [rsp+0x60]
+ vpbroadcastd m9, [rsp+0x64]
+ vpbroadcastd m10, [rsp+0x68]
+ vpbroadcastd m11, [rsp+0x6c]
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, [rsp]
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ vpbroadcastq m2, [srcq+ssq*1]
+ movhps xm0, [srcq+ssq*2]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ movhps xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vpblendd m0, m2, 0x30
+ vpblendd m1, m4, 0xc0
+ vpblendd m0, m3, 0xc0
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ movq xm11, r4q
+ pmovsxbw xm11, xm11
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 01 23
+ punpckhwd xm2, xm1 ; 23 45
+.dy2_w2_loop:
+ movq xm6, [srcq+ssq*0]
+ vpbroadcastq m7, [srcq+ssq*1]
+ movhps xm6, [srcq+ssq*2]
+ vpbroadcastq m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm3, xm8
+ pmaddwd xm5, xm2, xm9
+ vpblendd m6, m7, 0x30
+ vpblendd m6, m1, 0xc0
+ pshufb m6, m14
+ pmaddubsw m6, m15
+ phaddw m6, m6
+ pmulhrsw m6, m12
+ palignr m0, m6, m0, 8
+ pshufd m2, m0, q3221
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 45 67
+ punpckhwd xm2, xm1 ; 67 89
+ pmaddwd xm6, xm3, xm10
+ pmaddwd xm7, xm2, xm11
+ paddd xm4, xm5
+ paddd xm4, xm13
+ paddd xm6, xm7
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+ packuswb xm4, xm4
+ pextrw [dstq+dsq*0], xm4, 0
+ pextrw [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*2]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm1, [srcq+ssq*1]
+ movu xm3, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vinserti128 m15, xm15, 1
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m2, [srcq+ssq*0], 1
+ vinserti128 m3, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pblendvb m15, m11, m8
+ pshufb xm0, xm14
+ pshufb m2, m14
+ pshufb xm1, xm14
+ pshufb m3, m14
+ pmaddubsw xm0, xm15
+ pmaddubsw m2, m15
+ pmaddubsw xm1, xm15
+ pmaddubsw m3, m15
+ movq xm11, r4q
+ punpcklqdq xm11, xm11
+ pmovsxbw m11, xm11
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ punpcklwd xm2, xm0, xm1
+ punpckhwd m1, m0, m1 ; 23 45
+ vinserti128 m0, m2, xm1, 1 ; 01 23
+.dy2_w4_loop:
+ movu xm6, [srcq+ssq*0]
+ movu xm7, [srcq+ssq*1]
+ vinserti128 m6, [srcq+ssq*2], 1
+ vinserti128 m7, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ psrld m2, m6, 16
+ pslld m3, m7, 16
+ paddw m6, m2
+ paddw m7, m3
+ pblendw m6, m7, 0xaa ; 67 89
+ pmulhrsw m6, m12
+ paddd m4, m5
+ vperm2i128 m0, m1, m6, 0x21 ; 45 67
+ mova m1, m6
+ pmaddwd m6, m0, m10
+ pmaddwd m7, m1, m11
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+.dy2_w8:
+ mov dword [rsp+40], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+40], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+40], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+40], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+40], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+64], t0d
+ mov [rsp+48], srcq
+ mov [rsp+56], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ movq xm0, r4q
+ pmovsxbw xm0, xm0
+ mova [rsp+0x50], xm0
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+40]
+ jz .ret
+ add qword [rsp+56], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp]
+ vpbroadcastd m15, [rsp+64]
+ pxor m9, m9
+ mov srcq, [rsp+48]
+ mov r0q, [rsp+56] ; dstq / tmpq
+.dy2_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ vpbroadcastd m8, [rsp+0x50]
+ vpbroadcastd m9, [rsp+0x54]
+ vpbroadcastd m11, [rsp+0x58]
+ vpbroadcastd m4, [rsp+0x5c]
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ SWAP m14, m4
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m14
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ vpbroadcastq m5, [srcq+r13]
+ vpbroadcastq m6, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m3, m5, 0xc0
+ vpblendd m4, m6, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ phaddw m3, m4
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ psrld m5, m3, 16
+ pslld m6, m4, 16
+ paddw m3, m5
+ paddw m4, m6
+ pblendw m3, m4, 0xaa
+ pmulhrsw m3, m12
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
+ ; Can be done using gathers, but that's terribly slow on many CPUs
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklwd m8, m0
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ pmaddwd m%2, m8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m0, m%5
+ paddd m%2, m9
+ paddd m0, m8
+ paddd m%1, m0, m%2
+%endmacro
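+
+; A hedged C-style sketch of the vertical filter selection performed by WARP_V
+; above (illustrative only, not part of the original patch; "filter" is the
+; 64-entry 8-tap table loaded into filterq):
+;
+;   for (int col = 0; col < 8; col++) {
+;       const int8_t *f = filter[(my + col * delta) >> 10];
+;       /* accumulate f against the 8 intermediate rows of this column */
+;   }
+;   my += gamma; /* once per output row */
+;
+; The "beta -= alpha*3" / "gamma -= delta*3" adjustments in .main compensate
+; for the three steps already accumulated by the interleaved scalar address
+; math, so no gathers are needed.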
+
+cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
+%if WIN64
+ sub rsp, 0xa0
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
+.loop:
+ psrad m7, 13
+ psrad m0, 13
+ packssdw m7, m0
+ pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+
+cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+ beta, filter, tmp1, delta, my, gamma
+%if WIN64
+ sub rsp, 0xa0
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 0xa0
+ %assign stack_offset stack_offset+stack_size_padded
+%endif
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 18
+ psrad m0, 18
+ packusdw m7, m0
+ pavgw m7, m11 ; (x + (1 << 10)) >> 11
+ vextracti128 xm0, m7, 1
+ packuswb xm7, xm0
+ pshufd xm7, xm7, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+ movaps [rsp+stack_offset+0x10], xmm6
+ movaps [rsp+stack_offset+0x20], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+ movaps [rsp+0x98], xmm15
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ mova m12, [warp_8x8_shufA]
+ mova m13, [warp_8x8_shufB]
+ vpbroadcastd m14, [pw_8192]
+ vpbroadcastd m15, [pd_32768]
+ pxor m11, m11
+ lea filterq, [mc_warp_filter2]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m
+ call .h
+ psrld m1, m0, 16
+ call .h
+ psrld m4, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 02
+ call .h
+ pblendw m4, m0, 0xaa ; 13
+ call .h
+ psrld m2, m1, 16
+ pblendw m2, m0, 0xaa ; 24
+ call .h
+ psrld m5, m4, 16
+ pblendw m5, m0, 0xaa ; 35
+ call .h
+ psrld m3, m2, 16
+ pblendw m3, m0, 0xaa ; 46
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10)
+ mov r4d, 4
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2:
+ call .h
+ psrld m6, m5, 16
+ pblendw m6, m0, 0xaa ; 57
+ WARP_V 7, 1, 3, 4, 6
+ call .h
+ mova m1, m2
+ mova m2, m3
+ psrld m3, 16
+ pblendw m3, m0, 0xaa ; 68
+ WARP_V 0, 4, 6, 1, 3
+ mova m4, m5
+ mova m5, m6
+ ret
+ALIGN function_align
+.h:
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ vbroadcasti128 m10, [srcq]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+mxq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklqdq m8, m0 ; 0 1 4 5
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ punpcklqdq m9, m0 ; 2 3 6 7
+ pshufb m0, m10, m12
+ pmaddubsw m0, m8
+ pshufb m10, m13
+ pmaddubsw m10, m9
+ add srcq, ssq
+ phaddw m0, m10
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
+ ret
+
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ cmp hd, 8
+ je .ret
+ %1 2
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.ret:
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 4
+ %1 0
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+32], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1 0
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+0*32], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+1*32], m0
+ %1_INC_PTR 8
+ %1 -4
+ vpermq m0, m0, q3120
+ mova [dstq+2*32], m0
+ %1 -2
+ vpermq m0, m0, q3120
+ mova [dstq+3*32], m0
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ mova m0, [tmp1q+(%1+0)*32]
+ paddw m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ paddw m1, [tmp2q+(%1+1)*32]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*32
+ add tmp2q, %1*32
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg %+ SUFFIX %+ _table
+ lea r6, [avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m2, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+ mova m0, [tmp1q+(%1+0)*32]
+ psubw m2, m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ psubw m3, m1, [tmp2q+(%1+1)*32]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
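+
+; Scalar reference for the weighted average above (a hedged sketch, not part
+; of the original patch; a/b are the signed 16-bit "prep" intermediates and
+; weight is the a-side weight out of 16):
+;
+;   static inline int w_avg_px(int a, int b, int weight) {
+;       return (a * weight + b * (16 - weight) + 128) >> 8;
+;   }
+;
+; The vector code folds the constants into ((weight - 16) << 12) so pmulhw can
+; be used, rounds with (x + 8) >> 4 via pmulhrsw, and clamps with packuswb.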
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg %+ SUFFIX %+ _table
+ lea r6, [w_avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+ vpermq m3, [maskq+%1*16], q3120
+ mova m0, [tmp2q+(%1+0)*32]
+ psubw m1, m0, [tmp1q+(%1+0)*32]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*32]
+ psubw m2, m1, [tmp1q+(%1+1)*32]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
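+
+; Scalar reference for the per-pixel mask blend above (a hedged sketch, not
+; part of the original patch; m is the 0..64 blend weight read from maskq):
+;
+;   static inline int mask_px(int a, int b, int m) {
+;       return (a * m + b * (64 - m) + 512) >> 10;
+;   }
+;
+; The code uses the equivalent ((b - a) * (-m << 10) >> 16) + b form so the
+; multiply fits pmulhw, then rounds with (x + 8) >> 4 and clamps to 8-bit.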
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*16
+ add tmp2q, %1*32
+ add tmp1q, %1*32
+%endmacro
+
+cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask %+ SUFFIX %+ _table
+ lea r7, [mask %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ pxor m4, m4
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
+ mova m%1, [tmp1q+32*%3]
+ mova m1, [tmp2q+32*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+32*%4]
+ mova m2, [tmp2q+32*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ psrlw m3, 8
+%if %5
+ packuswb m%2, m3
+ psubb m%2, m5, m%2
+ vpermq m%2, m%2, q3120
+%else
+ phaddw m%2, m3
+%endif
+ psllw m3, 10
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
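+
+; Rough scalar model of W_MASK (a hedged sketch, not part of the original
+; patch): the blend weight is derived from the difference of the two
+; intermediates and then reused for the blend, approximately
+;
+;   m   = 64 - (satsub_u16(6903, abs(t1 - t2)) >> 8);        /* 38..64 */
+;   dst = clamp_u8((t1 * m + t2 * (64 - m) + 512) >> 10);
+;
+; where 6903 is the pw_6903 constant commented below as
+; ((64 - 38) << 8) + 255 - 8. The 4:2:0 caller then stores one subsampled
+; value per 2x2 block, formed from the four per-pixel weights together with
+; the "258 - sign" constant applied to the 64-m values kept in registers.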
+
+cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn maskq, maskmp
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pb_64]
+ vpbroadcastd m5, [base+pw_512]
+ sub tmpq, maskq
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ vpbroadcastd xm1, [dstq+dsq*2]
+ pinsrd xm1, [dstq+r6 ], 3
+ mova xm6, [maskq]
+ psubb xm3, xm4, xm6
+ punpcklbw xm2, xm3, xm6
+ punpckhbw xm3, xm6
+ mova xm6, [maskq+tmpq]
+ add maskq, 4*4
+ punpcklbw xm0, xm6
+ punpckhbw xm1, xm6
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm3
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ packuswb xm0, xm1
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ pextrd [dstq+dsq*2], xm0, 2
+ pextrd [dstq+r6 ], xm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ movq xm1, [dstq+dsq*0]
+ movhps xm1, [dstq+dsq*1]
+ vpbroadcastq m2, [dstq+dsq*2]
+ vpbroadcastq m3, [dstq+r6 ]
+ mova m0, [maskq]
+ mova m6, [maskq+tmpq]
+ add maskq, 8*4
+ vpblendd m1, m2, 0x30
+ vpblendd m1, m3, 0xc0
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ mova m0, [maskq]
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ mova m6, [maskq+tmpq]
+ add maskq, 16*2
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ mova m0, [maskq]
+ mova m1, [dstq]
+ mova m6, [maskq+tmpq]
+ add maskq, 32
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
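+
+; blend_8bpc above applies a per-pixel 6-bit blend of dst with tmp; as a
+; hedged scalar sketch (not part of the original patch):
+;
+;   dst[x] = (dst[x] * (64 - m[x]) + tmp[x] * m[x] + 32) >> 6;
+;
+; pmaddubsw forms dst*(64-m) + tmp*m for interleaved byte pairs, and pmulhrsw
+; with pw_512 supplies the (x + 32) >> 6 rounding.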
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx2_table
+ jmp wq
+.w2:
+ vpbroadcastd xm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+ALIGN function_align
+.w4:
+ vpbroadcastq xm2, [maskq+4*2]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ mova xm3, [maskq+8*2]
+.w8_loop:
+ movq xm0, [dstq+dsq*0]
+ vpbroadcastq xm1, [dstq+dsq*1]
+ mova xm2, [tmpq]
+ add tmpq, 8*2
+ punpcklbw xm0, xm2
+ punpckhbw xm1, xm2
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m3, [maskq+16*2]
+ vbroadcasti128 m4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ mova xm3, [maskq+16*4]
+ vinserti128 m3, [maskq+16*6], 1
+ mova xm4, [maskq+16*5]
+ vinserti128 m4, [maskq+16*7], 1
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+ RET
+
+cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xm2, xm2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+ALIGN function_align
+.w4:
+ mova xm3, [blend_shuf]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xm2, xm3
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x03
+.w8_loop:
+ vpbroadcastq m1, [dstq+dsq*0]
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m1, 0x30
+ vpbroadcastd m3, [maskq+hq*2]
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb m3, m4
+ punpcklbw m0, m1
+ pmaddubsw m0, m3
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32: ; w32/w64/w128
+ sub dsq, r6
+.w32_loop0:
+ vpbroadcastw m3, [maskq+hq*2]
+ mov wd, r6d
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 32
+ sub wd, 32
+ jg .w32_loop
+ add dstq, dsq
+ inc hq
+ jl .w32_loop0
+ RET
+
+cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq
+ test yq, yq
+ cmovs r10, r12
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ add srcq, r10
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1]
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1]
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
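+
+ ; The setup above mirrors this scalar logic (hedged C-style sketch, not
+ ; part of the original patch):
+ ;
+ ;   src        += iclip(y, 0, ih - 1) * src_stride + iclip(x, 0, iw - 1);
+ ;   top_ext     = iclip(-y,          0, bh - 1);
+ ;   bottom_ext  = iclip(y + bh - ih, 0, bh - 1);
+ ;   left_ext    = iclip(-x,          0, bw - 1);
+ ;   right_ext   = iclip(x + bw - iw, 0, bw - 1);
+ ;   center_h    = bh - top_ext - bottom_ext;
+ ;   center_w    = bw - left_ext - right_ext;
+ ;   dst        += top_ext * dst_stride;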
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastb m0, [srcq]
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, 32
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3]
+%if %1
+ movu [r12+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, 32
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ add r12, centerwq
+%else
+ lea r12, [dstq+centerwq]
+%endif
+ xor r3, r3
+ vpbroadcastb m0, [srcq+centerwq-1]
+.right_loop_%3:
+ movu [r12+r3], m0
+ add r3, 32
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+.bottom_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m
+ mov dstq, dstm
+ xor r1, r1
+.top_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+
+ vpbroadcastd xm3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti128 m15, [base+pb_8x0_8x8]
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14
+ paddd m8, m2 ; mx+[0..7]*dx
+ pxor m2, m2
+
+ ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
+ ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8
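+ ;
+ ; Per-output-pixel model of the loops below (a hedged sketch, not part of
+ ; the original patch; apply_8tap is a hypothetical helper for the 8-tap
+ ; horizontal filter):
+ ;
+ ;   for (int x = 0; x < dst_w; x++, mx += dx) {
+ ;       const int src_x = iclip(mx >> 14, 0, src_w);  /* src_w already -8 */
+ ;       const int sum   = apply_8tap(src + src_x,
+ ;                                    resize_filter[(mx >> 8) & 63]);
+ ;       dst[x] = clamp_u8((sum + 64) >> 7); /* sign folded into pw_m256 */
+ ;   }
+ ;
+ ; The edge-emulation path additionally pshufb-replicates boundary pixels
+ ; whenever the clip above changed src_x.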
+
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m7 ; filter offset (masked)
+
+ ; load source pixels - this ugly code is vpgatherdq emulation since
+ ; directly using vpgatherdq on Haswell is quite a bit slower :(
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movq xm12, [srcq+r8]
+ movq xm13, [srcq+r10]
+ movhps xm12, [srcq+r9]
+ movhps xm13, [srcq+r11]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m12, [srcq+r8], 1
+ vinserti128 m13, [srcq+r10], 1
+ vpbroadcastq m10, [srcq+r9]
+ vpbroadcastq m11, [srcq+r11]
+ vpblendd m12, m10, 11000000b
+ vpblendd m13, m11, 11000000b
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ vptest m1, m1
+ jz .filter
+
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vextracti128 xm1, m1, 1
+ movq xm14, [base+resize_shuf+4+r8]
+ movq xm0, [base+resize_shuf+4+r10]
+ movhps xm14, [base+resize_shuf+4+r9]
+ movhps xm0, [base+resize_shuf+4+r11]
+ movq r9, xm1
+ pextrq r11, xm1, 1
+ movsxd r8, r9d
+ sar r9, 32
+ movsxd r10, r11d
+ sar r11, 32
+ vinserti128 m14, [base+resize_shuf+4+r8], 1
+ vinserti128 m0, [base+resize_shuf+4+r10], 1
+ vpbroadcastq m10, [base+resize_shuf+4+r9]
+ vpbroadcastq m11, [base+resize_shuf+4+r11]
+ vpblendd m14, m10, 11000000b
+ vpblendd m0, m11, 11000000b
+
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m12, m14
+ pshufb m13, m0
+
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm10, [base+resize_filter+r8*8]
+ movq xm11, [base+resize_filter+r10*8]
+ movhps xm10, [base+resize_filter+r9*8]
+ movhps xm11, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vinserti128 m10, [base+resize_filter+r8*8], 1
+ vinserti128 m11, [base+resize_filter+r10*8], 1
+ vpbroadcastq m14, [base+resize_filter+r9*8]
+ vpbroadcastq m1, [base+resize_filter+r11*8]
+ vpblendd m10, m14, 11000000b
+ vpblendd m11, m1, 11000000b
+
+ pmaddubsw m12, m10
+ pmaddubsw m13, m11
+ phaddw m12, m13
+ vextracti128 xm13, m12, 1
+ phaddsw xm12, xm13
+ pmulhrsw xm12, xm3 ; x=(x+64)>>7
+ packuswb xm12, xm12
+ movq [dstq+xq], xm12
+
+ paddd m4, m5
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m9, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign
+ add wq, r7
+ W_MASK 0, 4, 0, 1
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm0, m4, 1
+ vpblendd xm1, xm4, xm0, 0x05
+ vpblendd xm4, xm0, 0x0a
+ pshufd xm1, xm1, q2301
+ psubw xm4, xm8, xm4
+ psubw xm4, xm1
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [maskq], xm4
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ phaddd m4, m5
+ vextracti128 xm1, m0, 1
+ psubw m4, m8, m4
+ psrlw m4, 2
+ vpermd m4, m9, m4
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], xm4
+ RET
+.w8_loop:
+ add tmp1q, 2*32
+ add tmp2q, 2*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 8
+.w8:
+ vextracti128 xm2, m4, 1
+ vextracti128 xm1, m0, 1
+ psubw xm4, xm8, xm4
+ psubw xm4, xm2
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ movq [maskq], xm4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ punpckhqdq m1, m4, m5
+ punpcklqdq m4, m5
+ psubw m1, m8, m1
+ psubw m1, m4
+ psrlw m1, 2
+ vpermq m0, m0, q3120
+ packuswb m1, m1
+ vpermd m1, m9, m1
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], xm1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ psubw m4, m8, m4
+ psubw m4, m5
+ psrlw m4, 2
+ vpermq m0, m0, q3120
+ packuswb m4, m4
+ vpermd m4, m9, m4
+ mova [dstq+strideq*1], m0
+ mova [maskq], xm4
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+ dec hd
+.w64_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ test hd, 1
+ jz .w64_loop_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq], m4
+ add maskq, 32
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop_even:
+ psubw m12, m8, m4
+ psubw m13, m8, m5
+ dec hd
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ add tmp1q, 8*32
+ add tmp2q, 8*32
+ test hd, 1
+ jz .w128_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*0], m4
+ jmp .w128_odd
+.w128_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+.w128_odd:
+ W_MASK 0, 4, -4, -3
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, -2, -1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ test hd, 1
+ jz .w128_loop_even
+ psubw m4, m12, m4
+ psubw m5, m13, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*1], m4
+ add maskq, 64
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ pxor m9, m9
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m10, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign
+ add wq, r7
+ mov maskq, maskmp
+ W_MASK 0, 4, 0, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ mova [maskq], xm5
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermd m5, m10, m5
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], m5
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vextracti128 xm5, m4, 1
+ vextracti128 xm1, m0, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], xm5
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], m5
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*1], m0
+ mova [maskq], m5
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq], m5
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq+32*0], m5
+ W_MASK 0, 4, 4, 5
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, 6, 7
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*3], m0
+ mova [maskq+32*1], m5
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ add wq, r7
+ W_MASK 0, 4, 0, 1, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ mova [maskq+32*0], m4
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ je .w4_end
+ W_MASK 0, 4, 2, 3, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq+32*1], m4
+.w4_end:
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], m4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [maskq], m4
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ mova [maskq], m4
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*4
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ W_MASK 0, 4, 4, 5, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ mova [maskq+32*2], m4
+ W_MASK 0, 4, 6, 7, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ mova [maskq+32*3], m4
+ dec hd
+ jg .w128_loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm
new file mode 100644
index 0000000000..7897f1decc
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
@@ -0,0 +1,4538 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+obmc_masks:
+pw_512: times 2 dw 512
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
+warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20
+ db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22
+ db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24
+ db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26
+warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24
+ db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26
+ db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28
+ db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30
+warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
+warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
+pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
+warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43
+pd_16384: dd 16384
+pd_262144: dd 262144
+warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54
+warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59
+ db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63
+bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
+ db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
+ db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
+ db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
+ db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
+ db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
+ db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+ db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
+ db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
+bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
+ db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
+bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
+ db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
+bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
+ db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
+ db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
+bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
+ db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
+bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
+spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
+ db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
+spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+ db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
+spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
+ db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
+spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7
+ db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+ db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23
+ db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
+ db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
+ db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
+ db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
+ db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
+spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
+ db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
+spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
+ db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
+spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13
+ db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
+spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
+ db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
+ db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
+ db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
+spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55
+ db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63
+ db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71
+ db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79
+spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13
+ db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29
+ db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45
+ db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61
+spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55
+ db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63
+spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36
+ db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38
+spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44
+ db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46
+ db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52
+ db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54
+spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40
+ db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42
+ db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48
+ db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50
+spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8
+ db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10
+ db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16
+ db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18
+spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12
+ db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14
+ db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20
+ db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22
+spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+resize_permC: dd 0, 4, 8, 12
+pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
+
+wm_420_perm64: dq 0xfedcba9876543210
+wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+pb_127: times 4 db 127
+pw_m128: times 2 dw -128
+pw_m256: times 2 dw -256
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_34: dd 34
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+
+%define pb_m64 (wm_sign+4)
+%define pb_64 (wm_sign+8)
+%define pd_2 (pd_0to7+8)
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+cextern mc_warp_filter
+cextern resize_filter
+
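+ ; The *_JMP_TABLE macros below build per-width dispatch tables: each entry
+ ; is the offset of a .w<N> (or .h_w<N>/.v_w<N>/.hv_w<N>) label relative to a
+ ; base symbol, loaded at run time as "movzx wd, word [table+wq*2]" followed
+ ; by "add wq, base; jmp wq". HV_JMP_TABLE emits separate h/v/hv tables
+ ; according to its type mask, and BIDIR_JMP_TABLE stores 32-bit entries.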
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 2-*
+ %xdefine %1_%2_table (%%table - 2*%3)
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
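+ ; WRAP_YMM temporarily switches the register mappings to 256-bit ymm for a
+ ; single wrapped macro/instruction (e.g. the 8-wide put_8tap path), then
+ ; restores the 512-bit zmm mappings.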
+%macro WRAP_YMM 1+
+INIT_YMM cpuname
+ %1
+INIT_ZMM cpuname
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea r7, [put_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
+ jmp wq
+.put_w2:
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu xmm0, [srcq+ssq*0]
+ movu xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], xmm0
+ mova [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu ym0, [srcq+ssq*0]
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], ym0
+ mova [dstq+dsq*1], ym1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+ssq*0+64*0]
+ movu m1, [srcq+ssq*0+64*1]
+ movu m2, [srcq+ssq*1+64*0]
+ movu m3, [srcq+ssq*1+64*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+64*0], m0
+ mova [dstq+dsq*0+64*1], m1
+ mova [dstq+dsq*1+64*0], m2
+ mova [dstq+dsq*1+64*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
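+ ; The two weights are packed into one word for pmaddubsw: mxy*0xff01 plus
+ ; 16 << 8 leaves ((16-mx) << 8) | mx in the low word, and bilin_h_shuf8
+ ; pairs the pixels as (src[x+1], src[x]), so each lane computes
+ ; mx*src[x+1] + (16-mx)*src[x]. E.g. mx=4: 4*0xff01 + 0x1000 = 0x40c04,
+ ; low word 0x0c04 = (12 << 8) | 4. The +8 and >> 4 are applied later via
+ ; pmulhrsw with pw_2048.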
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, r7
+ jmp wq
+.h_w2:
+ movd xmm0, [srcq+ssq*0]
+ pinsrd xmm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xmm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+ pmulhrsw xmm0, xm3
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ pmulhrsw ym0, ym3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ add srcq, ssq
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m6, [srcq+8*9]
+ add srcq, ssq
+ REPX {pshufb x, m4}, m0, m2, m1, m6
+ REPX {pmaddubsw x, m5}, m0, m2, m1, m6
+ REPX {pmulhrsw x, m3}, m0, m2, m1, m6
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0xff01
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16 << 8
+ add wq, r7
+ vpbroadcastw m4, mxyd
+ jmp wq
+.v_w2:
+ movd xmm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xmm1, xmm1, q2301 ; 1 0
+ punpcklbw xmm1, xmm0, xmm1
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 1
+ pextrw [dstq+dsq*1], xmm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm1, xmm0, 0x02 ; 1 2
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xm4
+ pmulhrsw xmm1, xm5
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xmm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xmm1, xmm3, xmm0
+ movq xmm0, [srcq+ssq*0]
+ punpcklbw xmm2, xmm0, xmm3
+ pmaddubsw xmm1, xm4
+ pmaddubsw xmm2, xm4
+ pmulhrsw xmm1, xm5
+ pmulhrsw xmm2, xm5
+ packuswb xmm1, xmm2
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xmm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 ymm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1
+ vbroadcasti128 ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2
+ punpcklbw ymm1, ymm2, ymm3
+ punpckhbw ymm2, ymm3
+ pmaddubsw ymm1, ym4
+ pmaddubsw ymm2, ym4
+ pmulhrsw ymm1, ym5
+ pmulhrsw ymm2, ym5
+ packuswb ymm1, ymm2
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ vzeroupper
+ RET
+.v_w32:
+ movu ym0, [srcq+ssq*0]
+ kxnorb k1, k1, k1
+.v_w32_loop:
+ vbroadcasti32x8 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendmd m3{k1}, m2, m0 ; 0 1
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ vpblendmd m2{k1}, m0, m2 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+ssq*0]
+.v_w64_loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m3, m0
+ punpckhbw m6, m3, m0
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m6, m4
+ punpcklbw m2, m0, m3
+ punpckhbw m7, m0, m3
+ pmaddubsw m2, m4
+ pmaddubsw m7, m4
+ REPX {pmulhrsw x, m5}, m1, m6, m2, m7
+ packuswb m1, m6
+ packuswb m2, m7
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w128_loop:
+ add srcq, ssq
+ movu m2, [srcq+64*0]
+ movu m3, [srcq+64*1]
+ punpcklbw m6, m2, m0
+ pmaddubsw m6, m4
+ punpckhbw m0, m2, m0
+ pmaddubsw m0, m4
+ punpcklbw m7, m3, m1
+ pmaddubsw m7, m4
+ punpckhbw m1, m3, m1
+ pmaddubsw m1, m4
+ REPX {pmulhrsw x, m5}, m6, m0, m7, m1
+ packuswb m6, m0
+ mova m0, m2
+ packuswb m7, m1
+ mova m1, m3
+ mova [dstq+64*0], m6
+ mova [dstq+64*1], m7
+ add dstq, dsq
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
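+ ; my is kept as my << 11 because my << 12 no longer fits in a signed 16-bit
+ ; word for my >= 8; the halved scale is compensated by doubling the row
+ ; difference (paddw x, x) before pmulhw, so pmulhw yields (my*diff) >> 4 on
+ ; the 16*pel-scale horizontal intermediates.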
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ vpbroadcastd m7, [pw_2048]
+ add wq, r7
+ vpbroadcastw m6, mxyd
+ jmp wq
+.hv_w2:
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ pshufb xmm0, xm4
+ pmaddubsw xmm0, xm5
+.hv_w2_loop:
+ movd xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xmm1, [srcq+ssq*0], 1
+ pshufb xmm1, xm4
+ pmaddubsw xmm1, xm5 ; 1 _ 2 _
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ pextrw [dstq+dsq*0], xmm1, 0
+ pextrw [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xmm4, [bilin_h_shuf4]
+ movddup xmm0, [srcq+ssq*0]
+ pshufb xmm0, xmm4
+ pmaddubsw xmm0, xm5
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0]
+ pshufb xmm1, xmm4
+ pmaddubsw xmm1, xm5 ; 1 2
+ shufps xmm2, xmm0, xmm1, q1032 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm6
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm7
+ packuswb xmm1, xmm1
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 ym0, [srcq+ssq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym1, [srcq+ssq*0], 1
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2
+ valignq ym2, ym1, ym0, 2
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym6
+ paddw ym1, ym2
+ pmulhrsw ym1, ym7
+ vpmovuswb xm1, ym1
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ vbroadcasti32x8 m0, [srcq+ssq*0]
+ mova m4, [bilin_h_perm16]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0], 1
+ vpermb m1, m4, m1
+ pmaddubsw m1, m5 ; 1 2
+ valignq m2, m1, m0, 4 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ vpmovuswb ym1, m1
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+ssq*0]
+ pmovzxbq m8, [pb_02461357]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m2, m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpermb m3, m4, [srcq+ssq*0]
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ vpermq m1, m8, m1
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ paddw m8, m8
+ pmulhw m8, m6
+ paddw m9, m9
+ pmulhw m9, m6
+ paddw m8, m0
+ pmulhrsw m8, m7
+ paddw m9, m1
+ pmulhrsw m9, m7
+ mova m0, m2
+ mova m1, m3
+ packuswb m8, m9
+ mova [dstq], m8
+ add dstq, dsq
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ movu m2, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ REPX {pshufb x, m4}, m0, m1, m2, m3
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, ssq
+ movu m8, [srcq+8*0]
+ movu m9, [srcq+8*1]
+ movu m10, [srcq+8*8]
+ movu m11, [srcq+8*9]
+ REPX {pshufb x, m4}, m8, m9, m10, m11
+ REPX {pmaddubsw x, m5}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ paddw m12, m12
+ pmulhw m12, m6
+ paddw m13, m13
+ pmulhw m13, m6
+ paddw m14, m14
+ pmulhw m14, m6
+ paddw m15, m15
+ pmulhw m15, m6
+ paddw m12, m0
+ pmulhrsw m12, m7
+ paddw m13, m1
+ pmulhrsw m13, m7
+ paddw m14, m2
+ pmulhrsw m14, m7
+ paddw m15, m3
+ pmulhrsw m15, m7
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+ packuswb m12, m13
+ packuswb m14, m15
+ mova [dstq+64*0], m12
+ mova [dstq+64*1], m14
+ add dstq, dsq
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+DECLARE_REG_TMP 3, 5, 6
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xmm0, [srcq+strideq*0]
+ pinsrd xmm0, [srcq+strideq*1], 1
+ pinsrd xmm0, [srcq+strideq*2], 2
+ pinsrd xmm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw ym0, xmm0
+ psllw ym0, 4
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti128 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pmovzxbw m0, ym0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu xmm0, [srcq+strideq*0]
+ vinserti128 ym0, ymm0, [srcq+strideq*1], 1
+ movu xmm1, [srcq+strideq*2]
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, ym0
+ pmovzxbw m1, ym1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+strideq*0+32*0]
+ pmovzxbw m1, [srcq+strideq*0+32*1]
+ pmovzxbw m2, [srcq+strideq*1+32*0]
+ pmovzxbw m3, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+32*0]
+ pmovzxbw m1, [srcq+32*1]
+ pmovzxbw m2, [srcq+32*2]
+ pmovzxbw m3, [srcq+32*3]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
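+ ; prep stores unrounded 16*pel-scale intermediates (the raw pmaddubsw
+ ; result), so unlike put_bilin no shift or rounding is applied here; the
+ ; weight packing itself is the same as above.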
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+.h_w8_loop:
+ movu xmm0, [srcq+strideq*0]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpermb m0, m4, m0
+ vpermb m1, m4, m1
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m4, [bilin_h_perm32]
+.h_w32_loop:
+ vpermb m0, m4, [srcq+strideq*0]
+ vpermb m1, m4, [srcq+strideq*1]
+ vpermb m2, m4, [srcq+strideq*2]
+ vpermb m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .h_w32_loop
+ RET
+.h_w64:
+ mova m4, [bilin_h_perm32]
+.h_w64_loop:
+ vpermb m0, m4, [srcq+strideq*0+32*0]
+ vpermb m1, m4, [srcq+strideq*0+32*1]
+ vpermb m2, m4, [srcq+strideq*1+32*0]
+ vpermb m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .h_w64_loop
+ RET
+.h_w128:
+ mova m4, [bilin_h_perm32]
+.h_w128_loop:
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .h_w128_loop
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, mxyd
+ jmp wq
+.v_w4:
+ vpbroadcastd xm0, [srcq+strideq*0]
+ mov r3d, 0x29
+ vbroadcasti32x4 ym3, [bilin_v_shuf4]
+ kmovb k1, r3d
+.v_w4_loop:
+ vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
+ vpbroadcastd ym2, [srcq+strideq*2]
+ vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ym0, [srcq+strideq*0]
+ punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
+ pshufb ym2, ym3
+ pmaddubsw ym2, ym6
+ mova [tmpq], ym2
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ mova m5, [bilin_v_perm8]
+ vbroadcasti32x4 ym0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpbroadcastq ym0, [srcq+strideq*2]
+ vinserti32x4 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym0, [srcq+strideq*0], 0
+ vpermt2b m1, m5, m0
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mova m5, [bilin_v_perm16]
+ movu xm0, [srcq+strideq*0]
+.v_w16_loop:
+ movu xm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpermt2b m1, m5, m2
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ vpermt2b m2, m5, m0
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m5, [bilin_v_perm32]
+ movu ym0, [srcq+strideq*0]
+.v_w32_loop:
+ movu ym2, [srcq+strideq*1]
+ movu ym3, [srcq+strideq*2]
+ movu ym4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpermt2b m0, m5, m2
+ vpermt2b m2, m5, m3
+ vpermt2b m3, m5, m4
+ pmaddubsw m1, m0, m6
+ movu ym0, [srcq+strideq*0]
+ vpermt2b m4, m5, m0
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m4
+ add tmpq, 64*4
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0]
+.v_w64_loop:
+ vpermq m1, m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m1, m0
+ punpckhbw m2, m1, m0
+ vpermq m0, m5, [srcq+strideq*0]
+ punpcklbw m3, m0, m1
+ punpckhbw m1, m0, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m1
+ add tmpq, 64*4
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+.v_w128_loop:
+ vpermq m2, m5, [srcq+strideq*1+ 0]
+ vpermq m3, m5, [srcq+strideq*1+64]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m0, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m0
+ punpcklbw m4, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*2], m4
+ mova [tmpq+64*3], m1
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+ punpcklbw m4, m0, m2
+ punpckhbw m2, m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*4], m4
+ mova [tmpq+64*5], m2
+ punpcklbw m4, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m3, m6
+ mova [tmpq+64*6], m4
+ mova [tmpq+64*7], m3
+ add tmpq, 64*8
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
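+ ; Since the inputs are already 16*pel-scale words, a single pmulhrsw by
+ ; (my << 11) produces the rounded ((my * diff) + 8) >> 4 term directly,
+ ; which is then added back to the previous row.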
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m6, mxyd
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+ vpbroadcastq ym0, [srcq+strideq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w4_loop:
+ movq xmm1, [srcq+strideq*1]
+ movq xmm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1
+ punpcklqdq ym1, ym2
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym6
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xmm1, [srcq+strideq*1]
+ vinserti128 ym1, ymm1, [srcq+strideq*2], 1
+ vinserti128 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m1, [srcq+strideq*0], 3
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mova m4, [bilin_h_perm16]
+ vbroadcasti32x8 m0, [srcq+strideq*0]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1]
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ movu ym2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti32x8 m2, [srcq+strideq*0], 1
+ vpermb m1, m4, m1
+ vpermb m2, m4, m2
+ pmaddubsw m1, m5 ; 1 2
+ vshufi32x4 m3, m0, m1, q1032 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vshufi32x4 m2, m1, m0, q1032 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ sub hd, 4
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+strideq*0]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m1, m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermb m2, m4, [srcq+strideq*0]
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+64*0], m3
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, strideq
+ vpermb m2, m4, [srcq+32*0]
+ vpermb m3, m4, [srcq+32*1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m7, m2, m0
+ psubw m8, m3, m1
+ pmulhrsw m7, m6
+ pmulhrsw m8, m6
+ paddw m7, m0
+ mova m0, m2
+ paddw m8, m1
+ mova m1, m3
+ mova [tmpq+64*0], m7
+ mova [tmpq+64*1], m8
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ vpermb m7, m4, [srcq+32*0]
+ vpermb m8, m4, [srcq+32*1]
+ vpermb m9, m4, [srcq+32*2]
+ vpermb m10, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m7, m8, m9, m10
+ psubw m11, m7, m0
+ psubw m12, m8, m1
+ psubw m13, m9, m2
+ psubw m14, m10, m3
+ REPX {pmulhrsw x, m6}, m11, m12, m13, m14
+ paddw m11, m0
+ mova m0, m7
+ paddw m12, m1
+ mova m1, m8
+ paddw m13, m2
+ mova m2, m9
+ paddw m14, m3
+ mova m3, m10
+ mova [tmpq+64*0], m11
+ mova [tmpq+64*1], m12
+ mova [tmpq+64*2], m13
+ mova [tmpq+64*3], m14
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
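+ ; Each FILTER_* value packs two row offsets into mc_subpel_filters (15
+ ; 8-byte entries per filter set): the high word selects the 8-tap set, the
+ ; low word the 4-tap set used for small block sizes (sharp reuses the
+ ; regular 4-tap offset, 3*15). put/prep_8tap add this to mx*0x010101 and
+ ; my*0x010101, leaving the 4-tap row index in the low byte and the 8-tap
+ ; one in bits 16+, hence "movzx mxd, mxb" in the narrow paths and
+ ; "shr mxd, 16" in the wide ones.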
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
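+ ; PUT_8TAP_H: m6-m8 hold three overlapping source shuffles (vpermb patterns
+ ; when %5 is set), m9/m10 the two 4-coefficient halves of the filter, and
+ ; m5 the rounding bias (pd_34). Two vpdpbusd accumulator chains are merged
+ ; with packusdw and shifted right by 6 back to pixel range.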
+%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
+%if %5
+ vpermb m%2, m6, m%1
+ vpermb m%3, m7, m%1
+ vpermb m%4, m8, m%1
+%else
+%if %2 < %4 ; reuse a previous value if possible
+ pshufb m%2, m%1, m6
+%endif
+ pshufb m%3, m%1, m7
+ pshufb m%4, m%1, m8
+%endif
+ mova m%1, m5
+ vpdpbusd m%1, m%2, m9
+ mova m%2, m5
+ vpdpbusd m%2, m%3, m9
+ vpdpbusd m%1, m%3, m10
+ vpdpbusd m%2, m%4, m10
+ packusdw m%1, m%2
+ psrlw m%1, 6
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [base+mxq*8+subpel_filters+0]
+ vpbroadcastd m10, [base+mxq*8+subpel_filters+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xmm4, [subpel_h_shuf4]
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w2_loop:
+ movq xmm0, [srcq+ssq*0]
+ movhps xmm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xmm4
+ mova xmm1, xm5
+ vpdpbusd xmm1, xmm0, xmm3
+ packssdw xmm0, xmm1, xmm1
+ psraw xmm0, 6
+ packuswb xmm0, xm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
+.h_w4_loop:
+ movq xmm0, [srcq+ssq*0]
+ movq xmm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xmm0, xm6
+ pshufb xmm1, xm6
+ mova xmm2, xm5
+ vpdpbusd xmm2, xmm0, xmm3
+ mova xmm0, xm5
+ vpdpbusd xmm0, xmm1, xmm3
+ packssdw xmm0, xmm2, xmm0
+ psraw xmm0, 6
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ mova m6, [spel_h_perm16a]
+ mova m7, [spel_h_perm16b]
+ mova m8, [spel_h_perm16c]
+.h_w16_loop:
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3, 1
+ vpmovuswb ym0, m0
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ movu ym0, [srcq+ssq*0+8*0]
+ vinserti32x8 m0, [srcq+ssq*1+8*0], 1
+ movu ym1, [srcq+ssq*0+8*1]
+ vinserti32x8 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 4, 3, 2
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu m0, [srcq+8*0]
+ movu m2, [srcq+8*1]
+ movu m1, [srcq+8*8]
+ movu m3, [srcq+8*9]
+ add srcq, ssq
+ PUT_8TAP_H 0, 4, 11, 12
+ PUT_8TAP_H 2, 12, 11, 4
+ PUT_8TAP_H 1, 4, 11, 12
+ PUT_8TAP_H 3, 12, 11, 4
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [base+subpel_filters+myq*8]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrw xmm2, [srcq+ssq*1], 2
+ pinsrw xmm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklbw xmm3, xmm1 ; 45 56
+ punpcklbw xmm1, xmm2, xmm4 ; 01 12
+ punpckhbw xmm2, xmm4 ; 23 34
+.v_w4_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw xmm5, xmm1, xm8 ; a0 b0
+ mova xmm1, xmm2
+ pmaddubsw xmm2, xm9 ; a1 b1
+ paddw xmm5, xmm2
+ mova xmm2, xmm3
+ pmaddubsw xmm3, xm10 ; a2 b2
+ paddw xmm5, xmm3
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8
+ punpcklbw xmm3, xmm4 ; 67 78
+ pmaddubsw xmm4, xmm3, xm11 ; a3 b3
+ paddw xmm5, xmm4
+ pmulhrsw xmm5, xm7
+ packuswb xmm5, xmm5
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm5, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklbw ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm5, 0x30
+ vpblendd ymm5, ymm3, 0x30
+ punpcklbw ymm2, ymm5 ; 23 34
+ vpblendd ymm3, ymm4, 0x30
+ vpblendd ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 45 56
+.v_w8_loop:
+ vpbroadcastq ymm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw ymm5, ymm1, ym8 ; a0 b0
+ mova ymm1, ymm2
+ pmaddubsw ymm2, ym9 ; a1 b1
+ paddw ymm5, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym10 ; a2 b2
+ paddw ymm5, ymm3
+ vpblendd ymm3, ymm0, ymm4, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm4, ymm4, ymm0, 0x30
+ punpcklbw ymm3, ymm4 ; 67 78
+ pmaddubsw ymm4, ymm3, ym11 ; a3 b3
+ paddw ymm5, ymm4
+ pmulhrsw ymm5, ym7
+ vextracti128 xmm4, ymm5, 1
+ packuswb xmm5, xmm4
+ movq [dstq+dsq*0], xmm5
+ movhps [dstq+dsq*1], xmm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ vzeroupper
+ RET
+.v_w16:
+ mova m12, [spel_v_perm16]
+ vbroadcasti32x4 m1, [srcq+ssq*0]
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ mov r6d, 0x0f
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 ym5, [srcq+ssq*0]
+ kmovb k1, r6d
+ vbroadcasti32x4 m3, [srcq+ssq*1]
+ vbroadcasti32x4 ym6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m1{k1}, m4, m2, 0xcc
+ vshufpd m2{k1}, m5, m3, 0xcc
+ vshufpd m3{k1}, m6, m0, 0xcc
+ vpermb m1, m12, m1 ; 01 12
+ vpermb m2, m12, m2 ; 23 34
+ vpermb m3, m12, m3 ; 45 56
+.v_w16_loop:
+ pmaddubsw m4, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m5, m2, m9 ; a1 b1
+ mova m2, m3
+ pmaddubsw m6, m3, m10 ; a2 b2
+ mova m3, m0
+ paddw m4, m5
+ vbroadcasti32x4 ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m0, [srcq+ssq*0]
+ vshufpd m3{k1}, m5, m0, 0xcc
+ vpermb m3, m12, m3 ; 67 78
+ pmaddubsw m5, m3, m11 ; a3 b3
+ paddw m4, m6
+ paddw m4, m5
+ pmulhrsw m4, m7
+ vextracti32x8 ym5, m4, 1
+ packuswb ym4, ym5
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m12, [spel_v_perm32]
+ pmovzxbq m14, [pb_02461357]
+ vpshrdw m13, m12, m12, 8
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ vpermb m1, m12, m0 ; 01
+ vinserti32x8 m0, [srcq+ssq*2], 0
+ add srcq, ss3q
+ vpermb m2, m13, m0 ; 12
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ vpermb m3, m12, m0 ; 23
+ vinserti32x8 m0, [srcq+ssq*1], 0
+ vpermb m4, m13, m0 ; 34
+ vinserti32x8 m0, [srcq+ssq*2], 1
+ add srcq, ss3q
+ vpermb m5, m12, m0 ; 45
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ vpermb m6, m13, m0 ; 56
+.v_w32_loop:
+ vinserti32x8 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m15, m1, m8
+ mova m1, m3
+ pmaddubsw m16, m2, m8
+ mova m2, m4
+ pmaddubsw m17, m3, m9
+ mova m3, m5
+ pmaddubsw m18, m4, m9
+ mova m4, m6
+ pmaddubsw m19, m5, m10
+ vpermb m5, m12, m0 ; 67
+ vinserti32x8 m0, [srcq+ssq*0], 0
+ pmaddubsw m20, m6, m10
+ vpermb m6, m13, m0 ; 78
+ paddw m15, m17
+ pmaddubsw m17, m5, m11
+ paddw m16, m18
+ pmaddubsw m18, m6, m11
+ paddw m15, m19
+ paddw m16, m20
+ paddw m15, m17
+ paddw m16, m18
+ pmulhrsw m15, m7
+ pmulhrsw m16, m7
+ packuswb m15, m16
+ vpermq m15, m14, m15
+ mova [dstq+dsq*0], ym15
+ vextracti32x8 [dstq+dsq*1], m15, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+.v_w128:
+ lea r6d, [hq+wq*4-256]
+ mov r4, srcq
+ mov r7, dstq
+.v_loop0:
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m13, [srcq+ssq*0]
+ movu m15, [srcq+ssq*1]
+ movu m17, [srcq+ssq*2]
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m2, m4 ; 01l
+ punpckhbw m2, m4 ; 01h
+ punpcklbw m3, m4, m6 ; 12l
+ punpckhbw m4, m6 ; 12h
+ punpcklbw m5, m6, m13 ; 23l
+ punpckhbw m6, m13 ; 23h
+ punpcklbw m12, m13, m15 ; 34l
+ punpckhbw m13, m15 ; 34h
+ punpcklbw m14, m15, m17 ; 45l
+ punpckhbw m15, m17 ; 45h
+ punpcklbw m16, m17, m0 ; 56l
+ punpckhbw m17, m0 ; 56h
+.v_loop:
+ pmaddubsw m18, m1, m8 ; a0l
+ mova m1, m5
+ pmaddubsw m19, m2, m8 ; a0h
+ mova m2, m6
+ pmaddubsw m20, m3, m8 ; b0l
+ mova m3, m12
+ pmaddubsw m21, m4, m8 ; b0h
+ mova m4, m13
+ pmaddubsw m5, m9 ; a1l
+ pmaddubsw m6, m9 ; a1h
+ pmaddubsw m12, m9 ; b1l
+ pmaddubsw m13, m9 ; b1h
+ paddw m18, m5
+ mova m5, m14
+ pmaddubsw m14, m10 ; a2l
+ paddw m19, m6
+ mova m6, m15
+ pmaddubsw m15, m10 ; a2h
+ paddw m20, m12
+ mova m12, m16
+ pmaddubsw m16, m10 ; b2l
+ paddw m21, m13
+ mova m13, m17
+ pmaddubsw m17, m10 ; b2h
+ paddw m18, m14
+ paddw m19, m15
+ paddw m20, m16
+ paddw m21, m17
+ movu m17, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m14, m0, m17 ; 67l
+ punpckhbw m15, m0, m17 ; 67h
+ pmaddubsw m16, m14, m11 ; a3l
+ pmaddubsw m0, m15, m11 ; a3h
+ paddw m18, m16
+ paddw m19, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m16, m17, m0 ; 78l
+ punpckhbw m17, m0 ; 78h
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*0], m18
+ pmaddubsw m18, m16, m11 ; b3l
+ pmaddubsw m19, m17, m11 ; b3h
+ paddw m18, m20
+ paddw m19, m21
+ pmulhrsw m18, m7
+ pmulhrsw m19, m7
+ packuswb m18, m19
+ mova [dstq+dsq*1], m18
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 256
+ jg .v_loop0
+ vzeroupper
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq ym0, [base+subpel_filters+myq*8]
+ lea ss3q, [ssq*3]
+ vpbroadcastd ym9, [pd_32768]
+ mov r6, srcq
+ punpcklbw ym0, ym8, ym0
+ sub r6, ss3q
+ psraw ym0, 2 ; << 6
+ mova xm14, [spel_hv_end]
+ pshufd ym10, ym0, q0000
+ pshufd ym11, ym0, q1111
+ pshufd ym12, ym0, q2222
+ pshufd ym13, ym0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 ym6, [subpel_h_shuf4]
+ movq xmm2, [r6+ssq*0]
+ movhps xmm2, [r6+ssq*1]
+ movq xmm0, [r6+ssq*2]
+ movhps xmm0, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq ymm1, [srcq+ssq*0]
+ vpblendd ymm2, ymm3, 0x30
+ vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _
+ vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5
+ pshufb ymm2, ym6
+ pshufb ymm0, ym6
+ mova ymm1, ym8
+ vpdpbusd ymm1, ymm2, ym7
+ mova ymm2, ym8
+ vpdpbusd ymm2, ymm0, ym7
+ packssdw ymm2, ymm1, ymm2
+ psraw ymm2, 2
+ vextracti128 xmm3, ymm2, 1
+ palignr xmm4, xmm3, xmm2, 4
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+ pshufd xmm0, xmm3, q2121
+ punpcklwd xmm3, xmm0 ; 45 56
+.hv_w2_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm4, [srcq+ssq*0]
+ mova xmm5, xm9
+ vpdpwssd xmm5, xmm1, xm10 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xmm2, xm11 ; a1 b1
+ pshufb xmm4, xm6
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xmm3, xm12 ; a2 b2
+ mova xmm3, xm8
+ vpdpbusd xmm3, xmm4, xm7
+ packssdw xmm4, xmm3, xmm3
+ psraw xmm4, 2
+ palignr xmm3, xmm4, xmm0, 12
+ mova xmm0, xmm4
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xmm3, xm13 ; a3 b3
+ packuswb xmm5, xmm5
+ pshufb xmm5, xm14
+ pextrw [dstq+dsq*0], xmm5, 0
+ pextrw [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ vzeroupper
+ RET
+.hv_w4:
+ movq xmm1, [r6+ssq*0]
+ vpbroadcastq ym2, [r6+ssq*1]
+ vinserti32x4 ym1, ymm1, [r6+ssq*2], 1
+ vinserti32x4 m2, [srcq+ssq*0], 2
+ vinserti32x4 m1, [srcq+ssq*1], 2
+ vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5
+ vbroadcasti32x4 m6, [subpel_h_shufA]
+ add srcq, ss3q
+ vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6
+ pshufb m2, m6
+ pshufb m1, m6
+ mova m0, m8
+ vpdpbusd m0, m2, m7
+ mova m4, m8
+ vpdpbusd m4, m1, m7
+ mova ym1, [spel_hv_perm4a]
+ mova ym2, [spel_hv_perm4b]
+ mova ym3, [spel_hv_perm4c]
+ packssdw m0, m4
+ psraw m0, 2 ; _ 0 1 2 3 4 5 6
+ mov r6d, 0x5555
+ vpermb ym1, ym1, ym0 ; 01 12
+ vpermb m2, m2, m0 ; 23 34
+ vpermb m3, m3, m0 ; 45 56
+ kmovw k1, r6d
+ mova ym15, [spel_hv_perm4d]
+.hv_w4_loop:
+ movq xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1
+ mova ym5, ym9
+ vpdpwssd ym5, ym1, ym10 ; a0 b0
+ mova ym1, ym2
+ pshufb ym4, ym6
+ mova ym0, ym8
+ vpdpbusd ym0, ym4, ym7
+ vpdpwssd ym5, ym2, ym11 ; a1 b1
+ mova ym2, ym3
+ vpdpwssd ym5, ym3, ym12 ; a2 b2
+ vpsraw ym3{k1}, ym0, 2 ; 7 8
+ vpermb ym3, ym15, ym3 ; 67 78
+ vpdpwssd ym5, ym3, ym13 ; a3 b3
+ packuswb ym5, ym5
+ vpermb ym5, ym14, ym5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
+ vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m8, [pd_2]
+ vpbroadcastq m0, [base+subpel_filters+myq*8]
+ vpbroadcastd m9, [pd_32768]
+ punpcklbw m0, m8, m0
+ lea ss3q, [ssq*3]
+ psraw m0, 2 ; << 6
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ cmp wd, 8
+ jne .hv_w16
+ mov r6, srcq
+ sub r6, ss3q
+ movu xmm1, [r6+ssq*0]
+ vinserti128 ymm1, [r6+ssq*1], 1
+ movu xmm2, [srcq+ssq*1]
+ vinserti32x4 m6, zmm1, [r6+ssq*2], 2
+ vinserti128 ymm2, [srcq+ssq*2], 1
+ vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3
+ add srcq, ss3q
+ vbroadcasti32x4 m4, [subpel_h_shufA]
+ vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _
+ vbroadcasti32x4 m7, [subpel_h_shufB]
+ vbroadcasti32x4 m17, [subpel_h_shufC]
+ pshufb m1, m6, m4 ; 0 1 2 3 0123
+ mova m2, m8
+ vpdpbusd m2, m1, m10
+ pshufb m5, m6, m7 ; 0 1 2 3 4567
+ mova m1, m8
+ vpdpbusd m1, m5, m10
+ pshufb m4, m0, m4 ; 4 5 6 _ 0123
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ pshufb m7, m0, m7 ; 4 5 6 _ 4567
+ mova m4, m8
+ vpdpbusd m4, m7, m10
+ pshufb m6, m17
+ vpdpbusd m2, m5, m11
+ vpdpbusd m1, m6, m11
+ pshufb m6, m0, m17
+ vpdpbusd m3, m7, m11
+ vpdpbusd m4, m6, m11
+ mova m5, [spel_hv_perm8a]
+ mova m0, [spel_hv_perm8b]
+ mov r6, 0x55555555ff00
+ packssdw m2, m1
+ packssdw m3, m4
+ mova m18, [spel_hv_perm8c]
+ psraw m2, 2 ; 0 1 2 3
+ psraw m3, 2 ; 4 5 6 _
+ vpermb m1, m5, m2 ; 01 12
+ vbroadcasti32x8 m6, [subpel_h_shufA]
+ kmovq k1, r6
+ vpermt2b m2, m0, m3 ; 23 34
+ vbroadcasti32x8 m7, [subpel_h_shufB]
+ kshiftrq k2, k1, 16
+ mova xm16, [spel_hv_end]
+ vpermb m3, m5, m3 ; 45 56
+.hv_w8_loop:
+ vbroadcasti32x4 ym4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti32x4 m4{k1}, [srcq+ssq*0]
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0 b0
+ pshufb m1, m4, m6 ; 7 8 0123 4567
+ mova m5, m8
+ vpdpbusd m5, m1, m10
+ pshufb m4, m7 ; 7 8 4567 89ab
+ vpdpwssd m0, m2, m13 ; a1 b1
+ mova m1, m2
+ vpdpbusd m5, m4, m11
+ mova m2, m3
+ vpdpwssd m0, m3, m14 ; a2 b2
+ psraw m3{k2}, m5, 2 ; 75 86
+ vpermb m3, m18, m3 ; 67 78
+ vpdpwssd m0, m3, m15 ; a3 b3
+ packuswb m0, m0
+ vpermb zmm1, m16, m0
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ movu m7, [spel_hv_perm16a]
+ sub srcq, ss3q
+ mova m20, [spel_hv_perm16b]
+ lea r6d, [wq*2-32]
+ mova m21, [spel_hv_perm16c]
+ mov r4, srcq
+ mov r7, dstq
+ mova ym16, [spel_hv_end16]
+ lea r6d, [hq+r6*8]
+.hv_w16_loop0:
+ movu ym17, [srcq+ssq*0]
+ vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1
+ movu ym18, [srcq+ssq*2]
+ add srcq, ss3q
+ vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3
+ movu ym19, [srcq+ssq*1]
+ vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5
+ add srcq, ss3q
+ vpermb m2, m7, m17 ; 0 1 0123 89ab
+ vpermb m0, m20, m17 ; 0 1 4567 cdef
+ vpermb m4, m7, m18 ; 2 3 0123 89ab
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpermb m5, m20, m18 ; 2 3 4567 cdef
+ mova m2, m8
+ vpdpbusd m2, m0, m10
+ vpermb m17, m21, m17 ; 0 1 89ab ghij
+ mova m3, m8
+ vpdpbusd m3, m4, m10
+ vpermb m6, m7, m19 ; 4 5 0123 89ab
+ mova m4, m8
+ vpdpbusd m4, m5, m10
+ vpermb m18, m21, m18 ; 2 3 89ab ghij
+ vpdpbusd m1, m0, m11
+ movu ym0, [srcq+ssq*0] ; 6
+ vpdpbusd m2, m17, m11
+ vpermb m17, m20, m19 ; 4 5 4567 cdef
+ vpdpbusd m3, m5, m11
+ mova m5, m8
+ vpdpbusd m5, m6, m10
+ mova m6, m8
+ vpdpbusd m6, m17, m10
+ vpdpbusd m4, m18, m11
+ mova m18, [spel_hv_perm16d]
+ vpermb m18, m18, m0 ; 6 0145 2367 89cd abef
+ vpdpbusd m5, m17, m11
+ vpermb m19, m21, m19 ; 4 5 89ab ghij
+ mova m17, m8
+ vpdpbusd m17, m18, m10
+ mova m18, [spel_hv_perm16e]
+ vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij
+ packssdw m1, m2 ; 01
+ vpdpbusd m6, m19, m11
+ packssdw m3, m4 ; 23
+ vpdpbusd m17, m0, m11
+ psraw m1, 2
+ packssdw m5, m6 ; 45
+ psraw m3, 2
+ vpshrdd m2, m1, m3, 16 ; 12
+ psraw m5, 2
+ vpshrdd m4, m3, m5, 16 ; 34
+ psraw m17, 2
+ vpshrdd m6, m5, m17, 16 ; 56
+.hv_w16_loop:
+ movu ym18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m18, [srcq+ssq*0], 1
+ mova m0, m9
+ vpdpwssd m0, m1, m12 ; a0
+ vpermb m1, m7, m18 ; 7 8 0123 89ab
+ mova m17, m9
+ vpdpwssd m17, m2, m12 ; b0
+ vpermb m2, m20, m18 ; 7 8 4567 cdef
+ mova m19, m8
+ vpdpbusd m19, m1, m10
+ vpermb m18, m21, m18
+ mova m1, m8
+ vpdpbusd m1, m2, m10
+ vpdpwssd m0, m3, m13 ; a1
+ vpdpwssd m17, m4, m13 ; b1
+ vpdpbusd m19, m2, m11
+ mova m2, m4
+ vpdpbusd m1, m18, m11
+ mova m4, m6
+ vpdpwssd m0, m5, m14 ; a2
+ vpdpwssd m17, m6, m14 ; b2
+ packssdw m19, m1
+ mova m1, m3
+ mova m3, m5
+ psraw m6, m19, 2 ; 7 8
+ vpshrdd m5, m4, m6, 16 ; 6 7
+ vpdpwssd m17, m6, m15 ; b3
+ vpdpwssd m0, m5, m15 ; a3
+ packuswb m0, m17
+ vpermb zmm1, m16, m0
+ mova [dstq+dsq*0], xmm1
+ vextracti128 [dstq+dsq*1], ymm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w16_loop0
+ vzeroupper
+ RET
+
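+ ; PREP_8TAP_H: like PUT_8TAP_H but for prep; m5-m7 hold the spel_h_perm*
+ ; patterns, m8/m9 the two filter halves and m4 the pd_2 bias, and the
+ ; results are kept as 16-bit intermediates (psraw 2) written to tmpq.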
+%macro PREP_8TAP_H 0
+ vpermb m10, m5, m0
+ vpermb m11, m5, m1
+ vpermb m12, m6, m0
+ vpermb m13, m6, m1
+ vpermb m14, m7, m0
+ vpermb m15, m7, m1
+ mova m0, m4
+ vpdpbusd m0, m10, m8
+ mova m2, m4
+ vpdpbusd m2, m12, m8
+ mova m1, m4
+ vpdpbusd m1, m11, m8
+ mova m3, m4
+ vpdpbusd m3, m13, m8
+ vpdpbusd m0, m12, m9
+ vpdpbusd m2, m14, m9
+ vpdpbusd m1, m13, m9
+ vpdpbusd m3, m15, m9
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pd_2]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ vbroadcasti128 ym5, [subpel_h_shufA]
+ mov r3d, 0x4
+ dec srcq
+ vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ kmovb k1, r3d
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm2, [srcq+strideq*0]
+ movq xm3, [srcq+strideq*1]
+ vpbroadcastq ym2{k1}, [srcq+strideq*2]
+ vpbroadcastq ym3{k1}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pshufb ym2, ym5
+ pshufb ym3, ym5
+ mova ym0, ym4
+ vpdpbusd ym0, ym2, ym6
+ mova ym1, ym4
+ vpdpbusd ym1, ym3, ym6
+ packssdw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu xmm3, [srcq+strideq*0]
+ vinserti128 ym3, ymm3, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*2], 2
+ vinserti128 m3, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ mova m0, m4
+ vpdpbusd m0, m1, m8
+ mova m1, m4
+ vpdpbusd m1, m2, m8
+ pshufb m3, m7
+ vpdpbusd m0, m2, m9
+ vpdpbusd m1, m3, m9
+ packssdw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m5, [spel_h_perm16a]
+ mova m6, [spel_h_perm16b]
+ mova m7, [spel_h_perm16c]
+ lea stride3q, [strideq*3]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+.h_w32_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ xor r6d, r6d
+ jmp .h_start
+.h_w128:
+ mov r6, -64*1
+.h_start:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu m0, [srcq+r6+32*0]
+ movu m1, [srcq+r6+32*1]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ add r6, 64
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only, having
+ tzcnt wd, wd
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep_avx512icl]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
+ add wq, r7
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ jmp wq
+.v_w4:
+ movd xmm0, [srcq+strideq*0]
+ vpbroadcastd ymm1, [srcq+strideq*2]
+ vpbroadcastd xmm2, [srcq+strideq*1]
+ vpbroadcastd ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd ymm0, [srcq+strideq*0]
+ vpbroadcastd ymm2, [srcq+strideq*1]
+ vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vbroadcasti128 ymm5, [deint_shuf4]
+ vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34
+ vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw ymm2, ymm3 ; 23 34 45 56
+.v_w4_loop:
+ pinsrd xmm0, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ymm3, [srcq+strideq*0]
+ vpbroadcastd ymm4, [srcq+strideq*1]
+ vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _
+ vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb ymm3, ymm5 ; 67 78 89 9a
+ pmaddubsw ymm4, ymm1, ym8
+ vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78
+ pmaddubsw ymm2, ym9
+ paddw ymm4, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym11
+ paddw ymm3, ymm4
+ pmaddubsw ymm4, ymm1, ym10
+ paddw ymm3, ymm4
+ pmulhrsw ymm3, ym7
+ mova [tmpq], ymm3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ mov r3d, 0xf044
+ kmovw k1, r3d
+ kshiftrw k2, k1, 8
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq ym1, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpbroadcastq m6, [srcq+strideq*2]
+ vmovdqa64 ym0{k1}, ym1
+ vmovdqa64 ym1{k1}, ym2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ punpcklbw ym0, ym1 ; 01 12 __ __
+ punpcklbw m2, m3 ; 23 34 23 34
+ punpcklbw m4, m5 ; 45 56 45 56
+ vmovdqa64 m0{k2}, m2 ; 01 12 23 34
+ vmovdqa64 m2{k2}, m4 ; 23 34 45 56
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m15, m2, m9
+ vpblendmq m0{k1}, m6, m1
+ vpblendmq m2{k1}, m1, m3
+ vpbroadcastq m6, [srcq+strideq*2]
+ paddw m14, m15
+ punpcklbw m2, m0, m2 ; 67 78 67 78
+ vpblendmq m12{k1}, m3, m5
+ vpblendmq m13{k1}, m5, m6
+ vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
+ punpcklbw m4, m12, m13 ; 89 9a 89 9a
+ vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
+ pmaddubsw m12, m0, m10
+ pmaddubsw m13, m2, m11
+ paddw m14, m12
+ paddw m14, m13
+ pmulhrsw m14, m7
+ mova [tmpq], m14
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mov r3d, 0xf0
+ kmovb k1, r3d
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ vmovdqa64 m0{k1}, m1
+ vmovdqa64 m1{k1}, m2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
+ shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
+ shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
+ shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
+ punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
+ punpcklbw m0, m1 ; 01a 01b 12a 12b
+ punpcklbw m4, m5 ; 45a 45b 56a 56b
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m5, [srcq+strideq*0]
+ vpblendmq m1{k1}, m6, m3
+ vmovdqa64 m3{k1}, m5
+ pmaddubsw m12, m0, m8
+ pmaddubsw m13, m2, m8
+ pmaddubsw m14, m2, m9
+ pmaddubsw m15, m4, m9
+ pmaddubsw m0, m4, m10
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ paddw m12, m14
+ paddw m13, m15
+ paddw m12, m0
+ vmovdqa64 m5{k1}, m2
+ vmovdqa64 m2{k1}, m6
+ mova m0, m4
+ shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
+ shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
+ punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
+ punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
+ pmaddubsw m14, m2, m10
+ pmaddubsw m15, m2, m11
+ paddw m13, m14
+ paddw m12, m15
+ pmaddubsw m14, m4, m11
+ paddw m13, m14
+ pmulhrsw m12, m7
+ pmulhrsw m13, m7
+ mova [tmpq+ 0], m12
+ mova [tmpq+64], m13
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m18, [bilin_v_perm64]
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ movu ym3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym4, [srcq+strideq*0]
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym6, [srcq+strideq*0]
+ vpermq m0, m18, m0
+ vpermq m1, m18, m1
+ vpermq m2, m18, m2
+ vpermq m3, m18, m3
+ vpermq m4, m18, m4
+ vpermq m5, m18, m5
+ vpermq m6, m18, m6
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+ punpcklbw m3, m4
+ punpcklbw m4, m5
+ punpcklbw m5, m6
+.v_w32_loop:
+ movu ym12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym13, [srcq+strideq*0]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m16, m2, m9
+ pmaddubsw m15, m1, m8
+ pmaddubsw m17, m3, m9
+ mova m0, m2
+ mova m1, m3
+ vpermq m12, m18, m12
+ vpermq m13, m18, m13
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m4, m10
+ pmaddubsw m17, m5, m10
+ punpcklbw m6, m12
+ punpcklbw m12, m13
+ mova m2, m4
+ mova m3, m5
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m6, m11
+ pmaddubsw m17, m12, m11
+ mova m4, m6
+ mova m5, m12
+ paddw m14, m16
+ paddw m15, m17
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova m6, m13
+ mova [tmpq+ 0], m14
+ mova [tmpq+64], m15
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+ mov wd, 64
+ jmp .v_start
+.v_w128:
+ mov wd, 128
+.v_start:
+ WIN64_SPILL_XMM 27
+ mova m26, [bilin_v_perm64]
+ lea r6d, [hq+wq*2]
+ mov r5, srcq
+ mov r7, tmpq
+.v_loop0:
+ vpermq m0, m26, [srcq+strideq*0]
+ vpermq m1, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m2, m26, [srcq+strideq*0]
+ vpermq m3, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m4, m26, [srcq+strideq*0]
+ vpermq m5, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m6, m26, [srcq+strideq*0]
+ punpckhbw m12, m0, m1
+ punpcklbw m0, m1
+ punpckhbw m13, m1, m2
+ punpcklbw m1, m2
+ punpckhbw m14, m2, m3
+ punpcklbw m2, m3
+ punpckhbw m15, m3, m4
+ punpcklbw m3, m4
+ punpckhbw m16, m4, m5
+ punpcklbw m4, m5
+ punpckhbw m17, m5, m6
+ punpcklbw m5, m6
+.v_loop:
+ vpermq m18, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m19, m26, [srcq+strideq*0]
+ pmaddubsw m20, m0, m8
+ pmaddubsw m21, m12, m8
+ pmaddubsw m22, m1, m8
+ pmaddubsw m23, m13, m8
+ mova m0, m2
+ mova m12, m14
+ mova m1, m3
+ mova m13, m15
+ pmaddubsw m2, m9
+ pmaddubsw m14, m9
+ pmaddubsw m3, m9
+ pmaddubsw m15, m9
+ punpckhbw m24, m6, m18
+ punpcklbw m6, m18
+ paddw m20, m2
+ paddw m21, m14
+ paddw m22, m3
+ paddw m23, m15
+ mova m2, m4
+ mova m14, m16
+ mova m3, m5
+ mova m15, m17
+ pmaddubsw m4, m10
+ pmaddubsw m16, m10
+ pmaddubsw m5, m10
+ pmaddubsw m17, m10
+ punpckhbw m25, m18, m19
+ punpcklbw m18, m19
+ paddw m20, m4
+ paddw m21, m16
+ paddw m22, m5
+ paddw m23, m17
+ mova m4, m6
+ mova m16, m24
+ mova m5, m18
+ mova m17, m25
+ pmaddubsw m6, m11
+ pmaddubsw m24, m11
+ pmaddubsw m18, m11
+ pmaddubsw m25, m11
+ paddw m20, m6
+ paddw m21, m24
+ paddw m22, m18
+ paddw m23, m25
+ pmulhrsw m20, m7
+ pmulhrsw m21, m7
+ pmulhrsw m22, m7
+ pmulhrsw m23, m7
+ mova m6, m19
+ mova [tmpq+wq*0+ 0], m20
+ mova [tmpq+wq*0+64], m21
+ mova [tmpq+wq*2+ 0], m22
+ mova [tmpq+wq*2+64], m23
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 64
+ add r7, 128
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
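+ ; reset x86inc's spill bookkeeping from the earlier path so a fresh
+ ; WIN64_SPILL_XMM can be declared for the combined h+v filtering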
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ tzcnt wd, wd
+ vpbroadcastd m8, [pd_2]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
+ vpbroadcastd m9, [pd_32]
+ add wq, r7
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp wq
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mov r3d, 0x04
+ kmovb k1, r3d
+ kshiftlb k2, k1, 2
+ kshiftlb k3, k1, 4
+ vpbroadcastd m10, [pd_2]
+ vbroadcasti128 m16, [subpel_h_shufA]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ movq xm3, [srcq+strideq*0]
+ vpbroadcastq ym2, [srcq+strideq*1]
+ vpbroadcastq ym3{k1}, [srcq+strideq*2]
+ vpbroadcastq m2{k2}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3{k2}, [srcq+strideq*0]
+ vpbroadcastq m2{k3}, [srcq+strideq*1]
+ vpbroadcastq m3{k3}, [srcq+strideq*2]
+ mova m17, [spel_hv_perm4a]
+ movu m18, [spel_hv_perm4b]
+ mova m0, m10
+ mova m1, m10
+ pshufb m2, m16
+ pshufb m3, m16
+ vpdpbusd m0, m2, m8
+ vpdpbusd m1, m3, m8
+ packssdw m0, m1 ; _ 0 1 2 3 4 5 6
+ psraw m0, 2
+ vpermb m1, m17, m0 ; 01 12 23 34
+ vpermb m2, m18, m0 ; 23 34 45 56
+.hv_w4_loop:
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movq xm4, [srcq+strideq*0]
+ vpbroadcastq ym3{k1}, [srcq+strideq*1]
+ vpbroadcastq ym4{k1}, [srcq+strideq*2]
+ mova ym5, ym10
+ mova ym6, ym10
+ pshufb ym3, ym16
+ pshufb ym4, ym16
+ vpdpbusd ym5, ym3, ym8
+ vpdpbusd ym6, ym4, ym8
+ mova m7, m11
+ packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
+ psraw ym5, 2
+ valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
+ vpdpwssd m7, m1, m12
+ vpdpwssd m7, m2, m13
+ vpermb m1, m17, m0 ; 45 56 67 78
+ vpermb m2, m18, m0 ; 67 78 89 9a
+ vpdpwssd m7, m1, m14
+ vpdpwssd m7, m2, m15
+ psrad m7, 6
+ vpmovdw [tmpq], m7
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ WIN64_SPILL_XMM 24
+ vbroadcasti128 m16, [subpel_h_shufA]
+ vbroadcasti128 m17, [subpel_h_shufB]
+ vbroadcasti128 m18, [subpel_h_shufC]
+ vinserti128 ym0, [srcq+strideq*0], 1
+ vinserti128 m0, [srcq+strideq*1], 2
+ vinserti128 m0, [srcq+strideq*2], 3
+ movu xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym1, [srcq+strideq*0], 1
+ vinserti128 m1, [srcq+strideq*1], 2
+ vinserti128 m1, [srcq+strideq*2], 3
+ mova m2, m8
+ mova m4, m8
+ mova m3, m8
+ mova m5, m8
+ pshufb m20, m0, m16
+ pshufb m21, m0, m17
+ pshufb m22, m0, m18
+ pshufb m23, m1, m16
+ pshufb m6, m1, m17
+ pshufb m7, m1, m18
+ vpdpbusd m2, m20, m10
+ vpdpbusd m4, m21, m10
+ vpdpbusd m2, m21, m11
+ vpdpbusd m4, m22, m11
+ vpdpbusd m3, m23, m10
+ vpdpbusd m5, m6, m10
+ vpdpbusd m3, m6, m11
+ vpdpbusd m5, m7, m11
+ packssdw m2, m4
+ packssdw m3, m5
+ psraw m2, 2 ; _ 0 1 2
+ psraw m3, 2 ; 3 4 5 6
+ valignq m0, m3, m2, 2 ; 0 1 2 3
+ valignq m1, m3, m2, 4 ; 1 2 3 4
+ valignq m2, m3, m2, 6 ; 2 3 4 5
+ punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
+ punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
+ punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
+ punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
+.hv_w8_loop:
+ movu xm19, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym19, [srcq+strideq*0], 1
+ vinserti128 m19, [srcq+strideq*1], 2
+ vinserti128 m19, [srcq+strideq*2], 3
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m4, m12
+ vpdpwssd m21, m5, m12
+ vpdpwssd m20, m6, m13
+ vpdpwssd m21, m7, m13
+ pshufb m0, m19, m16
+ pshufb m1, m19, m17
+ pshufb m2, m19, m18
+ vpdpbusd m22, m0, m10
+ vpdpbusd m23, m1, m10
+ vpdpbusd m22, m1, m11
+ vpdpbusd m23, m2, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7 8 9 A
+ valignq m0, m22, m3, 2 ; 4 5 6 7
+ valignq m1, m22, m3, 4 ; 5 6 7 8
+ valignq m2, m22, m3, 6 ; 6 7 8 9
+ mova m3, m22
+ punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
+ punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
+ punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
+ punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq], m20
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mov wd, 16*2
+ jmp .hv_start
+.hv_w32:
+ mov wd, 32*2
+ jmp .hv_start
+.hv_w64:
+ mov wd, 64*2
+ jmp .hv_start
+.hv_w128:
+ mov wd, 128*2
+.hv_start:
+ WIN64_SPILL_XMM 31
+ mova m16, [spel_h_perm16a]
+ mova m17, [spel_h_perm16b]
+ mova m18, [spel_h_perm16c]
+ lea r6d, [hq+wq*8-256]
+ mov r5, srcq
+ mov r7, tmpq
+.hv_loop0:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym1, [srcq+strideq*0]
+ vinserti32x8 m1, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ vinserti32x8 m2, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym3, [srcq+strideq*0]
+ mova m4, m8
+ mova m5, m8
+ mova m6, m8
+ mova m7, m8
+ vpermb m19, m16, m0
+ vpermb m20, m17, m0
+ vpermb m21, m18, m0
+ vpermb m22, m16, m1
+ vpermb m23, m17, m1
+ vpermb m24, m18, m1
+ vpermb m25, m16, m2
+ vpermb m26, m17, m2
+ vpermb m27, m18, m2
+ vpermb ym28, ym16, ym3
+ vpermb ym29, ym17, ym3
+ vpermb ym30, ym18, ym3
+ mova m0, m8
+ mova m1, m8
+ mova ym2, ym8
+ mova ym3, ym8
+ vpdpbusd m4, m19, m10
+ vpdpbusd m5, m20, m10
+ vpdpbusd m6, m22, m10
+ vpdpbusd m7, m23, m10
+ vpdpbusd m0, m25, m10
+ vpdpbusd m1, m26, m10
+ vpdpbusd ym2, ym28, ym10
+ vpdpbusd ym3, ym29, ym10
+ vpdpbusd m4, m20, m11
+ vpdpbusd m5, m21, m11
+ vpdpbusd m6, m23, m11
+ vpdpbusd m7, m24, m11
+ vpdpbusd m0, m26, m11
+ vpdpbusd m1, m27, m11
+ vpdpbusd ym2, ym29, ym11
+ vpdpbusd ym3, ym30, ym11
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw ym2, ym3
+ psraw m4, 2 ; 0a 0b 1a 1b
+ psraw m6, 2 ; 2a 2b 3a 3b
+ psraw m0, 2 ; 4a 4b 5a 5b
+ psraw ym2, 2 ; 6a 6b __ __
+ vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
+ vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
+ vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
+ punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
+ punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
+ punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
+ punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
+ punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
+ punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
+.hv_loop:
+ movu ym19, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m19, [srcq+strideq*0], 1
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m2, m12
+ vpdpwssd m21, m3, m12
+ vpdpwssd m20, m4, m13
+ vpdpwssd m21, m5, m13
+ vpermb m24, m16, m19
+ vpermb m25, m17, m19
+ vpermb m26, m18, m19
+ vpdpbusd m22, m24, m10
+ vpdpbusd m23, m25, m10
+ vpdpbusd m22, m25, m11
+ vpdpbusd m23, m26, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7a 7b 8a 8b
+ vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
+ mova m2, m4
+ mova m3, m5
+ mova m1, m22
+ mova m4, m6
+ mova m5, m7
+ punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
+ punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq+wq*0], ym20
+ vextracti32x8 [tmpq+wq*1], m20, 1
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_loop0
+ RET
+
+cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
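+ ; same kernel as warp_affine_8x8 below, but stores the 16-bit intermediate
+ ; (prep) samples via the warp_8x8t_end permute instead of packed pixels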
+ vpbroadcastd m9, [pd_16384]
+ mova ym15, [warp_8x8t_end]
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
+ jmp .start
+.loop:
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
+ lea tmpq, [tmpq+tsq*4]
+.start:
+ paddd m16, m16
+ vpermb m16, m15, m16
+ mova [tmpq+tsq*0], xm16
+ vextracti128 [tmpq+tsq*2], ym16, 1
+ sub r6d, 0x1800
+ jg .loop
+ RET
+
+cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
+ vpbroadcastd m9, [pd_262144]
+ mova xm15, [warp_8x8_end]
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m16, 19
+ packuswb m16, m16
+ vpermb m16, m15, m16
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ sub r6d, 0x1800
+ jg .loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m1, [pd_512]
+%if WIN64
+ mov abcdq, r5mp
+ vpaddd ym18, ym1, r6m {1to8} ; mx
+%else
+ add r5d, 512
+ vpbroadcastd ym18, r5d
+%endif
+ vpaddd ym20, ym1, r7m {1to8} ; my
+ mova ym16, [pd_0to7]
+ vpbroadcastd ym19, [abcdq+4*0]
+ vpbroadcastd ym21, [abcdq+4*1]
+ lea r4, [ssq*3+3]
+ mova m10, [warp_8x8_permA]
+ mov r6d, 0x5555
+ mova m11, [warp_8x8_permB]
+ lea filterq, [mc_warp_filter+64*8]
+ vpbroadcastq m12, [warp_8x8_hpack]
+ sub srcq, r4 ; src -= src_stride*3 + 3
+ vbroadcasti32x4 m13, [warp_8x8_permC]
+ kxnorb k2, k2, k2
+ vbroadcasti32x4 m14, [warp_8x8_permD]
+ vpdpwssd ym18, ym19, ym16 ; alpha
+ vpdpwssd ym20, ym21, ym16 ; gamma
+ vbroadcasti32x4 m0, [srcq]
+ psrad ym19, 16 ; beta
+ psrad ym21, 16 ; delta
+ kmovw k1, r6d
+ psrad ym16, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0
+ psrld m1, 8 ; pd_2
+ pshufb m0, m11
+ paddd m8, m1, m1 ; pd_4
+ vpdpbusd m1, m0, m2
+ call .h
+ psllq m2, m1, 45
+ pslld m1, 13
+ paddd m1, m2
+ vpshrdq m1, m0, 48 ; 01 12
+ call .h
+ vpshrdq m2, m1, m0, 48 ; 23 34
+ call .h
+ vpshrdq m3, m2, m0, 48 ; 45 56
+.main2:
+ call .h
+ psrad ym17, ym20, 10
+ kmovb k2, k3
+ paddd ym20, ym21
+ vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0
+ psrad ym16, ym20, 10
+ kmovb k3, k2
+ paddd ym20, ym21
+ vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1
+ shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
+ mova m16, m9
+ pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1
+ vpdpwssd m16, m1, m4
+ pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3
+ mova m1, m2
+ vpdpwssd m16, m2, m5
+ shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
+ mova m2, m3
+ pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5
+ vpdpwssd m16, m3, m4
+ vpshrdq m3, m0, 48 ; 67 78
+ pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7
+ vpdpwssd m16, m3, m5
+ ret
+ALIGN function_align
+.h:
+ movu xm5, [srcq+ssq*1]
+ psrad ym16, ym18, 10
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ kmovb k2, k3
+ paddd ym18, ym19
+ vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1
+ psrad ym17, ym18, 10
+ kmovb k3, k2
+ paddd ym18, ym19
+ vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2
+ mova m0, m8
+ vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7
+ vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3
+ vpdpbusd m0, m4, m17
+ vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb
+ vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7
+ vpdpbusd m0, m5, m16
+ vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3)
+ ret
+
+%macro BIDIR_FN 1 ; op
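+ ; shared per-width store loops for avg/w_avg/mask: %1 computes a register
+ ; of packed output pixels in m0, %1_INC_PTR advances the tmp pointers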
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_ret:
+ RET
+.w4_h16:
+ vpbroadcastd m7, strided
+ pmulld m7, [bidir_sctr_w4]
+ %1 0
+ kxnorw k1, k1, k1
+ vpscatterdd [dstq+m7]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM %1 0
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ %1 0
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq ], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w16:
+ %1 0
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m7, [pb_02461357]
+.w32_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m7, [pb_02461357]
+.w64_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m7, [pb_02461357]
+.w128_loop:
+ %1 0
+ vpermq m6, m7, m0
+ %1 2
+ mova [dstq+64*0], m6
+ %1_INC_PTR 4
+ vpermq m6, m7, m0
+ mova [dstq+64*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ paddw m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
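+ ; Scaling the weight by 1 << 12 lets a single pmulhw (signed high-16
+ ; multiply) provide the ">> 16" of the last two forms; pmulhrsw with
+ ; pw_2048 then performs the final "+ 8 >> 4".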
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ psubw m2, m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ psubw m3, m1, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
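+ ; -m << 10 is split as ((b - a) << 1) * (-m << 9): the mask is negated,
+ ; doubled and moved to the high byte (punpcklbw with zero), so one pmulhw
+ ; provides the ">> 16"; pmulhrsw with pw_2048 does the final "+ 8 >> 4".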
+%if mmsize == 64
+ vpermq m3, m8, [maskq+%1*32]
+%else
+ vpermq m3, [maskq+%1*16], q3120
+%endif
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*32
+ add tmp2q, %1*64
+ add tmp1q, %1*64
+%endmacro
+
+cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ pxor m4, m4
+ mova m8, [base+bilin_v_perm64]
+ vpbroadcastd m5, [base+pw_2048]
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
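+ ; blends two blocks from tmp1/tmp2 into packed 8-bit pixels in m%1 and
+ ; leaves the per-pixel mask terms in m%2 for the callers to store (4:4:4)
+ ; or reduce to 4:2:2/4:2:0 chroma masks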
+ mova m%1, [tmp1q+mmsize*%3]
+ mova m1, [tmp2q+mmsize*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+mmsize*%4]
+ mova m2, [tmp2q+mmsize*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ vpshldw m%2, m3, 8
+ psllw m3, m%2, 10
+%if %5
+ psubb m%2, m5, m%2
+%endif
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
+cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pb_m64] ; -1 << 6
+ mova ym10, [base+wm_420_mask+32]
+ vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m5, [wm_420_perm4]
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm4+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ vpermb ym8, ym10, ym8
+ movq [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m11, strided
+ pmulld m11, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ vpdpbusd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vpscatterdd [dstq+m11]{k1}, m0
+ RET
+.w8:
+ mova m5, [wm_420_perm8]
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm8+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 16
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ vpermb m1, m10, m1
+ mova [maskq], xm1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16:
+ mova m5, [wm_420_perm16]
+.w16_loop:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
+ psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
+.w64_loop:
+ W_MASK 0, 4, 0, 2
+ W_MASK 11, 5, 1, 3
+ mova m2, m8
+ vpdpbusd m2, m4, m9
+ mova m3, m8
+ vpdpbusd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ mova m1, m0
+ vpermt2q m0, m12, m11
+ vpermt2q m1, m13, m11
+ mova [maskq], ym2
+ add maskq, 32
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m14, [wm_420_perm64]
+ mova m10, [wm_420_mask]
+ psrlq m15, m14, 4
+.w128_loop:
+ W_MASK 0, 12, 0, 4
+ W_MASK 11, 13, 1, 5
+ mova m4, m8
+ vpdpbusd m4, m12, m9
+ mova m5, m8
+ vpdpbusd m5, m13, m9
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*1+64*0], m1
+ W_MASK 0, 12, 2, 6
+ W_MASK 11, 13, 3, 7
+ vprold m4, 16
+ vprold m5, 16
+ vpdpbusd m4, m12, m9
+ vpdpbusd m5, m13, m9
+ add tmp1q, 512
+ add tmp2q, 512
+ vpermt2b m4, m10, m5
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0+64*1], m0
+ mova [dstq+strideq*1+64*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pw_m128]
+ mova m10, [base+wm_422_mask]
+ vpbroadcastd m11, [base+pb_127]
+ add wq, r7
+ vpbroadcastd m8, [base+wm_sign+4+r6*4]
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ pand xm8, xm11
+ mova [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m5, strided
+ pmulld m5, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpdpwssd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ pand ym8, ym11
+ mova [maskq], ym8
+ vpscatterdd [dstq+m5]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ pand xm8, xm11
+ mova [maskq], xm8
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ pand ym1, ym11
+ mova [maskq], ym1
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ pand ym1, ym11
+ mova [maskq], ym1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m5, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m13, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ W_MASK 12, 5, 2, 3
+ mova m2, m8
+ vpdpwssd m2, m4, m9
+ mova m3, m8
+ vpdpwssd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ vpermq m0, m13, m0
+ vpermq m1, m13, m12
+ pand m2, m11
+ mova [maskq], m2
+ add maskq, 64
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ mova m8, [base+wm_444_mask]
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.w4_end:
+ RET
+.w4_h16:
+ vpbroadcastd m9, strided
+ pmulld m9, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ kxnorw k1, k1, k1
+ mova [maskq], m4
+ vpscatterdd [dstq+m9]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti32x4 xm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ mova [maskq], m4
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ vpermq m0, m0, q3120
+ mova [maskq], m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m9, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m9, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m11, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1, 1
+ W_MASK 10, 9, 2, 3, 1
+ vpermb m4, m8, m4
+ vpermb m9, m8, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermq m0, m11, m0
+ vpermq m10, m11, m10
+ mova [maskq+64*0], m4
+ mova [maskq+64*1], m9
+ add maskq, 128
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m10
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx512icl_table
+ lea r6, [blend_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn maskq, maskmp
+ movifnidn hd, hm
+ movsxd wq, [r6+wq*4]
+ vpbroadcastd m6, [base+pb_64]
+ vpbroadcastd m7, [base+pw_512]
+ sub tmpq, maskq
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ vpbroadcastd xmm1, [dstq+dsq*2]
+ pinsrd xmm1, [dstq+r6 ], 3
+ mova xmm4, [maskq]
+ mova xmm5, [maskq+tmpq]
+ add maskq, 4*4
+ psubb xmm3, xm6, xmm4
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm2, xmm3, xmm4
+ punpckhbw xmm1, xmm5
+ punpckhbw xmm3, xmm4
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm7
+ pmulhrsw xmm1, xm7
+ packuswb xmm0, xmm1
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ pextrd [dstq+dsq*2], xmm0, 2
+ pextrd [dstq+r6 ], xmm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+.w8:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ vpbroadcastq ymm2, [dstq+dsq*2]
+ vpbroadcastq ymm3, [dstq+r6 ]
+ mova ymm4, [maskq]
+ mova ymm5, [maskq+tmpq]
+ add maskq, 8*4
+ vpblendd ymm0, ymm2, 0x30
+ vpblendd ymm1, ymm3, 0xc0
+ psubb ymm3, ym6, ymm4
+ punpcklbw ymm0, ymm5
+ punpcklbw ymm2, ymm3, ymm4
+ punpckhbw ymm1, ymm5
+ punpckhbw ymm3, ymm4
+ pmaddubsw ymm0, ymm2
+ pmaddubsw ymm1, ymm3
+ pmulhrsw ymm0, ym7
+ pmulhrsw ymm1, ym7
+ packuswb ymm0, ymm1
+ vextracti128 xmm1, ymm0, 1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ movq [dstq+dsq*2], xmm1
+ movhps [dstq+r6 ], xmm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ vzeroupper
+ RET
+.w16:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vinserti32x4 m1, [dstq+dsq*2], 2
+ mova m4, [maskq]
+ vinserti32x4 m1, [dstq+r6 ], 3
+ mova m5, [maskq+tmpq]
+ add maskq, 16*4
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ vextracti32x4 [dstq+dsq*2], m0, 2
+ vextracti32x4 [dstq+r6 ], m0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w16
+ RET
+.w32:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m4, [maskq]
+ mova m5, [maskq+tmpq]
+ add maskq, 32*2
+ psubb m3, m6, m4
+ punpcklbw m0, m1, m5
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m5
+ punpckhbw m3, m4
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx512icl_table
+ lea r5, [blend_v_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx512icl_table
+ jmp wq
+.w2:
+ vpbroadcastd xmm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+.w4:
+ vpbroadcastq xmm2, [maskq+4*2]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova xmm3, [maskq+8*2]
+.w8_loop:
+ movq xmm0, [dstq+dsq*0]
+ vpbroadcastq xmm1, [dstq+dsq*1]
+ mova xmm2, [tmpq]
+ add tmpq, 8*2
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm3
+ pmaddubsw xmm1, xmm3
+ pmulhrsw xmm0, xm5
+ pmulhrsw xmm1, xm5
+ packuswb xmm0, xmm1
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ vbroadcasti32x4 ym3, [maskq+16*2]
+ vbroadcasti32x4 ym4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym4
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32:
+ mova m4, [maskq+32*2]
+ vshufi32x4 m3, m4, m4, q2020
+ vshufi32x4 m4, m4, q3131
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+
+cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base r6-blend_h_avx512icl_table
+ lea r6, [blend_h_avx512icl_table]
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, [r6+wq*4]
+ lea maskq, [base+obmc_masks+hq*2]
+ vpbroadcastd m5, [base+pw_512]
+ lea hd, [hq*3]
+ add wq, r6
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xmm0, [dstq+dsq*0]
+ pinsrw xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movd xmm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xmm2, xmm2
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ pextrw [dstq+dsq*0], xmm0, 0
+ pextrw [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+ mova xmm3, [blend_shuf]
+.w4_loop:
+ movd xmm0, [dstq+dsq*0]
+ pinsrd xmm0, [dstq+dsq*1], 1
+ movd xmm2, [maskq+hq*2]
+ movq xmm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xmm2, xmm3
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm2
+ pmulhrsw xmm0, xm5
+ packuswb xmm0, xmm0
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ vbroadcasti128 ymm4, [blend_shuf]
+ shufpd ymm4, ymm4, 0x03
+.w8_loop:
+ vpbroadcastq ymm1, [dstq+dsq*0]
+ movq xmm0, [dstq+dsq*1]
+ vpblendd ymm0, ymm1, 0x30
+ vpbroadcastd ymm3, [maskq+hq*2]
+ movq xmm1, [tmpq+8*1]
+ vinserti128 ymm1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb ymm3, ymm4
+ punpcklbw ymm0, ymm1
+ pmaddubsw ymm0, ymm3
+ pmulhrsw ymm0, ym5
+ vextracti128 xmm1, ymm0, 1
+ packuswb xmm0, xmm1
+ movhps [dstq+dsq*0], xmm0
+ movq [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ vzeroupper
+ RET
+.w16:
+ vbroadcasti32x4 ym4, [blend_shuf]
+ shufpd ym4, ym4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti32x4 ym1, [dstq+dsq*1], 1
+ vpbroadcastd ym3, [maskq+hq*2]
+ mova ym2, [tmpq]
+ add tmpq, 16*2
+ pshufb ym3, ym4
+ punpcklbw ym0, ym1, ym2
+ punpckhbw ym1, ym2
+ pmaddubsw ym0, ym3
+ pmaddubsw ym1, ym3
+ pmulhrsw ym0, ym5
+ pmulhrsw ym1, ym5
+ packuswb ym0, ym1
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+.w32:
+ vbroadcasti32x4 m4, [blend_shuf]
+ shufpd m4, m4, 0xf0
+.w32_loop:
+ mova ym1, [dstq+dsq*0]
+ vinserti32x8 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w32_loop
+ RET
+.w64:
+ vpbroadcastw m3, [maskq+hq*2]
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ inc hq
+ jl .w64
+ RET
+.w128:
+ vpbroadcastw m6, [maskq+hq*2]
+ mova m2, [dstq+64*0]
+ mova m1, [tmpq+64*0]
+ mova m3, [dstq+64*1]
+ mova m4, [tmpq+64*1]
+ add tmpq, 64*2
+ punpcklbw m0, m2, m1
+ punpckhbw m2, m1
+ pmaddubsw m0, m6
+ pmaddubsw m2, m6
+ punpcklbw m1, m3, m4
+ punpckhbw m3, m4
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ REPX {pmulhrsw x, m5}, m0, m2, m1, m3
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, dsq
+ inc hq
+ jl .w128
+ RET
+
+cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ mov r6, ~0
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+ kmovq k3, r6
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+ vpbroadcastd m3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti32x4 m15, [base+pb_8x0_8x8]
+ vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
+ pslld m5, 4 ; dx*16
+ pslld m6, 14
+ pxor m2, m2
+ mova m16, [base+resize_permA]
+ mova m17, [base+resize_permB]
+ mova xm18, [base+resize_permC]
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ vptestmd k4, m1, m1
+ pand m9, m7 ; filter offset (masked)
+ ktestw k4, k4
+ jz .load
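+ ; at least one lane was clamped at a frame edge: gather 8-byte groups at
+ ; the clamped positions and shuffle with resize_shuf to replicate the
+ ; boundary pixels before filtering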
+ vextracti32x8 ym12, m0, 1
+ vextracti32x8 ym13, m1, 1
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m10{k1}, [srcq+ym0]
+ vpgatherdq m11{k2}, [srcq+ym12]
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdq m14{k1}, [base+resize_shuf+4+ym1]
+ vpgatherdq m0{k2}, [base+resize_shuf+4+ym13]
+ mova m12, m16
+ mova m13, m17
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m10, m14
+ pshufb m11, m0
+ vpermi2d m12, m10, m11
+ vpermi2d m13, m10, m11
+ jmp .filter
+.load:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m12{k1}, [srcq+m0+0]
+ vpgatherdd m13{k2}, [srcq+m0+4]
+.filter:
+ kmovq k1, k3
+ kmovq k2, k3
+ vpgatherdd m10{k1}, [base+resize_filter+m9*8+0]
+ vpgatherdd m11{k2}, [base+resize_filter+m9*8+4]
+ mova m14, m2
+ vpdpbusd m14, m12, m10
+ vpdpbusd m14, m13, m11
+ packssdw m14, m14
+ pmulhrsw m14, m3
+ packuswb m14, m14
+ vpermd m14, m18, m14
+ mova [dstq+xq], xm14
+ paddd m4, m5
+ add xd, 16
+ cmp xd, dst_wd
+ jl .loop_x
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm
new file mode 100644
index 0000000000..54939c647a
--- /dev/null
+++ b/third_party/dav1d/src/x86/mc_sse.asm
@@ -0,0 +1,9599 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+ ; 2 @4
+ db 45, 19, 64, 0
+ ; 4 @8
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8 @16
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16 @32
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32 @64
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
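+; Each byte pair above is (64 - m, m), so pmaddubsw against interleaved
+; (dst, tmp) bytes computes dst*(64-m) + tmp*m in a single instruction.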
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+rescale_mul: dd 0, 1, 2, 3
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+
+wm_420_sign: times 4 dw 258
+ times 4 dw 257
+wm_422_sign: times 8 db 128
+ times 8 db 127
+
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+bdct_lb_dw: times 4 db 0
+ times 4 db 4
+ times 4 db 8
+ times 4 db 12
+
+pb_64: times 16 db 64
+pw_m256: times 8 dw -256
+pw_1: times 8 dw 1
+pw_2: times 8 dw 2
+pw_8: times 8 dw 8
+pw_15: times 8 dw 15
+pw_26: times 8 dw 26
+pw_34: times 8 dw 34
+pw_512: times 8 dw 512
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+pw_6903: times 8 dw 6903
+pw_8192: times 8 dw 8192
+pd_32: times 4 dd 32
+pd_63: times 4 dd 63
+pd_512: times 4 dd 512
+pd_16384: times 4 dd 16384
+pd_32768: times 4 dd 32768
+pd_262144:times 4 dd 262144
+pd_0x3ff: times 4 dd 0x3ff
+pd_0x4000:times 4 dd 0x4000
+pq_0x40000000: times 2 dq 0x40000000
+
+const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
+ ; [-1, 0)
+ db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0
+ db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0
+ db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0
+ db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0
+ db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0
+ db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0
+ db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0
+ db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0
+ db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0
+ db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0
+ db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0
+ db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0
+ db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0
+ db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0
+ db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0
+ db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0
+ db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0
+ db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0
+ db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0
+ db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0
+ db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0
+ db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0
+ db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0
+ db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0
+ db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0
+ db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0
+ db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0
+ db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0
+ db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0
+ db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0
+ db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0
+ db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0
+ ; [0, 1)
+ db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0
+ db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0
+ db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1
+ db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1
+ db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1
+ db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1
+ db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1
+ db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1
+ db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2
+ db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2
+ db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2
+ db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2
+ db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2
+ db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2
+ db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2
+ db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2
+ db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2
+ db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2
+ db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2
+ db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2
+ db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2
+ db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2
+ db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2
+ db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2
+ db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2
+ db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1
+ db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2
+ db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1
+ db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1
+ db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1
+ db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0
+ db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0
+ ; [1, 2)
+ db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0
+ db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1
+ db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1
+ db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1
+ db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1
+ db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2
+ db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2
+ db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2
+ db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3
+ db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3
+ db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3
+ db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4
+ db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4
+ db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4
+ db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4
+ db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4
+ db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4
+ db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4
+ db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4
+ db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4
+ db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4
+ db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4
+ db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4
+ db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3
+ db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3
+ db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3
+ db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2
+ db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2
+ db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2
+ db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1
+ db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1
+ db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0
+ db 0, 0, 2, -1, 0, 0, 127, 0
+
+pw_258: times 2 dw 258
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
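+; (-8 because the subpel index used for addressing is 1-based: idx*8 - 8
+; selects row idx-1 of the 8-byte filter entries)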
+
+%macro BIDIR_JMP_TABLE 2-*
+ ;evaluated at definition time (in loop below)
+ %xdefine %1_%2_table (%%table - 2*%3)
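+ ; (-2*%3 makes [table + tzcnt(w)*4] land on the first entry for the
+ ; smallest width %3; the identity holds for the widths 2 and 4 used here)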
+ %xdefine %%base %1_%2_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+ ; dynamically generated label
+ %%table:
+ %rep %0 - 2 ; one entry per width argument
+ dd %%prefix %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep)
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)
+
+BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
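+ ; %1 = fn, %2 = filter, %3 = isa suffix, %4 = bitmask of tables to emit
+ ; (1 = .h_w*, 2 = .v_w*, 4 = .hv_w*), remaining args = block widths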
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
+
+%macro SCALED_JMP_TABLE 2-*
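+ ; emits three sub-tables per function: the generic scaled path plus dy1/dy2
+ ; variants, presumably used when the vertical step is exactly 1024 or 2048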
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
+%%table:
+ %rep %0 - 2
+ dw %%base %+ .w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_1024:
+ %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy1_w%3 - %%base
+ %rotate 1
+ %endrep
+ %rotate 2
+%%dy_2048:
+ %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
+ %rep %0 - 2
+ dw %%base %+ .dy2_w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+SECTION .text
+
+INIT_XMM ssse3
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ %define base t0-put_ssse3
+%else
+ DECLARE_REG_TMP 7
+ %define base 0
+%endif
+
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+ mov %1, dsm ; restore dsq
+ %endif
+%endmacro
+
+cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ LEA t0, put_ssse3
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
+ tzcnt wd, wm
+ mov hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [t0+wq*2+table_offset(put,)]
+ add wq, t0
+ RESTORE_DSQ_32 t0
+ jmp wq
+.put_w2:
+ movzx r4d, word [srcq+ssq*0]
+ movzx r6d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4w
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov r4d, [srcq+ssq*0]
+ mov r6d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], r4d
+ mov [dstq+dsq*1], r6d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movq [dstq+dsq*0], m0
+ movq [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+.put_w32:
+ movu m0, [srcq+ssq*0+16*0]
+ movu m1, [srcq+ssq*0+16*1]
+ movu m2, [srcq+ssq*1+16*0]
+ movu m3, [srcq+ssq*1+16*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+16*0], m0
+ mova [dstq+dsq*0+16*1], m1
+ mova [dstq+dsq*1+16*0], m2
+ mova [dstq+dsq*1+16*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ add srcq, ssq
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+16*0]
+ movu m1, [srcq+16*1]
+ movu m2, [srcq+16*2]
+ movu m3, [srcq+16*3]
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ mova [dstq+16*2], m2
+ mova [dstq+16*3], m3
+ movu m0, [srcq+16*4]
+ movu m1, [srcq+16*5]
+ movu m2, [srcq+16*6]
+ movu m3, [srcq+16*7]
+ mova [dstq+16*4], m0
+ mova [dstq+16*5], m1
+ mova [dstq+16*6], m2
+ mova [dstq+16*7], m3
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
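+ ; note: the imul/add below packs {16-mx, mx} into every byte pair of m5,
+ ; e.g. mx=4: 4*0x00ff00ff + 0x00100010 = 0x040c040c -> bytes {12, 4},
+ ; so pmaddubsw on shuffled {src[x], src[x+1]} pairs computes the sum above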
+ imul mxyd, 0x00ff00ff
+ mova m4, [base+bilin_h_shuf8]
+ mova m0, [base+bilin_h_shuf4]
+ add mxyd, 0x00100010
+ movd m5, mxyd
+ mov mxyd, r7m ; my
+ pshufd m5, m5, q0000
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+ mova m3, [base+pw_2048]
+ add wq, t0
+ movifnidn dsq, dsmp
+ jmp wq
+.h_w2:
+ pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+ movd m0, [srcq+ssq*0]
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpckldq m0, m1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movq m4, [srcq+ssq*0]
+ movhps m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m0
+ pmaddubsw m4, m5
+ pmulhrsw m4, m3
+ packuswb m4, m4
+ movd [dstq+dsq*0], m4
+ psrlq m4, 32
+ movd [dstq+dsq*1], m4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4
+ RET
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+mmsize*0+8*0]
+ movu m1, [srcq+mmsize*0+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+mmsize*1+8*0]
+ movu m2, [srcq+mmsize*1+8*1]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+16*0], m0
+ mova [dstq+16*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ mov r6, -16*3
+.h_w64_loop:
+ movu m0, [srcq+r6+16*3+8*0]
+ movu m1, [srcq+r6+16*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*3], m0
+ add r6, 16
+ jle .h_w64_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov r6, -16*7
+.h_w128_loop:
+ movu m0, [srcq+r6+16*7+8*0]
+ movu m1, [srcq+r6+16*7+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+r6+16*7], m0
+ add r6, 16
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0x00ff00ff
+ mova m5, [base+pw_2048]
+ add mxyd, 0x00100010
+ add wq, t0
+ movd m4, mxyd
+ pshufd m4, m4, q0000
+ movifnidn dsq, dsmp
+ jmp wq
+.v_w2:
+ movd m0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw m0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pshuflw m1, m0, q2301
+ pinsrw m0, [srcq+ssq*0], 0 ; 2 1
+ punpcklbw m1, m0
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd r6d, m1
+ mov [dstq+dsq*1], r6w
+ shr r6d, 16
+ mov [dstq+dsq*0], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd m0, [srcq+ssq*0]
+.v_w4_loop:
+ movd m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ movd m0, [srcq+ssq*0]
+ punpckldq m1, m2 ; 0 1
+ punpckldq m2, m0 ; 1 2
+ punpcklbw m1, m2
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+ssq*0]
+.v_w8_loop:
+ movq m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2
+ punpcklbw m2, m0
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+%macro PUT_BILIN_V_W16 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m1, m0
+ mova m2, m0
+ movu m0, [srcq+ssq*0]
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ pmulhrsw m2, m5
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+.v_w16:
+ PUT_BILIN_V_W16
+ RET
+.v_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .v_w16gt
+.v_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .v_w16gt
+.v_w32:
+ lea r6d, [hq+(1<<16)]
+.v_w16gt:
+ mov r4, srcq
+%if ARCH_X86_64
+ mov r7, dstq
+%endif
+.v_w16gt_loop:
+ PUT_BILIN_V_W16
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%else
+ mov dstq, dstmp
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstmp, dstq
+%endif
+ sub r6d, 1<<16
+ jg .v_w16gt
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
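+ ; note: the h coefficients are doubled (paddb m5, m5) and my is scaled by
+ ; 1<<11, so pmulhw (a*b >> 16) yields (my * diff) >> 4, while pavgw with
+ ; pw_15 halves the doubled value back and adds the +8 rounding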
+ movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ mova m7, [base+pw_15]
+ movd m6, mxyd
+ add wq, t0
+ pshuflw m6, m6, q0000
+ paddb m5, m5
+ punpcklqdq m6, m6
+ jmp wq
+.hv_w2:
+ RESTORE_DSQ_32 t0
+ movd m0, [srcq+ssq*0]
+ punpckldq m0, m0
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w2_loop:
+ movd m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ punpckldq m1, m2
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 _ 2 _
+ shufps m2, m0, m1, q1032 ; 0 _ 1 _
+ mova m0, m1
+ psubw m1, m2 ; 2 * (src[x + src_stride] - src[x])
+ pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
+ pavgw m2, m7 ; src[x] + 8
+ paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
+ psrlw m1, 4
+ packuswb m1, m1
+%if ARCH_X86_64
+ movq r6, m1
+%else
+ pshuflw m1, m1, q2020
+ movd r6d, m1
+%endif
+ mov [dstq+dsq*0], r6w
+ shr r6, gprsize*4
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ shufps m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhw m1, m6
+ pavgw m2, m7
+ paddw m1, m2
+ psrlw m1, 4
+ packuswb m1, m1
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ pmulhw m1, m6
+ pavgw m0, m7
+ paddw m1, m0
+ movu m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ psubw m3, m0, m2
+ pmulhw m3, m6
+ pavgw m2, m7
+ paddw m3, m2
+ psrlw m1, 4
+ psrlw m3, 4
+ packuswb m1, m3
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .hv_w16_start
+.hv_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w16_start
+.hv_w32:
+ lea r6d, [hq+(1<<16)]
+.hv_w16_start:
+ mov r4, srcq
+%if ARCH_X86_32
+ %define m8 [dstq]
+%else
+ mov r7, dstq
+%endif
+.hv_w16:
+ movifnidn dsq, dsmp
+%if WIN64
+ movaps r4m, m8
+%endif
+.hv_w16_loop0:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w16_loop:
+ add srcq, ssq
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova m8, m2
+ psubw m2, m0
+ pmulhw m2, m6
+ pavgw m0, m7
+ paddw m2, m0
+ mova m0, m3
+ psubw m3, m1
+ pmulhw m3, m6
+ pavgw m1, m7
+ paddw m3, m1
+ mova m1, m0
+ mova m0, m8
+ psrlw m2, 4
+ psrlw m3, 4
+ packuswb m2, m3
+ mova [dstq], m2
+ add dstq, dsmp
+ dec hd
+ jg .hv_w16_loop
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w16_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
+ RET
+
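+; helpers for the bilin prep code below; each provides an SSE2 fallback for an
+; SSSE3 instruction: PSHUFB_BILIN_H8/H4 replace the byte shuffle by shifting
+; the row one byte and interleaving, so each word holds a {src[x], src[x+1]}
+; byte pair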
+%macro PSHUFB_BILIN_H8 2 ; dst, src
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ psrldq %2, %1, 1
+ punpcklbw %1, %2
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ psrldq %2, %1, 1
+ punpckhbw %3, %1, %2
+ punpcklbw %1, %2
+ punpcklqdq %1, %3
+ %endif
+%endmacro
+
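+; SSE2 path: zero-extend the source bytes to words and use pmaddwd; the caller
+; supplies the coefficients as words rather than packed bytes, and %5 = 1
+; (re)initializes the zero register in %3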
+%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero
+ %if cpuflag(ssse3)
+ pmaddubsw %1, %2
+ %else
+ %if %5 == 1
+ pxor %3, %3
+ %endif
+ punpckhbw %4, %1, %3
+ punpcklbw %1, %1, %3
+ pmaddwd %4, %2
+ pmaddwd %1, %2
+ packssdw %1, %4
+ %endif
+%endmacro
+
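+; SSE2 path: interleave the source words with the rounding constant %4 and use
+; pmaddwd, so each dword becomes src*mul + rnd (the multiplier in %2 must hold
+; {mul, 1} word pairs), then shift right by %5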
+%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2
+ %else
+ punpckhwd %3, %1, %4
+ punpcklwd %1, %4
+ pmaddwd %3, %2
+ pmaddwd %1, %2
+ psrad %3, %5
+ psrad %1, %5
+ packssdw %1, %3
+ %endif
+%endmacro
+
+%macro PREP_BILIN 0
+%if ARCH_X86_32
+ %define base r6-prep%+SUFFIX
+%else
+ %define base 0
+%endif
+
+cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ LEA r6, prep%+SUFFIX
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+%if notcpuflag(ssse3)
+ add r6, prep_ssse3 - prep_sse2
+ jmp prep_ssse3
+%else
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ pxor m4, m4
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd m0, [srcq+strideq*0]
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklbw m0, m4
+ punpcklbw m2, m4
+ psllw m0, 4
+ psllw m2, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq m0, [srcq+strideq*0]
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu m1, [srcq+strideq*0]
+ movu m3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .prep_w16
+ RET
+.prep_w128:
+ mov r3, -128
+ jmp .prep_w32_start
+.prep_w64:
+ mov r3, -64
+ jmp .prep_w32_start
+.prep_w32:
+ mov r3, -32
+.prep_w32_start:
+ sub srcq, r3
+.prep_w32_vloop:
+ mov r6, r3
+.prep_w32_hloop:
+ movu m1, [srcq+r6+16*0]
+ movu m3, [srcq+r6+16*1]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add r6, 32
+ jl .prep_w32_hloop
+ add srcq, strideq
+ dec hd
+ jg .prep_w32_vloop
+ RET
+%endif
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
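+ ; note: on SSSE3 the imul/add packs byte pairs {16-mx, mx} as in put_bilin
+ ; above; on SSE2, mx*0xffff + 16 yields word pairs {16-mx, mx} instead
+ ; (e.g. mx=4: 4*0xffff + 16 = 0x0004000c) for the pmaddwd-based PMADDUBSW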
+%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
+ mova m4, [base+bilin_h_shuf8]
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ add mxyd, 16
+%endif
+ movd m5, mxyd
+ mov mxyd, r6m ; my
+ pshufd m5, m5, q0000
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+%if notcpuflag(ssse3)
+ WIN64_SPILL_XMM 8
+ pxor m6, m6
+%endif
+ add wq, r6
+ jmp wq
+.h_w4:
+%if cpuflag(ssse3)
+ mova m4, [base+bilin_h_shuf4]
+%endif
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq m0, [srcq+strideq*0]
+ movhps m0, [srcq+strideq*1]
+ movq m1, [srcq+strideq*2]
+ movhps m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H4 m0, m4, m2
+ PMADDUBSW m0, m5, m6, m2, 0
+ PSHUFB_BILIN_H4 m1, m4, m2
+ PMADDUBSW m1, m5, m6, m2, 0
+ mova [tmpq+0 ], m0
+ mova [tmpq+16], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ movu m2, [srcq+strideq*1+8*0]
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w128:
+ mov r3, -128
+ jmp .h_w32_start
+.h_w64:
+ mov r3, -64
+ jmp .h_w32_start
+.h_w32:
+ mov r3, -32
+.h_w32_start:
+ sub srcq, r3
+.h_w32_vloop:
+ mov r6, r3
+.h_w32_hloop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ movu m2, [srcq+r6+8*2]
+ movu m3, [srcq+r6+8*3]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ add r6, 32
+ jl .h_w32_hloop
+ add srcq, strideq
+ dec hd
+ jg .h_w32_vloop
+ RET
+.v:
+%if notcpuflag(ssse3)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 8
+%endif
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ pxor m6, m6
+ add mxyd, 16
+%endif
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd m5, mxyd
+ pshufd m5, m5, q0000
+ jmp wq
+.v_w4:
+ movd m0, [srcq+strideq*0]
+.v_w4_loop:
+ movd m1, [srcq+strideq*1]
+ movd m2, [srcq+strideq*2]
+ movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m0, m1
+ punpckldq m1, m2
+ punpcklbw m0, m1 ; 01 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m3
+ punpckldq m3, m0
+ punpcklbw m2, m3 ; 23 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq m0, [srcq+strideq*0]
+.v_w8_loop:
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
+ movq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m0, m1 ; 01
+ punpcklbw m1, m2 ; 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
+ movq m0, [srcq+strideq*0]
+ punpcklbw m2, m3 ; 23
+ punpcklbw m3, m0 ; 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m1
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
+ add tmpq, 16*4
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu m0, [srcq+strideq*0]
+.v_w16_loop:
+ movu m1, [srcq+strideq*1]
+ movu m2, [srcq+strideq*2]
+ movu m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m0, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0]
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*3], m1
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*4], m4
+ punpcklbw m4, m3, m0
+ punpckhbw m3, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*5], m2
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*6], m4
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w128:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 256
+ jmp .v_w32_start
+.v_w64:
+ lea r3d, [hq+(1<<8)]
+ mov r6d, 128
+ jmp .v_w32_start
+.v_w32:
+ xor r3d, r3d
+ mov r6d, 64
+.v_w32_start:
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+ mov r5, srcq
+.v_w32_hloop:
+ movu m0, [srcq+strideq*0+16*0]
+ movu m1, [srcq+strideq*0+16*1]
+.v_w32_vloop:
+ movu m2, [srcq+strideq*1+16*0]
+ movu m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0+16*0]
+ punpcklbw m4, m1, m3
+ punpckhbw m1, m3
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ mova [tmpq+16*3], m1
+ movu m1, [srcq+strideq*0+16*1]
+ add tmpq, r6
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ mova [tmpq+16*1], m2
+ punpcklbw m4, m3, m1
+ punpckhbw m3, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ mova [tmpq+16*3], m3
+ add tmpq, r6
+ sub hd, 2
+ jg .v_w32_vloop
+ add r5, 32
+ movzx hd, r3b
+ mov srcq, r5
+%if ARCH_X86_64
+ add r7, 16*4
+ mov tmpq, r7
+%else
+ mov tmpq, tmpmp
+ add tmpq, 16*4
+ mov tmpmp, tmpq
+%endif
+ sub r3d, 1<<8
+ jg .v_w32_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
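+ ; note: on SSSE3, my*0x08000800 puts my<<11 in every word, so pmulhrsw gives
+ ; ((my * diff) + 8) >> 4 directly; on SSE2, my|(1<<16) forms word pairs
+ ; {my, 1} for the pmaddwd-based PMULHRSW, with pw_8 as the rounding term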
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+%assign stack_offset stack_offset - stack_size_padded
+%if cpuflag(ssse3)
+ imul mxyd, 0x08000800
+ WIN64_SPILL_XMM 8
+%else
+ or mxyd, 1<<16
+ WIN64_SPILL_XMM 9
+ %if ARCH_X86_64
+ mova m8, [base+pw_8]
+ %else
+ %define m8 [base+pw_8]
+ %endif
+ pxor m7, m7
+%endif
+ movd m6, mxyd
+ add wq, r6
+ pshufd m6, m6, q0000
+ jmp wq
+.hv_w4:
+%if cpuflag(ssse3)
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+strideq*0]
+%else
+ movhps m0, [srcq+strideq*0]
+%endif
+ lea r3, [strideq*3]
+ PSHUFB_BILIN_H4 m0, m4, m3
+ PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
+.hv_w4_loop:
+ movq m1, [srcq+strideq*1]
+ movhps m1, [srcq+strideq*2]
+ movq m2, [srcq+r3 ]
+ lea srcq, [srcq+strideq*4]
+ movhps m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H4 m1, m4, m3
+ PSHUFB_BILIN_H4 m2, m4, m3
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
+ PMADDUBSW m2, m5, m7, m4, 0 ; 3 4
+ shufpd m0, m1, 0x01 ; 0 1
+ shufpd m3, m1, m2, 0x01 ; 2 3
+ psubw m1, m0
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m0
+ mova m0, m2
+ psubw m2, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
+ mova [tmpq+16*0], m1
+ mova [tmpq+16*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ movu m0, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0
+.hv_w8_loop:
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu m2, [srcq+strideq*0]
+ PSHUFB_BILIN_H8 m1, m4
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 1
+ PMADDUBSW m2, m5, m7, m4, 0 ; 2
+ psubw m3, m1, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+ mova m0, m2
+ psubw m2, m1
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m1
+ mova [tmpq+16*0], m3
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r5d, 256
+ jmp .hv_w16_start
+.hv_w64:
+ lea r3d, [hq+(3<<8)]
+ mov r5d, 128
+ jmp .hv_w16_start
+.hv_w32:
+ lea r3d, [hq+(1<<8)]
+ mov r5d, 64
+ jmp .hv_w16_start
+.hv_w16:
+ xor r3d, r3d
+ mov r5d, 32
+.hv_w16_start:
+%if ARCH_X86_64 || cpuflag(ssse3)
+ mov r6, srcq
+%endif
+%if ARCH_X86_64
+ %if WIN64
+ PUSH r7
+ %endif
+ mov r7, tmpq
+%endif
+.hv_w16_hloop:
+ movu m0, [srcq+strideq*0+8*0]
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m0, m4
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 0a
+ PMADDUBSW m1, m5, m7, m4, 0 ; 0b
+.hv_w16_vloop:
+ movu m2, [srcq+strideq*1+8*0]
+ PSHUFB_BILIN_H8 m2, m4
+ PMADDUBSW m2, m5, m7, m4, 0 ; 1a
+ psubw m3, m2, m0
+ PMULHRSW m3, m6, m4, m8, 4
+ paddw m3, m0
+ mova [tmpq+16*0], m3
+ movu m3, [srcq+strideq*1+8*1]
+ lea srcq, [srcq+strideq*2]
+ PSHUFB_BILIN_H8 m3, m4
+ PMADDUBSW m3, m5, m7, m4, 0 ; 1b
+ psubw m0, m3, m1
+ PMULHRSW m0, m6, m4, m8, 4
+ paddw m0, m1
+ mova [tmpq+16*1], m0
+ add tmpq, r5
+ movu m0, [srcq+strideq*0+8*0]
+ PSHUFB_BILIN_H8 m0, m4
+ PMADDUBSW m0, m5, m7, m4, 0 ; 2a
+ psubw m1, m0, m2
+ PMULHRSW m1, m6, m4, m8, 4
+ paddw m1, m2
+ mova [tmpq+16*0], m1
+ movu m1, [srcq+strideq*0+8*1]
+ PSHUFB_BILIN_H8 m1, m4
+ PMADDUBSW m1, m5, m7, m4, 0 ; 2b
+ psubw m2, m1, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
+ mova [tmpq+16*1], m2
+ add tmpq, r5
+ sub hd, 2
+ jg .hv_w16_vloop
+ movzx hd, r3b
+%if ARCH_X86_64
+ add r6, 16
+ add r7, 2*16
+ mov srcq, r6
+ mov tmpq, r7
+%elif cpuflag(ssse3)
+ mov tmpq, tmpm
+ add r6, 16
+ add tmpq, 2*16
+ mov srcq, r6
+ mov tmpm, tmpq
+%else
+ mov srcq, srcm
+ mov tmpq, tmpm
+ add srcq, 16
+ add tmpq, 2*16
+ mov srcm, srcq
+ mov tmpm, tmpq
+%endif
+ sub r3d, 1<<8
+ jg .hv_w16_hloop
+%if WIN64
+ POP r7
+%endif
+ RET
+%endmacro
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
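+; each constant packs two row offsets into subpel_filters: the 8-tap filter
+; set in the high 16 bits and the 4-tap set used for small block widths in
+; the low 16 bits (sharp presumably reuses the regular 4-tap set, hence 3*15)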
+
+%macro FN 4 ; prefix, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+FN put_8tap, sharp, SHARP, SHARP
+FN put_8tap, sharp_smooth, SHARP, SMOOTH
+FN put_8tap, smooth_sharp, SMOOTH, SHARP
+FN put_8tap, smooth, SMOOTH, SMOOTH
+FN put_8tap, sharp_regular, SHARP, REGULAR
+FN put_8tap, regular_sharp, REGULAR, SHARP
+FN put_8tap, smooth_regular, SMOOTH, REGULAR
+FN put_8tap, regular_smooth, REGULAR, SMOOTH
+FN put_8tap, regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+ %define base_reg r1
+ %define base base_reg-put_ssse3
+%else
+ %define base_reg r8
+ %define base 0
+%endif
+
+cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+%if ARCH_X86_64
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+%else
+ imul ssd, mym, 0x010101
+ add ssd, t1d ; 8tap_v, my, 4tap_v
+ mov srcq, srcm
+%endif
+ mov wd, wm
+ movifnidn hd, hm
+ LEA base_reg, put_ssse3
+ test mxd, 0xf00
+ jnz .h
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [base_reg+wq*2+table_offset(put,)]
+ add wq, base_reg
+; unfiltered path: jump into put_bilin's .put_w* copy loops via the mangled label table
+%assign stack_offset org_stack_offset
+ movifnidn dsq, dsmp
+ movifnidn ssq, ssmp
+%if WIN64
+ pop r8
+%endif
+ lea r6, [ssq*3]
+ jmp wq
+.h:
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .hv
+ movifnidn ssq, ssmp
+ WIN64_SPILL_XMM 12
+ cmp wd, 4
+ jl .h_w2
+ je .h_w4
+ tzcnt wd, wd
+%if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
+ movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ mova m7, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
+ add wq, base_reg
+ jmp wq
+.h_w2:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ mova m4, [base+subpel_h_shuf4]
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
+.h_w2_loop:
+ movq m0, [srcq+ssq*0]
+ movhps m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pmaddubsw m0, m3
+ phaddw m0, m0
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+ mova m6, [base+subpel_h_shufA]
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
+.h_w4_loop:
+ movq m0, [srcq+ssq*0] ; 1
+ movq m1, [srcq+ssq*1] ; 2
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m6 ; subpel_h_shufA
+ pshufb m1, m6 ; subpel_h_shufA
+ pmaddubsw m0, m3 ; subpel_filters
+ pmaddubsw m1, m3 ; subpel_filters
+ phaddw m0, m1
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ %if ARCH_X86_32
+ pshufb %2, %1, [base+subpel_h_shufB]
+ pshufb %3, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %2, %1, m11; subpel_h_shufB
+ pshufb %3, %1, m9 ; subpel_h_shufC
+ pshufb %1, m10 ; subpel_h_shufA
+ %endif
+ pmaddubsw %4, %2, m5 ; subpel +0 B0
+ pmaddubsw %2, m6 ; subpel +4 B4
+ pmaddubsw %3, m6 ; C4
+ pmaddubsw %1, m5 ; A0
+ paddw %3, %4 ; C4+B0
+ paddw %1, %2 ; A0+B4
+ phaddw %1, %3
+ paddw %1, m7 ; pw34
+ psraw %1, 6
+%endmacro
+.h_w8:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+%if ARCH_X86_32
+ movq [dstq], m0
+ add dstq, dsm
+ movhps [dstq], m0
+ add dstq, dsm
+%else
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w128:
+ mov r4, -16*7
+ jmp .h_w16_start
+.h_w64:
+ mov r4, -16*3
+ jmp .h_w16_start
+.h_w32:
+ mov r4, -16*1
+ jmp .h_w16_start
+.h_w16:
+ xor r4d, r4d
+.h_w16_start:
+ sub srcq, r4
+ sub dstq, r4
+.h_w16_loop_v:
+ mov r6, r4
+.h_w16_loop_h:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 16
+ jle .h_w16_loop_h
+ add srcq, ssq
+ add dstq, dsmp
+ dec hd
+ jg .h_w16_loop_v
+ RET
+.v:
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+%endif
+ tzcnt r6d, wd
+ movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+ punpcklwd m0, m0
+ mova m7, [base+pw_512]
+ add r6, base_reg
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
+ ALLOC_STACK -16*4
+%assign regs_used 7
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ lea ssq, [ssq*3]
+ sub srcq, ssq
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ mov dsq, [rstk+stack_offset+gprsize*2]
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ lea ss3q, [ssq*3]
+ pshufd m8, m0, q0000
+ sub srcq, ss3q
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+%endif
+ jmp r6
+.v_w2:
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+%else
+ movd m2, [srcq+ssq*2]
+ add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
+%endif
+ punpcklwd m1, m0 ; 0 1
+ punpcklwd m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpcklwd m2, m5 ; 2 3
+ punpcklwd m5, m3 ; 3 4
+ punpcklwd m3, m4 ; 4 5
+ punpcklwd m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
+.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ punpcklwd m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpcklwd m4, m0 ; 7 8
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movd r6d, m5
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+%if ARCH_X86_32
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*4+gprsize]
+ mov dstm, dstq
+%endif
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
+.v_w4_loop0:
+%endif
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+%else
+ movd m2, [srcq+ssq*2]
+ add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
+%endif
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m4 ; 4 5
+ punpckldq m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
+.v_w4_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+ssq*0]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ packuswb m5, m5
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+ sub r6d, 1<<16
+ jg .v_w4_loop0
+%endif
+ RET
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
+.v_w8_loop0:
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
+.v_w8_loop:
+ movq m13, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m14, m1, subpel0 ; a0
+ mova m1, m3
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ mova m12, m0
+ pmaddubsw m4, subpel1 ; b1
+ movq m0, [srcq+ssq*0]
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ pmaddubsw m5, subpel2 ; a2
+ mova m4, m6
+ pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m13 ; 67
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
+ paddw m15, m6
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ movq [dstq+dsq*0], m14
+ movhps [dstq+dsq*1], m14
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ add r4, 8
+ add r7, 8
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
+%if ARCH_X86_32
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ mov ssq, ssmp
+ lea r6, [ssq*3]
+ sub srcq, r6
+ %define base_reg r6
+ mov r6, r1; use as new base
+ %assign regs_used 2
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ mov dsq, [rstk+stack_offset+gprsize*2]
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+ ALLOC_STACK mmsize*14, 14
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ mova m8, [base+pw_8192]
+ mova m9, [base+pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+ cmp wd, 4
+ je .hv_w4
+.hv_w2:
+ mova m6, [base+subpel_h_shuf4]
+ movq m2, [srcq+ssq*0] ; 0
+ movhps m2, [srcq+ssq*1] ; 0 _ 1
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+ lea srcq, [srcq+ssq*2]
+ movq m0, [srcq+ssq*0] ; 2
+ movhps m0, [srcq+ssq*1] ; 2 _ 3
+ lea srcq, [srcq+ssq*2]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+ movq m0, [srcq+ssq*2] ; 2
+ add srcq, ss3q
+ movhps m0, [srcq+ssq*0] ; 2 _ 3
+%endif
+ pshufb m2, m6 ; 0 ~ 1 ~
+ pshufb m0, m6 ; 2 ~ 3 ~
+ pmaddubsw m2, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m2, m0 ; 0 1 2 3
+ pmulhrsw m2, w8192reg
+%if ARCH_X86_32
+ movq m3, [srcq+ssq*0] ; 4
+ movhps m3, [srcq+ssq*1] ; 4 _ 5
+ lea srcq, [srcq+ssq*2]
+%else
+ movq m3, [srcq+ssq*1] ; 4
+ movhps m3, [srcq+ssq*2] ; 4 _ 5
+ add srcq, ss3q
+%endif
+ movq m0, [srcq+ssq*0] ; 6
+ pshufb m3, m6 ; 4 ~ 5 ~
+ pshufb m0, m6 ; 6 ~
+ pmaddubsw m3, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m3, m0 ; 4 5 6 _
+ pmulhrsw m3, w8192reg
+ palignr m4, m3, m2, 4; V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
+ punpckhwd m2, m4 ; V 23 34 2 3 3 4
+ pshufd m0, m3, q2121; V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56 4 5 5 6
+.hv_w2_loop:
+ movq m4, [srcq+ssq*1] ; V 7
+ lea srcq, [srcq+ssq*2] ; V
+ movhps m4, [srcq+ssq*0] ; V 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2 ; V
+ pmaddwd m2, subpelv1 ; V a1 b1
+ paddd m5, m2 ; V
+ mova m2, m3 ; V
+ pmaddwd m3, subpelv2 ; a2 b2
+ phaddw m4, m4
+ pmulhrsw m4, w8192reg
+ paddd m5, m3 ; V
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; V 67 78
+ pmaddwd m4, m3, subpelv3 ; V a3 b3
+ paddd m5, d512reg
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+%undef w8192reg
+%undef d512reg
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+%macro SAVELINE_W4 3
+ mova [rsp+mmsize*hv4_line_%3_%2], %1
+%endmacro
+%macro RESTORELINE_W4 3
+ mova %1, [rsp+mmsize*hv4_line_%3_%2]
+%endmacro
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 0 _ _ _
+ movhps m5, [srcq+ssq*1] ; 0 _ 1 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 2 _ _ _
+ movhps m4, [srcq+ssq*1] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*2]
+%else
+ movq m4, [srcq+ssq*2] ; 2 _ _ _
+ movhps m4, [srcq+ss3q ] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ ;
+ ; lower shuffle
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 4 _ _ _
+ movhps m5, [srcq+ssq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 6 _ _ _
+ add srcq, ssq
+%else
+ movq m4, [srcq+ssq*2] ; 6 _ _ _
+ add srcq, ss3q
+%endif
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ ;process high
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ mova m6, [base+subpel_h_shuf4]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m5, 10
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ mova m6, [base+subpel_h_shuf4+16]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ lea srcq, [srcq+ssq*2]
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m4, m5, 10
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4 ; d -> w
+ packuswb m5, m5 ; w -> b
+ pshuflw m5, m5, q3120
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+%macro SAVELINE_W8 2
+ mova [rsp+hv8_line_%1*mmsize], %2
+%endmacro
+%macro RESTORELINE_W8 2
+ mova %2, [rsp+hv8_line_%1*mmsize]
+%endmacro
+ shr mxd, 16
+ sub srcq, 3
+%if ARCH_X86_32
+ %define base_reg r1
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 6
+ cmovs ssd, mxd
+ movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ mov ssq, ssmp
+ ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*13+gprsize*1]
+ %define dsm [rsp+mmsize*13+gprsize*2]
+ mov r6, [rstk+stack_offset+gprsize*2]
+ mov dsm, r6
+%endif
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ punpcklbw m5, m5
+ psraw m5, 8 ; sign-extend
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r6, [ssq*3]
+ mov dstm, dstq
+ sub srcq, r6
+%else
+ ALLOC_STACK 16*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea ss3q, [ssq*3]
+ mov r7, dstq
+ sub srcq, ss3q
+%endif
+ shl wd, 14
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
+.hv_w8_loop0:
+ movu m4, [srcq+ssq*0] ; 0 = _ _
+ movu m5, [srcq+ssq*1] ; 1 = _ _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+%endif
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ %if ARCH_X86_32
+ pshufb %3, %1, [base+subpel_h_shufB]
+ pshufb %4, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %3, %1, %6 ; subpel_h_shufB
+ pshufb %4, %1, %7 ; subpel_h_shufC
+ pshufb %1, %5 ; subpel_h_shufA
+ %endif
+ pmaddubsw %2, %3, subpelh0 ; subpel +0 B0
+ pmaddubsw %4, subpelh1; subpel +4 C4
+ pmaddubsw %3, subpelh1; B4
+ pmaddubsw %1, subpelh0; A0
+ paddw %2, %4 ; B0+C4
+ paddw %1, %3 ; A0+B4
+ phaddw %1, %2
+%endmacro
+%if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+%if ARCH_X86_32
+ movu m6, [srcq+ssq*0] ; 2 = _ _
+ movu m0, [srcq+ssq*1] ; 3 = _ _
+ lea srcq, [srcq+ssq*2]
+%else
+ movu m6, [srcq+ssq*2] ; 2 = _ _
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0] ; 3 = _ _
+%endif
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m4, m7 ; H pw_8192
+ pmulhrsw m5, m7 ; H pw_8192
+ pmulhrsw m6, m7 ; H pw_8192
+ pmulhrsw m0, m7 ; H pw_8192
+ punpcklwd m1, m4, m5 ; 0 1 ~
+ punpcklwd m2, m5, m6 ; 1 2 ~
+ punpcklwd m3, m6, m0 ; 2 3 ~
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+ mova m7, [base+subpel_h_shufA]
+%if ARCH_X86_32
+ movu m4, [srcq+ssq*0] ; 4 = _ _
+ movu m5, [srcq+ssq*1] ; 5 = _ _
+ lea srcq, [srcq+ssq*2]
+%else
+ movu m4, [srcq+ssq*1] ; 4 = _ _
+ movu m5, [srcq+ssq*2] ; 5 = _ _
+ add srcq, ss3q
+%endif
+ movu m6, [srcq+ssq*0] ; 6 = _ _
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+ punpcklwd m4, m0, m1 ; 3 4 ~
+ punpcklwd m5, m1, m2 ; 4 5 ~
+ punpcklwd m6, m2, m3 ; 5 6 ~
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ ; m8 accu for V a
+ ; m9 accu for V b
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_512]
+ paddd m0, m5 ; pd_512
+ paddd m7, m5 ; pd_512
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd m8, m1, subpelv0 ; a0
+ pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ mova m7, [base+pd_512]
+ paddd m8, m7 ; pd_512
+ paddd m9, m7 ; pd_512
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+%endif
+ movu m0, [srcq+ssq*1] ; 7
+ movu m4, [srcq+ssq*2] ; 8
+ lea srcq, [srcq+ssq*2]
+ HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+ HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+ mova m5, [base+pw_8192]
+ pmulhrsw m0, m5 ; H pw_8192
+ pmulhrsw m4, m5 ; H pw_8192
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 6 7 ~
+ punpcklwd m6, m0, m4 ; 7 8 ~
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1 ; H + V
+ psrad m2, 10
+ psrad m1, 10
+ packssdw m2, m1 ; d -> w
+ packuswb m2, m1 ; w -> b
+ movd [dstq+dsq*0], m2
+ psrlq m2, 32
+%if ARCH_X86_32
+ add dstq, dsm
+ movd [dstq+dsq*0], m2
+ add dstq, dsm
+%else
+ movd [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 4
+ add r7, 4
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w8_loop0
+ RET
+
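+; SSE2 fallbacks for the SSSE3 instructions used by the 8-tap prep macros below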
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
+ %if cpuflag(ssse3)
+ pshufb %1, %2
+ %else
+ %if %5 == 1
+ pcmpeqd %2, %2
+ psrlq %2, 32
+ %endif
+ psrldq %3, %1, 1
+ pshufd %3, %3, q2301
+ pand %1, %2
+ pandn %4, %2, %3
+ por %1, %4
+ %endif
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %ifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %if notcpuflag(ssse3)
+ psrlq %1, %2, 16
+ %elifnidn %1, %2
+ mova %1, %2
+ %endif
+ PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6
+%endmacro
+
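+; SSE2 path: build the concatenation from psrldq of src2 and pslldq of src1;
+; if no tmp register is given, the register numbered one above dst is used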
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
+ %if cpuflag(ssse3)
+ palignr %1, %2, %3, %4
+ %else
+ %if %0 == 4
+ %assign %%i regnumof%+%1 + 1
+ %define %%tmp m %+ %%i
+ %else
+ %define %%tmp %5
+ %endif
+ psrldq %1, %3, %4
+ pslldq %%tmp, %2, 16-%4
+ por %1, %%tmp
+ %endif
+%endmacro
+
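+; SSE2 path: emulate the horizontal add with pmaddwd against pw_1 and repack;
+; %4 = 1 loads pw_1 (into %3, or directly from memory if dst == src)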
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+ phaddw %1, %2
+ %elifnidn %1, %2
+ %if %4 == 1
+ mova %3, [base+pw_1]
+ %endif
+ pmaddwd %1, %3
+ pmaddwd %2, %3
+ packssdw %1, %2
+ %else
+ %if %4 == 1
+ pmaddwd %1, [base+pw_1]
+ %else
+ pmaddwd %1, %3
+ %endif
+ packssdw %1, %1
+ %endif
+%endmacro
+
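+; the SSSE3 constant here is a power of two (pw_8192 -> >>2 with rounding), so
+; the SSE2 path simply adds the rounding bias in %3 and shifts right by %4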
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+ pmulhrsw %1, %2, %3
+ %else
+ paddw %1, %2, %3
+ psraw %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW_8192 3 ; dst, src1, src2
+ PMULHRSW_POW2 %1, %2, %3, 2
+%endmacro
+
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2]
+ movd %1, [%2+0]
+ movd %3, [%2+1]
+ movd %4, [%2+2]
+ movd %5, [%2+3]
+ punpckldq %1, %3
+ punpckldq %4, %5
+ punpcklqdq %1, %4
+%endmacro
+
+%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
+ %if cpuflag(ssse3)
+ movu m%1, [%2]
+ pshufb m2, m%1, m11 ; subpel_h_shufB
+ pshufb m3, m%1, m9 ; subpel_h_shufC
+ pshufb m%1, m10 ; subpel_h_shufA
+ %else
+ %if ARCH_X86_64
+ SWAP m12, m5
+ SWAP m13, m6
+ SWAP m14, m7
+ %define %%mx0 m%+%%i
+ %define %%mx1 m%+%%j
+ %assign %%i 0
+ %rep 12
+ movd %%mx0, [%2+%%i]
+ %assign %%i %%i+1
+ %endrep
+ %assign %%i 0
+ %rep 6
+ %assign %%j %%i+1
+ punpckldq %%mx0, %%mx1
+ %assign %%i %%i+2
+ %endrep
+ %assign %%i 0
+ %rep 3
+ %assign %%j %%i+2
+ punpcklqdq %%mx0, %%mx1
+ %assign %%i %%i+4
+ %endrep
+ SWAP m%1, m0
+ SWAP m2, m4
+ SWAP m3, m8
+ SWAP m5, m12
+ SWAP m6, m13
+ SWAP m7, m14
+ %else
+ PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7
+ SWAP m%1, m0
+ %endif
+ %endif
+%endmacro
+
+%macro PREP_8TAP_H 2 ; dst, src_memloc
+ PREP_8TAP_H_LOAD %1, %2
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m8, m1
+ SWAP m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+ mova m4, m2
+ PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0
+ PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4
+ PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4
+ PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+ SWAP m1, m8
+ SWAP m7, m9
+ %endif
+ paddw m3, m4
+ paddw m%1, m2
+ PHADDW m%1, m3, m15, ARCH_X86_32
+ %if ARCH_X86_64 || cpuflag(ssse3)
+ PMULHRSW_8192 m%1, m%1, m7
+ %else
+ PMULHRSW_8192 m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
+ %if cpuflag(ssse3)
+ movu %1, [%2]
+ pshufb m2, %1, shufB
+ pshufb m3, %1, shufC
+ pshufb %1, shufA
+ %else
+ PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4
+ PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4
+ %endif
+ mova m1, m2
+ PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 B0
+ PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 C4
+ PMADDUBSW m2, subpelh1, %3, %4, 0 ; B4
+ PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0
+ paddw m1, m3 ; B0+C4
+ paddw %1, m2 ; A0+B4
+ PHADDW %1, m1, %3, 1
+%endmacro
+
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
+
+FN prep_8tap, sharp, SHARP, SHARP
+FN prep_8tap, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap, smooth, SMOOTH, SMOOTH
+FN prep_8tap, sharp_regular, SHARP, REGULAR
+FN prep_8tap, regular_sharp, REGULAR, SHARP
+FN prep_8tap, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap, regular, REGULAR, REGULAR
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep%+SUFFIX
+%else
+ %define base_reg r7
+ %define base 0
+%endif
+cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ mov wd, wm
+ movifnidn srcd, srcm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ LEA base_reg, prep_ssse3
+ tzcnt wd, wd
+ movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+ pxor m4, m4
+ add wq, base_reg
+ movifnidn strided, stridem
+ lea r6, [strideq*3]
+ %assign stack_offset org_stack_offset
+%if WIN64
+ pop r8
+ pop r7
+%endif
+ jmp wq
+.h:
+ LEA base_reg, prep%+SUFFIX
+ test myd, 0xf00
+ jnz .hv
+%if cpuflag(ssse3)
+ WIN64_SPILL_XMM 12
+%else
+ WIN64_SPILL_XMM 16
+%endif
+%if ARCH_X86_32
+ %define strideq r6
+ mov strideq, stridem
+%endif
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+ %else
+ %define m10 [base+subpel_h_shufA]
+ %define m11 [base+subpel_h_shufB]
+ %define m9 [base+subpel_h_shufC]
+ %endif
+%endif
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+ movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
+%else
+ punpcklbw m6, m6
+ psraw m6, 8
+ %if ARCH_X86_64
+ mova m7, [pw_2]
+ mova m15, [pw_1]
+ %else
+ %define m15 m4
+ %endif
+ pshufd m5, m6, q1010
+ punpckhqdq m6, m6
+%endif
+ add wq, base_reg
+ jmp wq
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0x7f
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq
+ movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+%if cpuflag(ssse3)
+ mova m6, [base+pw_8192]
+ mova m5, [base+subpel_h_shufA]
+ pshufd m4, m4, q0000
+%else
+ mova m6, [base+pw_2]
+ %if ARCH_X86_64
+ mova m14, [pw_1]
+ %else
+ %define m14 m7
+ %endif
+ punpcklbw m4, m4
+ psraw m4, 8
+ punpcklqdq m4, m4
+%endif
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+%endif
+.h_w4_loop:
+%if cpuflag(ssse3)
+ movq m0, [srcq+strideq*0] ; 0
+ movq m1, [srcq+strideq*1] ; 1
+ %if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m2, [srcq+strideq*0] ; 2
+ movq m3, [srcq+strideq*1] ; 3
+ lea srcq, [srcq+strideq*2]
+ %else
+ movq m2, [srcq+strideq*2] ; 2
+ movq m3, [srcq+stride3q ] ; 3
+ lea srcq, [srcq+strideq*4]
+ %endif
+ pshufb m0, m5
+ pshufb m1, m5
+ pshufb m2, m5
+ pshufb m3, m5
+%elif ARCH_X86_64
+ movd m0, [srcq+strideq*0+0]
+ movd m12, [srcq+strideq*0+1]
+ movd m1, [srcq+strideq*1+0]
+ movd m5, [srcq+strideq*1+1]
+ movd m2, [srcq+strideq*2+0]
+ movd m13, [srcq+strideq*2+1]
+ movd m3, [srcq+stride3q +0]
+ movd m7, [srcq+stride3q +1]
+ punpckldq m0, m12
+ punpckldq m1, m5
+ punpckldq m2, m13
+ punpckldq m3, m7
+ movd m12, [srcq+strideq*0+2]
+ movd m8, [srcq+strideq*0+3]
+ movd m5, [srcq+strideq*1+2]
+ movd m9, [srcq+strideq*1+3]
+ movd m13, [srcq+strideq*2+2]
+ movd m10, [srcq+strideq*2+3]
+ movd m7, [srcq+stride3q +2]
+ movd m11, [srcq+stride3q +3]
+ lea srcq, [srcq+strideq*4]
+ punpckldq m12, m8
+ punpckldq m5, m9
+ punpckldq m13, m10
+ punpckldq m7, m11
+ punpcklqdq m0, m12 ; 0
+ punpcklqdq m1, m5 ; 1
+ punpcklqdq m2, m13 ; 2
+ punpcklqdq m3, m7 ; 3
+%else
+ movd m0, [srcq+strideq*0+0]
+ movd m1, [srcq+strideq*0+1]
+ movd m2, [srcq+strideq*0+2]
+ movd m3, [srcq+strideq*0+3]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklqdq m0, m2 ; 0
+ movd m1, [srcq+strideq*1+0]
+ movd m2, [srcq+strideq*1+1]
+ movd m3, [srcq+strideq*1+2]
+ movd m7, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m1, m2
+ punpckldq m3, m7
+ punpcklqdq m1, m3 ; 1
+ movd m2, [srcq+strideq*0+0]
+ movd m3, [srcq+strideq*0+1]
+ movd m7, [srcq+strideq*0+2]
+ movd m5, [srcq+strideq*0+3]
+ punpckldq m2, m3
+ punpckldq m7, m5
+ punpcklqdq m2, m7 ; 2
+ movd m3, [srcq+strideq*1+0]
+ movd m7, [srcq+strideq*1+1]
+ punpckldq m3, m7
+ movd m7, [srcq+strideq*1+2]
+ movd m5, [srcq+strideq*1+3]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m7, m5
+ punpcklqdq m3, m7 ; 3
+%endif
+ PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
+ PMADDUBSW m1, m4, m5, m7, 0
+ PMADDUBSW m2, m4, m5, m7, 0
+ PMADDUBSW m3, m4, m5, m7, 0
+ PHADDW m0, m1, m14, ARCH_X86_32
+ PHADDW m2, m3, m14, 0
+ PMULHRSW_8192 m0, m0, m6
+ PMULHRSW_8192 m2, m2, m6
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+strideq*0
+ PREP_8TAP_H 1, srcq+strideq*1
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 32
+ sub hd, 2
+%else
+ PREP_8TAP_H 0, srcq
+ mova [tmpq], m0
+ add srcq, strideq
+ add tmpq, 16
+ dec hd
+%endif
+ jg .h_w8
+ RET
+.h_w16:
+ mov r3, -16*1
+ jmp .h_start
+.h_w32:
+ mov r3, -16*2
+ jmp .h_start
+.h_w64:
+ mov r3, -16*4
+ jmp .h_start
+.h_w128:
+ mov r3, -16*8
+.h_start:
+ sub srcq, r3
+ mov r5, r3
+.h_loop:
+%if cpuflag(ssse3)
+ PREP_8TAP_H 0, srcq+r3+8*0
+ PREP_8TAP_H 1, srcq+r3+8*1
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+ add r3, 16
+%else
+ PREP_8TAP_H 0, srcq+r3
+ mova [tmpq], m0
+ add tmpq, 16
+ add r3, 8
+%endif
+ jl .h_loop
+ add srcq, strideq
+ mov r3, r5
+ dec hd
+ jg .h_loop
+ RET
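+; .v: vertical-only filtering. Source rows are interleaved in pairs with
+; punpcklbw so that each pmaddubsw applies two adjacent taps at once.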
+.v:
+ LEA base_reg, prep%+SUFFIX
+%if ARCH_X86_32
+ mov mxd, myd
+ and mxd, 0x7f
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+%endif
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
+ mova m2, [base+pw_512]
+ mova m7, [base+pw_8192]
+ punpcklwd m0, m0
+%else
+ punpcklbw m0, m0
+ psraw m0, 8
+%endif
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
+ ALLOC_STACK -mmsize*4
+ %else
+ ALLOC_STACK -mmsize*5
+ %endif
+%assign regs_used 7
+ mov strideq, [rstk+stack_offset+gprsize*3]
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ lea r5, [strideq*3]
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
+ sub srcq, r5
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
+ lea stride3q, [strideq*3]
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
+ sub srcq, stride3q
+ cmp wd, 8
+ jns .v_w8
+%endif
+.v_w4:
+%if notcpuflag(ssse3)
+ pxor m6, m6
+ %if ARCH_X86_64
+ mova m7, [base+pw_2]
+ %endif
+%endif
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < mmsize
+ %define srcm [esp+stack_size+gprsize*1]
+ %define tmpm [esp+stack_size+gprsize*2]
+ %endif
+ mov tmpm, tmpq
+ mov srcm, srcq
+ lea r5d, [wq - 4] ; horizontal loop
+ shl r5d, (16 - 2) ; (wq / 4) << 16
+ mov r5w, hw
+.v_w4_loop0:
+%endif
+ movd m1, [srcq+strideq*0]
+ movd m0, [srcq+strideq*1]
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movd m2, [srcq+strideq*0]
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movd m3, [srcq+strideq*0]
+ movd m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+%else
+ movd m2, [srcq+strideq*2]
+ add srcq, stride3q
+ movd m4, [srcq+strideq*0]
+ movd m3, [srcq+strideq*1]
+ movd m5, [srcq+strideq*2]
+ add srcq, stride3q
+%endif
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m4 ; 2 3
+ punpckldq m4, m3 ; 3 4
+ punpckldq m3, m5 ; 4 5
+ punpckldq m5, m0 ; 5 6
+ punpcklbw m2, m4 ; 23 34
+ punpcklbw m3, m5 ; 45 56
+.v_w4_loop:
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel0
+ %define subpel0 m7
+%endif
+ mova m5, m1
+ PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel1
+ %define subpel1 m7
+%endif
+ mova m1, m2
+ PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1
+ paddw m5, m2
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ mova m7, subpel2
+ %define subpel2 m7
+%endif
+ mova m2, m3
+ PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ paddw m5, m3
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+strideq*0]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m12, m0
+ %else
+ mova [esp+mmsize*4], m0
+ mova m7, subpel3
+ %define subpel3 m7
+ %endif
+%endif
+ mova m4, m3
+ PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3
+ paddw m5, m4
+%if ARCH_X86_64 || cpuflag(ssse3)
+ %if notcpuflag(ssse3)
+ SWAP m0, m12
+ %endif
+ PMULHRSW_8192 m5, m5, m7
+%else
+ mova m0, [esp+mmsize*4]
+ PMULHRSW_8192 m5, m5, [base+pw_2]
+%endif
+ movq [tmpq+wq*0], m5
+ movhps [tmpq+wq*2], m5
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov srcq, srcm
+ mov tmpq, tmpm
+ movzx hd, r5w
+ add srcq, 4
+ add tmpq, 8
+ mov srcm, srcq
+ mov tmpm, tmpq
+ sub r5d, 1<<16 ; horizontal--
+ jg .v_w4_loop0
+%endif
+ RET
+%if ARCH_X86_64
+.v_w8:
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r8, tmpq
+ lea r6d, [hq+r6*4]
+.v_w8_loop0:
+ movq m1, [srcq+strideq*0]
+ movq m2, [srcq+strideq*1]
+ movq m3, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m4, [srcq+strideq*0]
+ movq m5, [srcq+strideq*1]
+ movq m6, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m0, [srcq+strideq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
+.v_w8_loop:
+ movq m13, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
+ pmaddubsw m14, m1, subpel0 ; a0
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ pmaddubsw m4, subpel1 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, subpel2 ; a2
+ pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
+ paddw m15, m6
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+%else
+ mova m14, m1
+ PMADDUBSW m14, subpel0, m7, m12, 1 ; a0
+ mova m15, m2
+ PMADDUBSW m15, subpel0, m7, m12, 0 ; b0
+ mova m1, m3
+ PMADDUBSW m3, subpel1, m7, m12, 0 ; a1
+ mova m2, m4
+ PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
+ paddw m14, m3
+ mova m3, m5
+ PMADDUBSW m5, subpel2, m7, m12, 0 ; a2
+ paddw m15, m4
+ mova m4, m6
+ PMADDUBSW m6, subpel2, m7, m12, 0 ; b2
+ paddw m15, m6
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ PMADDUBSW m12, subpel3, m7, m6, 0 ; a3
+ paddw m14, m12
+ mova m6, m13
+ PMADDUBSW m13, subpel3, m7, m12, 0 ; b3
+ paddw m15, m13
+ PMULHRSW_8192 m14, m14, [base+pw_2]
+ PMULHRSW_8192 m15, m15, [base+pw_2]
+%endif
+ movu [tmpq+wq*0], m14
+ movu [tmpq+wq*2], m15
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w8_loop
+ add r5, 8
+ add r8, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r8
+ sub r6d, 1<<8
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
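+; .hv: 2-D filtering. Each row is filtered horizontally into 16-bit
+; intermediates, which are then filtered vertically with pmaddwd, a pd_32
+; rounding bias and a final shift by 6.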
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8
+ and mxd, 0x7f
+ movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
+%if ARCH_X86_32
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ lea r5, [strideq*3+1]
+ sub srcq, r5
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd m6, m0, q0000
+ mova subpelv0, m6
+ pshufd m6, m0, q1111
+ mova subpelv1, m6
+ pshufd m6, m0, q2222
+ mova subpelv2, m6
+ pshufd m6, m0, q3333
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
+ ALLOC_STACK mmsize*14, 14
+ %else
+ ALLOC_STACK mmsize*14, 16
+ %endif
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ dec srcq
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8
+ %if cpuflag(ssse3)
+ mova m8, [base+pw_8192]
+ %else
+ mova m8, [base+pw_2]
+ %endif
+ mova m9, [base+pd_32]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+%endif
+ pshufd m7, m1, q0000
+%if notcpuflag(ssse3)
+ punpcklbw m7, m7
+ psraw m7, 8
+%endif
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+%if ARCH_X86_32
+ %if cpuflag(ssse3)
+ %define w8192reg [base+pw_8192]
+ %else
+ %define w8192reg [base+pw_2]
+ %endif
+ %define d32reg [base+pd_32]
+%else
+ %define w8192reg m8
+ %define d32reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%else
+ %if ARCH_X86_64
+ mova m15, [pw_1]
+ %else
+ %define m15 m1
+ %endif
+%endif
+ movq m5, [srcq+strideq*0] ; 0 _ _ _
+ movhps m5, [srcq+strideq*1] ; 0 _ 1 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 2 _ _ _
+ movhps m4, [srcq+strideq*1] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*2]
+%else
+ movq m4, [srcq+strideq*2] ; 2 _ _ _
+ movhps m4, [srcq+stride3q ] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*4]
+%endif
+ PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~
+ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters
+ PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+ PMULHRSW_8192 m2, m2, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m2
+ %else
+ mova [esp+mmsize*4], m2
+ %endif
+%endif
+ ; lower shuffle
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%endif
+ movq m5, [srcq+strideq*0] ; 4 _ _ _
+ movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 6 _ _ _
+ add srcq, strideq
+%else
+ movq m4, [srcq+strideq*2] ; 6 _ _ _
+ add srcq, stride3q
+%endif
+ PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
+ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
+ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
+ PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters
+ PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+ PMULHRSW_8192 m3, m3, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m2, m14
+ %else
+ mova m2, [esp+mmsize*4]
+ %endif
+%endif
+ ; process high
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ; process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ PALIGNR m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ; process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+mmsize*4], m5
+ %define m15 m3
+ %endif
+%endif
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4]
+%endif
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+mmsize*4]
+ %endif
+%endif
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m5, 6
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ; process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m14, m5
+ %else
+ mova [esp+0xA0], m5
+ %endif
+%endif
+%if cpuflag(ssse3)
+ mova m6, [base+subpel_h_shuf4+16]
+%endif
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~
+ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters
+ PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878
+ PMULHRSW_8192 m4, m4, w8192reg
+ PALIGNR m3, m4, m0, 12, m5 ; 6787
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+ SWAP m5, m14
+ %else
+ mova m5, [esp+0xA0]
+ %endif
+%endif
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m4, m5, 6
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4
+ pshufd m5, m5, q3120
+ movu [tmpq], m5
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 16
+ sub hd, 2
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+ shr mxd, 16
+%if ARCH_X86_32
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+ mov mxd, myd
+ shr myd, 16
+ and mxd, 0x7f
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ %if STACK_ALIGNMENT < mmsize
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
+ mov tmpm, tmpq
+ mov stridem, strideq
+ %endif
+ %if cpuflag(ssse3)
+ pshufd m0, m1, q0000
+ pshufd m1, m1, q1111
+ %else
+ punpcklbw m1, m1
+ psraw m1, 8
+ pshufd m0, m1, q1010
+ punpckhqdq m1, m1
+ %endif
+ punpcklbw m5, m5
+ psraw m5, 8
+ pshufd m2, m5, q0000
+ pshufd m3, m5, q1111
+ pshufd m4, m5, q2222
+ pshufd m5, m5, q3333
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r5, [strideq*3+3]
+ sub srcq, r5
+ mov srcm, srcq
+%else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
+ pshufd subpelh0, m0, q0000
+ pshufd subpelh1, m0, q1111
+ %else
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd subpelh0, m0, q1010
+ pshufd subpelh1, m0, q3232
+ mova m7, [base+pw_2]
+ %endif
+ punpcklbw m1, m1
+ psraw m1, 8
+ pshufd subpelv0, m1, q0000
+ pshufd subpelv1, m1, q1111
+ pshufd subpelv2, m1, q2222
+ pshufd subpelv3, m1, q3333
+ lea stride3q, [strideq*3]
+ sub srcq, 3
+ sub srcq, stride3q
+ mov r6, srcq
+ mov r8, tmpq
+%endif
+ lea r5d, [wq-4]
+ shl r5d, 14
+ add r5d, hd
+.hv_w8_loop0:
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+ %define shufA m7
+ %define shufB m8
+ %define shufC m9
+ %else
+ %define shufA [base+subpel_h_shufA]
+ %define shufB [base+subpel_h_shufB]
+ %define shufC [base+subpel_h_shufC]
+ %endif
+%endif
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+%if ARCH_X86_64
+ PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
+ add srcq, stride3q
+ PREP_8TAP_HV m0, srcq+strideq*0, m7, m9
+%else
+ lea srcq, [srcq+strideq*2]
+ %if notcpuflag(ssse3)
+ mova [esp], m4
+ %endif
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m4
+ PREP_8TAP_HV m0, srcq+strideq*1, m7, m4
+ lea srcq, [srcq+strideq*2]
+%endif
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+%else
+ mova m7, [base+pw_2]
+ %if ARCH_X86_32
+ mova m4, [esp]
+ %endif
+%endif
+ PMULHRSW_8192 m4, m4, m7
+ PMULHRSW_8192 m5, m5, m7
+ PMULHRSW_8192 m6, m6, m7
+ PMULHRSW_8192 m0, m0, m7
+ punpcklwd m1, m4, m5 ; 01
+ punpcklwd m2, m5, m6 ; 12
+ punpcklwd m3, m6, m0 ; 23
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+%if cpuflag(ssse3)
+ mova m7, [base+subpel_h_shufA]
+%endif
+%if ARCH_X86_64
+ PREP_8TAP_HV m4, srcq+strideq*1, m8, m9
+ PREP_8TAP_HV m5, srcq+strideq*2, m8, m9
+ add srcq, stride3q
+ PREP_8TAP_HV m6, srcq+strideq*0, m8, m9
+%else
+ %if notcpuflag(ssse3)
+ mova [esp+0x30], m0
+ %endif
+ PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
+ PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m0
+%endif
+%if cpuflag(ssse3)
+ mova m7, [base+pw_8192]
+%elif ARCH_X86_32
+ mova m0, [esp+0x30]
+ mova m7, [base+pw_2]
+%endif
+ PMULHRSW_8192 m1, m4, m7
+ PMULHRSW_8192 m2, m5, m7
+ PMULHRSW_8192 m3, m6, m7
+ punpcklwd m4, m0, m1 ; 34
+ punpcklwd m5, m1, m2 ; 45
+ punpcklwd m6, m2, m3 ; 56
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_32]
+ paddd m0, m5
+ paddd m7, m5
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd accuv0, m1, subpelv0 ; a0
+ pmaddwd accuv1, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd accuv0, m3
+ paddd accuv1, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd accuv0, m5
+ paddd accuv1, m6
+ mova m7, [base+pd_32]
+ paddd accuv0, m7
+ paddd accuv1, m7
+ %if cpuflag(ssse3)
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+ %define shufA m5
+ %define shufB m7
+ %define shufC m6
+ %endif
+%endif
+ PREP_8TAP_HV m0, srcq+strideq*1, m5, m6
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m4, srcq+strideq*0, m5, m6
+%if cpuflag(ssse3)
+ mova m5, [base+pw_8192]
+%else
+ mova m5, [base+pw_2]
+%endif
+ PMULHRSW_8192 m0, m0, m5
+ PMULHRSW_8192 m4, m4, m5
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 67
+ punpcklwd m6, m0, m4 ; 78
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1
+ psrad m2, 6
+ psrad m1, 6
+ packssdw m2, m1
+ movq [tmpq+wq*0], m2
+ movhps [tmpq+wq*2], m2
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+%if ARCH_X86_32
+ mov srcq, srcm
+ mov tmpq, tmpm
+ movzx hd, r5w
+ add srcq, 4
+ add tmpq, 8
+ mov srcm, srcq
+ mov tmpm, tmpq
+%else
+ add r6, 4
+ add r8, 8
+ movzx hd, r5b
+ mov srcq, r6
+ mov tmpq, r8
+%endif
+ sub r5d, 1<<16
+ jg .hv_w8_loop0
+ RET
+%endmacro
+
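+; Helpers for the scaled MC below. movifprep only emits the mov when building
+; the prep variant; SAVE/LOAD/REMAP_REG shift the register aliases down by one
+; so the prep functions can share the put register layout.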
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro SAVE_REG 1
+ %xdefine r%1_save r%1
+ %xdefine r%1q_save r%1q
+ %xdefine r%1d_save r%1d
+ %if ARCH_X86_32
+ %define r%1m_save [rstk+stack_offset+(%1+1)*4]
+ %endif
+%endmacro
+
+%macro LOAD_REG 1
+ %xdefine r%1 r%1_save
+ %xdefine r%1q r%1q_save
+ %xdefine r%1d r%1d_save
+ %if ARCH_X86_32
+ %define r%1m r%1m_save
+ %endif
+ %undef r%1d_save
+ %undef r%1q_save
+ %undef r%1_save
+%endmacro
+
+%macro REMAP_REG 2-3
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+ %if ARCH_X86_32
+ %if %3 == 0
+ %xdefine r%1m r%2m
+ %else
+ %define r%1m [rstk+stack_offset+(%1+1)*4]
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %if ARCH_X86_64
+ SAVE_REG 14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %else
+ SAVE_REG 5
+ %assign %%i 5
+ %rep 5
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j, 0
+ %assign %%i %%i-1
+ %endrep
+ %endif
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %if ARCH_X86_64
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 14
+ %else
+ %rep 4
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j, 1
+ %assign %%i %%i+1
+ %endrep
+ LOAD_REG 5
+ %endif
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
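+; MC_8TAP_SCALED_H: horizontally filter two rows of 8 output pixels, each
+; with its own source offset (r4..rX, or the stack slots on x86-32) and its
+; own 8-tap filter; the pmaddubsw/phaddw sums are rounded with pmulhrsw.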
+%if ARCH_X86_64
+ %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
+ SWAP m%2, m%5
+ movq m%1, [srcq+ r4]
+ movq m%2, [srcq+ r6]
+ movhps m%1, [srcq+ r7]
+ movhps m%2, [srcq+ r9]
+ movq m%3, [srcq+r10]
+ movq m%4, [srcq+r11]
+ movhps m%3, [srcq+r13]
+ movhps m%4, [srcq+ rX]
+ add srcq, ssq
+ movq m%5, [srcq+ r4]
+ movq m%6, [srcq+ r6]
+ movhps m%5, [srcq+ r7]
+ movhps m%6, [srcq+ r9]
+ movq m%7, [srcq+r10]
+ movq m%8, [srcq+r11]
+ movhps m%7, [srcq+r13]
+ movhps m%8, [srcq+ rX]
+ add srcq, ssq
+ pmaddubsw m%1, m%9
+ pmaddubsw m%5, m%9
+ pmaddubsw m%2, m%10
+ pmaddubsw m%6, m%10
+ pmaddubsw m%3, m%11
+ pmaddubsw m%7, m%11
+ pmaddubsw m%4, m%12
+ pmaddubsw m%8, m%12
+ phaddw m%1, m%2
+ phaddw m%5, m%6
+ phaddw m%3, m%4
+ phaddw m%7, m%8
+ phaddw m%1, m%3
+ phaddw m%5, m%7
+ pmulhrsw m%1, m12
+ pmulhrsw m%5, m12
+ SWAP m%2, m%5
+ %endmacro
+%else
+ %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
+ %if %3 == 1
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ %endif
+ movq m0, [srcq+r0]
+ movq m1, [srcq+rX]
+ movhps m0, [srcq+r4]
+ movhps m1, [srcq+r5]
+ add srcq, ssq
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ sub srcq, ssq
+ movq m2, [srcq+r0]
+ movq m3, [srcq+rX]
+ movhps m2, [srcq+r4]
+ movhps m3, [srcq+r5]
+ add srcq, ssq
+ movq m6, [srcq+r0]
+ movq m7, [srcq+rX]
+ movhps m6, [srcq+r4]
+ movhps m7, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m0, [esp+%1+ 0]
+ pmaddubsw m4, [esp+%1+ 0]
+ pmaddubsw m1, [esp+%1+16]
+ pmaddubsw m5, [esp+%1+16]
+ pmaddubsw m2, [esp+%1+32]
+ pmaddubsw m6, [esp+%1+32]
+ pmaddubsw m3, [esp+%1+48]
+ pmaddubsw m7, [esp+%1+48]
+ phaddw m0, m1
+ phaddw m4, m5
+ phaddw m2, m3
+ phaddw m6, m7
+ phaddw m0, m2
+ phaddw m4, m6
+ pmulhrsw m0, m12
+ pmulhrsw m4, m12
+ %if %2 != 0
+ mova [esp+%2+ 0], m0
+ mova [esp+%2+16], m4
+ %endif
+ %endmacro
+%endif
+
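+; MC_8TAP_SCALED: put/prep with resampling. Source positions are tracked in
+; 1/1024-pel units (mx/my start, dx/dy per-pixel step); dy of 1024 or 2048
+; (i.e. a whole-pixel vertical step) takes the simpler .dy1/.dy2 paths.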
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else ; prep
+ %assign isprep 1
+ %if ARCH_X86_64
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+0x138]
+ %endif
+ %xdefine base_reg r11
+ %else ; ARCH_X86_32
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %define tmp_stridem dword [esp+0x138]
+ %endif
+ %define rndshift 6
+%endif
+%if ARCH_X86_32
+ mov [esp+0x1f0], t0d
+ mov [esp+0x1f4], t1d
+ %if !isprep && required_stack_alignment > STACK_ALIGNMENT
+ mov dstd, dstm
+ mov dsd, dsm
+ mov srcd, srcm
+ mov ssd, ssm
+ mov hd, hm
+ mov r4, mxm
+ %define r0m [esp+0x200]
+ %define dsm [esp+0x204]
+ %define dsmp dsm
+ %define r1m dsm
+ %define r2m [esp+0x208]
+ %define ssm [esp+0x20c]
+ %define r3m ssm
+ %define hm [esp+0x210]
+ %define mxm [esp+0x214]
+ mov r0m, dstd
+ mov dsm, dsd
+ mov r2m, srcd
+ mov ssm, ssd
+ mov hm, hd
+ mov r0, mym
+ mov r1, dxm
+ mov r2, dym
+ %define mym [esp+0x218]
+ %define dxm [esp+0x09c]
+ %define dym [esp+0x21c]
+ mov mxm, r4
+ mov mym, r0
+ mov dxm, r1
+ mov dym, r2
+ tzcnt wd, wm
+ %endif
+ %if isprep && required_stack_alignment > STACK_ALIGNMENT
+ %xdefine base_reg r5
+ %else
+ %xdefine base_reg r6
+ %endif
+ mov ssd, ssm
+%endif
+ LEA base_reg, %1_8tap_scaled_8bpc_ssse3
+%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
+%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
+ tzcnt wd, wm
+%endif
+%if ARCH_X86_32
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+%endif
+ movd m8, dxm
+ movd m14, mxm
+ pshufd m8, m8, q0000
+ pshufd m14, m14, q0000
+%if isprep && UNIX64
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%endif
+%if ARCH_X86_64
+ mov dyd, dym
+%endif
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %elif ARCH_X86_64
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %if ARCH_X86_64
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+0x138]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+ %else
+ %define rX r1
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %elif ARCH_X86_64
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+0x94]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if ARCH_X86_64
+ %define rX r14
+ %define rXd r14d
+ %else
+ %define rX r3
+ %endif
+%endif
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+ mova m12, [base+pw_8192]
+ %ifidn %1, put
+ mova m13, [base+pd_512]
+ %else
+ mova m13, [base+pd_32]
+ %endif
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m12 [base+pw_8192]
+ %ifidn %1, put
+ %define m13 [base+pd_512]
+ %else
+ %define m13 [base+pd_32]
+ %endif
+%endif
+ pxor m9, m9
+%if ARCH_X86_64
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+%else
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ mov r1, [esp+0x1f4]
+ lea r0, [ssq*3]
+ movzx r2, r1b
+ shr r1, 16
+ cmp dword hm, 6
+ cmovs r1, r2
+ mov [esp+0x1f4], r1
+ mov r1, r1m
+ mov r2, r2m
+ sub srcq, r0
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define ss3q r0
+ %define myd r4
+ %define dyd dword dym
+ %define hd dword hm
+%endif
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %else
+ %define m11 [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [rsp+0x180], m14
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ pshufb m14, m5
+ paddb m14, m6
+ %endif
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ movhps m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %else
+ pand m7, m8, m11
+ pandn m8, m15
+ %define m8 m6
+ %define m15 m5
+ por m15, m7
+ mova [rsp+0x190], m15
+ %endif
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 1 2 3
+ pmulhrsw m1, m12 ; 4 5 6 7
+ palignr m2, m1, m0, 4 ; 1 2 3 4
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ pshufd m5, m1, q0321 ; 5 6 7 _
+ punpcklwd m2, m1, m5 ; 45 56
+ punpckhwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mov myd, mym
+ mov r0, r0m
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+.w2_loop:
+ and myd, 0x3ff
+ %if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ pmaddwd m8, m4, m11
+ paddd m5, m6
+ paddd m7, m8
+ %else
+ mov mym, myd
+ mov r1, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r1, [r1+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r1*8+0]
+ cmovnz r3, [base+subpel_filters+r1*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m5, m7, q0000
+ pshufd m6, m7, q1111
+ pmaddwd m3, m5
+ pmaddwd m0, m6
+ pshufd m5, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m2, m5
+ pmaddwd m4, m7
+ paddd m3, m0
+ paddd m2, m4
+ SWAP m5, m3
+ SWAP m7, m2
+ %endif
+ paddd m5, m13
+ paddd m5, m7
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ %if ARCH_X86_64
+ pextrw r6d, m5, 0
+ mov [dstq], r6w
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ %else
+ pextrw r3d, m5, 0
+ mov [dstq], r3w
+ add dstq, dsm
+ dec hd
+ jz .ret
+ mov myd, mym
+ add myd, dym
+ %endif
+ test myd, ~0x3ff
+ %if ARCH_X86_32
+ SWAP m3, m5
+ SWAP m2, m7
+ mova m3, [rsp+0x1a0]
+ mova m0, [rsp+0x1b0]
+ mova m2, [rsp+0x1c0]
+ mova m4, [rsp+0x1d0]
+ %define m14 [esp+0x180]
+ %define m15 [esp+0x190]
+ %endif
+ jz .w2_loop
+ %if ARCH_X86_32
+ mov r3, r3m
+ %endif
+ movq m5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps m3, m0, q1032 ; 01 12
+ shufps m0, m2, q1032 ; 23 34
+ shufps m2, m4, q1032 ; 45 56
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12
+ palignr m4, m5, m1, 12
+ punpcklqdq m1, m4, m4 ; 6 7 6 7
+ punpcklwd m4, m1, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+ jmp .w2_loop
+.w2_skip_line:
+ movhps m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m3, m0 ; 01 12
+ mova m0, m2 ; 23 34
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ phaddw m5, m5
+ pmulhrsw m5, m12 ; 6 7 6 7
+ palignr m4, m5, m1, 8 ; 4 5 6 7
+ pshufd m5, m4, q0321 ; 5 6 7 _
+ mova m1, m4
+ punpcklwd m2, m4, m5 ; 45 56
+ punpckhwd m4, m5 ; 67 __
+ %if ARCH_X86_32
+ mova [rsp+0x1a0], m3
+ mova [rsp+0x1b0], m0
+ mova [rsp+0x1c0], m2
+ mova [rsp+0x1d0], m4
+ %endif
+ jmp .w2_loop
+%endif
+INIT_XMM ssse3
+.w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%else
+ %define m11 [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd m15, m0
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+%else
+ movd r0, m15
+ movd rX, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ movd m1, [base+subpel_filters+r0*8+2]
+ movd m2, [base+subpel_filters+rX*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m7, [base+subpel_filters+r5*8+2]
+ movifprep r3, r3m
+ SWAP m4, m7
+ %define m15 m1
+%endif
+ mova m5, [base+bdct_lb_dw]
+ movq m6, [base+subpel_s_shuf2]
+ psrld m14, 10
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+ punpcklqdq m6, m6
+ pshufb m14, m5
+ paddb m14, m6
+%if ARCH_X86_64
+ pcmpeqd m0, m9
+ pand m11, m0
+%else
+ mova [esp+0x180], m14
+ SWAP m7, m4
+ pxor m3, m3
+ pcmpeqd m0, m3
+ pand m2, m11, m0
+ %define m11 m2
+%endif
+ pandn m0, m15
+%if ARCH_X86_64
+ SWAP m15, m0
+%else
+ %define m15 m0
+%endif
+ por m15, m11
+%if ARCH_X86_64
+ movu m7, [srcq+ssq*0]
+ movu m9, [srcq+ssq*1]
+ movu m8, [srcq+ssq*2]
+ movu m10, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ movu m2, [srcq+ssq*0]
+ movu m4, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pshufb m2, m14
+ pshufb m4, m14
+ pshufb m3, m14
+ pshufb m5, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ pmaddubsw m2, m15
+ pmaddubsw m4, m15
+ pmaddubsw m3, m15
+ pmaddubsw m5, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ phaddw m9, m2, m4
+ phaddw m3, m5
+ pmulhrsw m7, m12 ; 0 1
+ pmulhrsw m8, m12 ; 2 3
+ pmulhrsw m9, m12 ; 4 5
+ pmulhrsw m3, m12 ; 6 7
+ shufps m4, m7, m8, q1032 ; 1 2
+ shufps m5, m8, m9, q1032 ; 3 4
+ shufps m6, m9, m3, q1032 ; 5 6
+ psrldq m11, m3, 8 ; 7 _
+ punpcklwd m0, m7, m4 ; 01
+ punpckhwd m7, m4 ; 12
+ punpcklwd m1, m8, m5 ; 23
+ punpckhwd m8, m5 ; 34
+ punpcklwd m2, m9, m6 ; 45
+ punpckhwd m9, m6 ; 56
+ punpcklwd m3, m11 ; 67
+ mova [rsp+0x00], m7
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+%else
+ mova [esp+0x190], m15
+ lea ss3q, [ssq*3]
+ movu m2, [srcq+ssq*0]
+ movu m3, [srcq+ssq*1]
+ movu m7, [srcq+ssq*2]
+ movu m6, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ phaddw m2, m3
+ phaddw m7, m6
+ movu m1, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m6, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m1, m14
+ pshufb m5, m14
+ pshufb m3, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m5, m15
+ pmaddubsw m3, m15
+ pmaddubsw m6, m15
+ phaddw m1, m5
+ phaddw m3, m6
+ pmulhrsw m2, m12
+ pmulhrsw m7, m12
+ pmulhrsw m1, m12
+ pmulhrsw m3, m12
+ shufps m4, m2, m7, q1032 ; 1 2
+ shufps m5, m7, m1, q1032 ; 3 4
+ shufps m6, m1, m3, q1032 ; 5 6
+ psrldq m0, m3, 8 ; 7 _
+ mova [esp+0x1a0], m0
+ %define m11 [esp+0x1a0]
+ punpcklwd m0, m2, m4 ; 01
+ punpckhwd m2, m4 ; 12
+ punpcklwd m4, m7, m5 ; 23
+ punpckhwd m7, m5 ; 34
+ punpcklwd m5, m1, m6 ; 45
+ punpckhwd m1, m6 ; 56
+ punpcklwd m3, [esp+0x1a0] ; 67
+ mov myd, mym
+ mov r0, r0m
+ mova [esp+0x1b0], m0 ; 01
+ mova [esp+0x1c0], m4 ; 23
+ mova [esp+0x1d0], m5 ; 45
+ mova [esp+0x1e0], m3 ; 67
+ mova [rsp+0x00], m2 ; 12
+ mova [rsp+0x10], m7 ; 34
+ mova [rsp+0x20], m1 ; 56
+ SWAP m1, m4
+ SWAP m2, m5
+%endif
+.w4_loop:
+ and myd, 0x3ff
+%if ARCH_X86_64
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m10, r6q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m10
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+%else
+ mov mym, myd
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmaddwd m2, m6
+ pmaddwd m3, m7
+ paddd m0, m1
+ paddd m2, m3
+ paddd m0, m13
+ paddd m0, m2
+ SWAP m4, m0
+%endif
+ psrad m4, rndshift
+ packssdw m4, m4
+%ifidn %1, put
+ packuswb m4, m4
+ movd [dstq], m4
+ add dstq, dsmp
+%else
+ movq [tmpq], m4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+%else
+ SWAP m0, m4
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ jnz .w4_next_line
+ mova m0, [esp+0x1b0]
+ mova m1, [esp+0x1c0]
+ mova m2, [esp+0x1d0]
+ mova m3, [esp+0x1e0]
+ jmp .w4_loop
+.w4_next_line:
+ %define m14 [esp+0x180]
+ %define m15 [esp+0x190]
+%endif
+ movu m4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+%if ARCH_X86_64
+ mova m0, [rsp+0x00]
+ mova [rsp+0x00], m1
+ mova m1, [rsp+0x10]
+ mova [rsp+0x10], m2
+ mova m2, [rsp+0x20]
+ mova [rsp+0x20], m3
+%else
+ mova m5, [esp+0x1c0]
+ mova m0, [rsp+0x000]
+ mova [rsp+0x00], m5
+ mova [esp+0x1b0], m0
+ mova m6, [esp+0x1d0]
+ mova m1, [rsp+0x010]
+ mova [rsp+0x10], m6
+ mova [esp+0x1c0], m1
+ mova m7, [esp+0x1e0]
+ mova m2, [rsp+0x020]
+ mova [rsp+0x20], m7
+ mova [esp+0x1d0], m2
+%endif
+ pshufb m4, m14
+ pmaddubsw m4, m15
+ phaddw m4, m4
+ pmulhrsw m4, m12
+ punpcklwd m3, m11, m4
+%if ARCH_X86_32
+ mova [esp+0x1e0], m3
+%endif
+ mova m11, m4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+%if ARCH_X86_32
+ mova m0, [esp+0x1c0]
+ mova m1, [esp+0x1d0]
+ mova m2, [esp+0x1e0]
+%endif
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova m6, [rsp+0x10]
+ mova m7, [rsp+0x20]
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m4, m5
+ pmulhrsw m4, m12
+ punpcklwd m5, m11, m4
+ mova [rsp+0x00], m6
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m5
+%if ARCH_X86_64
+ psrldq m11, m4, 8
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ punpcklwd m3, m4, m11
+%else
+ psrldq m6, m4, 8
+ punpcklwd m3, m4, m6
+ mova [esp+0x1a0], m6
+ mova [esp+0x1b0], m0
+ mova [esp+0x1c0], m1
+ mova [esp+0x1d0], m2
+ mova [esp+0x1e0], m3
+%endif
+ jmp .w4_loop
+INIT_XMM ssse3
+.w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ movd m15, t0d
+%else
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r4, [esp+0x1f0]
+ shr r4, 16
+ movd m15, r4
+ mov r0, r0m
+ mov myd, mym
+%endif
+ sub srcq, 3
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ mov r5, hm
+ mov [esp+0x094], myd
+ mov [esp+0x134], r5
+%endif
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [esp+0x130], 8*(isprep+1)
+ mov myd, [esp+0x094]
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ pxor m9, m9
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov mym, myd
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.hloop:
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m14, m5, q1100
+ pshufd m5, m5, q3322
+ pand m7, m11, m4
+ pand m8, m11, m6
+ pand m15, m11, m14
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m14, m2
+ pandn m5, m3
+ por m7, m4
+ por m8, m6
+ por m15, m14
+ por m11, m5
+ mova [rsp+0x10], m7
+ mova [rsp+0x20], m8
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m9, [rsp+0x80]
+ mov myd, mym
+ mov dyd, dym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m9 ; 23a
+ punpckhwd m3, m9 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ SWAP m14, m8
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq m11, r6q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufd m5, m11, q0000
+ pshufd m7, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m4, m5, m0
+ pmaddwd m5, m5, m1
+ pmaddwd m6, m7, m2
+ pmaddwd m7, m7, m3
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+ pmaddwd m8, [rsp+0x70], m11
+ pmaddwd m9, [rsp+0x80], m11
+ paddd m4, m6
+ paddd m5, m7
+ paddd m4, m8
+ paddd m5, m9
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7
+ mova m5, [esp+0x180]
+ mova m6, [esp+0x190]
+ mova m7, [esp+0x1a0]
+ mova m0, [esp+0x1b0]
+ mov myd, mym
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x180], m4
+ mova [esp+0x190], m5
+ mova [esp+0x1a0], m6
+ mova [esp+0x1b0], m7
+ mova m1, [esp+0x140]
+ mova m2, [esp+0x150]
+ mova m3, [esp+0x160]
+ mova m4, [esp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+.vloop:
+ mov r0, r0m
+ mov r5, [esp+0x1f4]
+ and myd, 0x3ff
+ mov mym, myd
+ xor r3, r3
+ shr r4, 6
+ lea r5, [r5+r4]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ movd m7, r4
+ movd m6, r3
+ punpckldq m7, m6
+ punpcklbw m7, m7
+ psraw m7, 8
+ pshufd m4, m7, q0000
+ pshufd m5, m7, q1111
+ pmaddwd m0, m4
+ pmaddwd m1, m4
+ pmaddwd m2, m5
+ pmaddwd m3, m5
+ pshufd m6, m7, q2222
+ pshufd m7, m7, q3333
+ paddd m0, m2
+ paddd m1, m3
+ pmaddwd m2, [esp+0x180], m6
+ pmaddwd m3, [esp+0x190], m6
+ pmaddwd m4, [esp+0x1a0], m7
+ pmaddwd m5, [esp+0x1b0], m7
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m13
+ paddd m1, m13
+ paddd m4, m0
+ paddd m5, m1
+%endif
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+%if ARCH_X86_64
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+0x140], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ mova m14, [base+unpckw]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ add srcq, ssq
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ pshufd m9, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m9 ; 3a 2a
+ pshufb m3, m9 ; 3b 2b
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ phaddw m6, m7
+ phaddw m4, m5
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x50], m14 ; 4a 5a
+ pshufb m6, [rsp+0x60], m14 ; 4b 5b
+ pshufb m7, [rsp+0x70], m9 ; 7a 6a
+ pshufb m8, [rsp+0x80], m9 ; 7b 6b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m5 ; 34a
+ punpcklwd m3, m6 ; 34b
+ punpckhwd m5, m7 ; 56a
+ punpckhwd m6, m8 ; 56b
+ punpcklwd m7, m4 ; 78a
+ punpckhqdq m4, m4
+ punpcklwd m8, m4 ; 78b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m8
+ jmp .vloop
+.skip_line:
+ mova m0, [rsp+0x10]
+ mova m1, [rsp+0x20]
+ mova m14, [rsp+0x30]
+ mova m15, [rsp+0x40]
+ MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
+ mov myd, [rsp+0x140]
+ mov dyd, dym
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ mova m2, [rsp+0x50] ; 23a
+ mova m3, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m6, [rsp+0x80] ; 45b
+ punpcklwd m7, m4, m8 ; 67a
+ punpckhwd m4, m8 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m6
+ mova [rsp+0x70], m7
+ mova [rsp+0x80], m4
+%else
+ mov r0m, r0
+ mov myd, mym
+ mov r3, r3m
+ add myd, dym
+ test myd, ~0x3ff
+ mov mym, myd
+ jnz .next_line
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ mova m2, [esp+0x160]
+ mova m3, [esp+0x170]
+ jmp .vloop
+.next_line:
+ test myd, 0x400
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ jz .skip_line
+ mova m6, [base+unpckw]
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ mova m7, [esp+0x180]
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ pshufb m0, m6 ; 0a 1a
+ pshufb m1, m6 ; 0b 1b
+ pshufb m7, m6 ; 4a 5a
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ movq m3, [srcq+r0]
+ movq m2, [srcq+rX]
+ movhps m3, [srcq+r4]
+ movhps m2, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m4, [esp+0x20]
+ pmaddubsw m5, [esp+0x30]
+ pmaddubsw m3, [esp+0x40]
+ pmaddubsw m2, [esp+0x50]
+ phaddw m4, m5
+ phaddw m3, m2
+ mova m5, [esp+0x190]
+ mova m2, [esp+0x160]
+ phaddw m4, m3
+ mova m3, [esp+0x170]
+ pmulhrsw m4, m12 ; 8a 8b
+ mov myd, mym
+ pshufb m5, m6 ; 4b 5b
+ pshufd m6, m6, q1032
+ pshufb m2, m6 ; 3a 2a
+ pshufb m3, m6 ; 3b 2b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova m0, [esp+0x1a0]
+ mova m1, [esp+0x1b0]
+ punpcklwd m2, m7 ; 34a
+ punpcklwd m3, m5 ; 34b
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+ pshufb m0, m6 ; 7a 6a
+ pshufb m1, m6 ; 7b 6b
+ punpckhwd m7, m0 ; 56a
+ punpckhwd m5, m1 ; 56b
+ punpcklwd m0, m4
+ punpckhqdq m4, m4
+ punpcklwd m1, m4
+ mova [esp+0x180], m7
+ mova [esp+0x190], m5
+ mova [esp+0x1a0], m0
+ mova [esp+0x1b0], m1
+ mova m0, [esp+0x140]
+ mova m1, [esp+0x150]
+ jmp .vloop
+.skip_line:
+ MC_8TAP_SCALED_H 0x20, 0x1c0, 0
+ mov myd, mym
+ mova m0, [esp+0x160]
+ mova m1, [esp+0x170]
+ mova m2, [esp+0x180]
+ mova m3, [esp+0x190]
+ mova [esp+0x140], m0
+ mova [esp+0x150], m1
+ mova m4, [esp+0x1a0]
+ mova m5, [esp+0x1b0]
+ mova [esp+0x160], m2
+ mova [esp+0x170], m3
+ mova m6, [esp+0x1c0]
+ mova m7, [esp+0x1d0]
+ mova [esp+0x180], m4
+ mova [esp+0x190], m5
+ punpcklwd m4, m6, m7
+ punpckhwd m6, m7
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m6
+%endif
+ jmp .vloop
+INIT_XMM ssse3
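+; .dy1: the vertical step is exactly one source row, so the filter window
+; simply slides down one line per output row (no vertical resampling).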
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ movzx r5, byte [esp+0x1f0]
+ dec srcd
+ movd m15, r5
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %else
+ %define m11 [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [esp+0x00], m14
+ %define m14 [esp+0x00]
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m2, [srcq+ssq*2]
+ movhps m0, [srcq+ssq*1]
+ movhps m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m10, r4
+ %else
+ mov myd, mym
+ mov r5, [esp+0x1f4]
+ xor r3, r3
+ shr myd, 6
+ lea r5, [r5+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r5*8+0]
+ cmovnz r3, [base+subpel_filters+r5*8+4]
+ %define m10 m4
+ movd m10, r4
+ movd m3, r3
+ mov r3, r3m
+ punpckldq m10, m3
+ %endif
+ movq m1, [srcq+ssq*0]
+ movq m3, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*1]
+ add srcq, ss3q
+ punpcklbw m10, m10
+ psraw m10, 8
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ %else
+ pand m7, m11, m8
+ %define m11 m7
+ %endif
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %if ARCH_X86_64
+ pshufd m8, m10, q0000
+ pshufd m9, m10, q1111
+ pshufd m11, m10, q3333
+ pshufd m10, m10, q2222
+ %else
+ mova [esp+0x10], m15
+ %define m15 [esp+0x10]
+ mov r0, r0m
+ pshufd m5, m4, q0000
+ pshufd m6, m4, q1111
+ pshufd m7, m4, q2222
+ pshufd m4, m4, q3333
+ %define m8 [esp+0x20]
+ %define m9 [esp+0x30]
+ %define m10 [esp+0x40]
+ %define m11 [esp+0x50]
+ mova m8, m5
+ mova m9, m6
+ mova m10, m7
+ mova m11, m4
+ %endif
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12
+ pmulhrsw m1, m12
+ palignr m2, m1, m0, 4
+ pshufd m4, m1, q2121
+ punpcklwd m3, m0, m2 ; 01 12
+ punpckhwd m0, m2 ; 23 34
+ punpcklwd m2, m1, m4 ; 45 56
+.dy1_w2_loop:
+ movq m1, [srcq+ssq*0]
+ movhps m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m3, m8
+ pmaddwd m6, m0, m9
+ pmaddwd m7, m2, m10
+ mova m3, m0
+ mova m0, m2
+ paddd m5, m13
+ paddd m6, m7
+ pshufb m1, m14
+ pmaddubsw m1, m15
+ phaddw m1, m1
+ pmulhrsw m1, m12
+ palignr m7, m1, m4, 12
+ punpcklwd m2, m7, m1 ; 67 78
+ pmaddwd m7, m2, m11
+ mova m4, m1
+ paddd m5, m6
+ paddd m5, m7
+ psrad m5, rndshift
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy1_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+%else
+ movd r1, m15
+ movd r3, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ %define m15 m5
+ SWAP m4, m7
+ movd m15, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m4, [base+subpel_filters+r5*8+2]
+ mov myd, mym
+ mov rX, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea rX, [rX+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+rX*8+0]
+ cmovnz r5, [base+subpel_filters+rX*8+4]
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+%endif
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+ movq m6, [base+subpel_s_shuf2]
+%if ARCH_X86_64
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpcklqdq m6, m6
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ movu m7, [srcq+ssq*2]
+ add srcq, ss3q
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m10, r4q
+ punpcklbw m10, m10
+ psraw m10, 8
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pshufb m7, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ pmaddubsw m7, m15
+ phaddw m0, m1
+ phaddw m2, m3
+ phaddw m4, m5
+ phaddw m6, m7, m7
+ pmulhrsw m0, m12 ; 0 1
+ pmulhrsw m2, m12 ; 2 3
+ pmulhrsw m4, m12 ; 4 5
+ pmulhrsw m6, m12 ; 6 _
+ shufps m1, m0, m2, q1032 ; 1 2
+ shufps m3, m2, m4, q1032 ; 3 4
+ shufps m5, m4, m6, q1032 ; 5 6
+ punpcklwd m7, m0, m1 ; 01
+ punpckhwd m0, m1 ; 12
+ punpcklwd m8, m2, m3 ; 23
+ punpckhwd m2, m3 ; 34
+ punpcklwd m9, m4, m5 ; 45
+ punpckhwd m4, m5 ; 56
+%else
+ pxor m3, m3
+ pcmpeqd m8, m3
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ add srcq, ss3q
+ punpcklqdq m6, m6
+ SWAP m4, m7
+ pand m7, m11, m8
+ pandn m8, m15
+ SWAP m5, m0
+ por m15, m7
+ paddb m14, m6
+ movu m0, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m0, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ mova [esp+0x00], m14
+ mova [esp+0x10], m15
+ pmaddubsw m0, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ phaddw m1, m2
+ movu m2, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ mov r0, r0m
+ phaddw m3, m0
+ pshufb m2, m14
+ pmaddubsw m2, m15
+ %define m14 [esp+0x00]
+ %define m15 [esp+0x10]
+ phaddw m7, m6
+ phaddw m2, m2
+ movd m6, r4
+ movd m0, r5
+ punpckldq m6, m0
+ punpcklbw m6, m6
+ psraw m6, 8
+ mova [esp+0x20], m6
+ pmulhrsw m1, m12 ; 0 1
+ pmulhrsw m3, m12 ; 2 3
+ pmulhrsw m7, m12 ; 4 5
+ pmulhrsw m2, m12 ; 6 _
+ shufps m0, m1, m3, q1032 ; 1 2
+ shufps m4, m3, m7, q1032 ; 3 4
+ shufps m5, m7, m2, q1032 ; 5 6
+ punpcklwd m6, m1, m0 ; 01
+ punpckhwd m1, m0 ; 12
+ mova [esp+0x30], m1
+ punpcklwd m1, m3, m4 ; 23
+ punpckhwd m3, m4 ; 34
+ mova [esp+0x40], m3
+ punpcklwd m3, m7, m5 ; 45
+ punpckhwd m7, m5 ; 56
+ mova [esp+0x50], m7
+ mova [esp+0x60], m2
+ mova m0, [esp+0x20]
+ %xdefine m8 m1
+ %xdefine m9 m3
+ %xdefine m10 m0
+ SWAP m7, m6
+ SWAP m1, m4
+ SWAP m3, m2
+%endif
+ pshufd m1, m10, q0000
+ pshufd m3, m10, q1111
+ pshufd m5, m10, q2222
+ pshufd m10, m10, q3333
+%if ARCH_X86_64
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+%else
+ mova [esp+0x70], m8
+ mova [esp+0x80], m9
+ mova [esp+0x90], m1
+ mova [esp+0xa0], m3
+ mova [esp+0xb0], m5
+ mova [esp+0xc0], m10
+ %ifidn %1, put
+ mov dsd, dsm
+ %endif
+ %define m11 m6
+%endif
+.dy1_w4_loop:
+%if ARCH_X86_64
+ movu m11, [srcq+ssq*0]
+ pmaddwd m7, m1
+ pmaddwd m8, m3
+ pmaddwd m0, m1
+ pmaddwd m2, m3
+ pmaddwd m9, m5
+ pmaddwd m4, m5
+ paddd m7, m8
+ paddd m0, m2
+ movu m8, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m11, m14
+ pmaddubsw m11, m15
+ paddd m7, m13
+ paddd m0, m13
+ paddd m7, m9
+ paddd m0, m4
+ pshufb m8, m14
+ pmaddubsw m8, m15
+ phaddw m11, m8
+ mova m8, [rsp+0x20]
+ pmulhrsw m11, m12
+ punpcklwd m9, m6, m11 ; 67
+ psrldq m6, m11, 8
+ punpcklwd m4, m11, m6 ; 78
+ pmaddwd m2, m9, m10
+ pmaddwd m11, m4, m10
+ paddd m7, m2
+ mova m2, [rsp+0x30]
+ paddd m0, m11
+%else
+ SWAP m7, m6
+ SWAP m1, m4
+ SWAP m3, m2
+ movu m5, [srcq+ssq*0]
+ mova m0, [esp+0x30]
+ mova m2, [esp+0x40]
+ mova m4, [esp+0x50]
+ pmaddwd m6, [esp+0x90]
+ pmaddwd m1, [esp+0xa0]
+ pmaddwd m0, [esp+0x90]
+ pmaddwd m2, [esp+0xa0]
+ pmaddwd m3, [esp+0xb0]
+ pmaddwd m4, [esp+0xb0]
+ paddd m6, m1
+ paddd m0, m2
+ movu m7, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb m5, m14
+ pmaddubsw m5, m15
+ paddd m6, m13
+ paddd m0, m13
+ paddd m6, m3
+ paddd m0, m4
+ pshufb m7, m14
+ pmaddubsw m7, m15
+ phaddw m5, m7
+ mova m7, [rsp+0x80]
+ pmulhrsw m5, m12
+ punpcklwd m3, [esp+0x60], m5 ; 67
+ psrldq m1, m5, 8
+ punpcklwd m4, m5, m1 ; 78
+ pmaddwd m2, m3, [esp+0xc0]
+ pmaddwd m5, m4, [esp+0xc0]
+ mova [esp+0x60], m1
+ paddd m6, m2
+ mova m2, [esp+0x50]
+ paddd m0, m5
+ SWAP m7, m6
+%endif
+ psrad m7, rndshift
+ psrad m0, rndshift
+ packssdw m7, m0
+%if ARCH_X86_64
+ mova m0, [rsp+0x10]
+%else
+ mova m0, [esp+0x40]
+%define m11 m5
+%endif
+%ifidn %1, put
+ packuswb m7, m7
+ psrldq m11, m7, 4
+ movd [dstq+dsq*0], m7
+ movd [dstq+dsq*1], m11
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m7
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jz .ret
+%if ARCH_X86_64
+ mova m7, [rsp+0x00]
+ mova [rsp+0x00], m8
+ mova [rsp+0x10], m2
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m4
+%else
+ mova m7, [esp+0x70] ; 01
+ mova m1, [esp+0x80] ; 23
+ mova m2, [esp+0x50] ; 34
+ mova [esp+0x30], m0
+ mova [esp+0x70], m1
+ mova [esp+0x40], m2
+ mova [esp+0x80], m3
+ mova [esp+0x50], m4
+%endif
+ jmp .dy1_w4_loop
+INIT_XMM ssse3
+.dy1_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy1_w_start
+.dy1_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define m8 m0
+ %define m9 m1
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define ssq ssm
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ sub srcq, 3
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ punpcklbw m5, m5
+ psraw m5, 8
+ SWAP m3, m5
+%endif
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ SWAP m5, m3
+ mov r5, hm
+ mov [esp+0x134], r5
+%endif
+ jmp .dy1_hloop
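+; Strip loop sketch (informal): for w >= 8 the block is processed in 8-pixel
+; wide column strips. [rsp+0x90] holds the strip count set at .dy1_w8..w128,
+; and .dy1_hloop_prep advances the dst/tmp pointer saved at [rsp+0x130] by
+; 8*(isprep+1) bytes, then reloads the saved mx/dx state, source base and row
+; count before re-entering .dy1_hloop for the next strip.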
+.dy1_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [rsp+0x130], 8*(isprep+1)
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%else
+ %define m10 [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy1_hloop:
+ pxor m9, m9
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+ mova m14, [base+unpckw]
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
+ mova m5, [esp+0x1a0]
+ mova m6, [esp+0x1b0]
+ mova m7, [esp+0x1c0]
+ mova m0, [esp+0x1d0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6
+ mova [esp+0x1d0], m7
+ mova m1, [esp+0x060]
+ mova m2, [esp+0x070]
+ mova m3, [esp+0x180]
+ mova m4, [esp+0x190]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x060], m0
+ mova [esp+0x070], m1
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ %define m8 [esp+0x140]
+ %define m9 [esp+0x150]
+ %define m10 [esp+0x160]
+ %define m11 [esp+0x170]
+%endif
+.dy1_vloop:
+%if ARCH_X86_32
+ mov r0, r0m
+%endif
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+%else
+ pmaddwd m6, [rsp+0x1a0], m10
+ pmaddwd m7, [rsp+0x1b0], m10
+%endif
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x70], m11
+ pmaddwd m7, [rsp+0x80], m11
+%else
+ pmaddwd m6, [rsp+0x1c0], m11
+ pmaddwd m7, [rsp+0x1d0], m11
+%endif
+ paddd m4, m6
+ paddd m5, m7
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+%if ARCH_X86_32
+ mov r0m, r0
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+%if ARCH_X86_64
+ movq m4, [srcq+ r4]
+ movq m5, [srcq+ r6]
+ movhps m4, [srcq+ r7]
+ movhps m5, [srcq+ r9]
+ movq m6, [srcq+r10]
+ movq m7, [srcq+r11]
+ movhps m6, [srcq+r13]
+ movhps m7, [srcq+ rX]
+ add srcq, ssq
+ pshufd m15, m14, q1032
+ pshufb m0, m14 ; 0a 1a
+ pshufb m1, m14 ; 0b 1b
+ pshufb m2, m15 ; 3a 2a
+ pshufb m3, m15 ; 3b 2b
+ pmaddubsw m4, [rsp+0x10]
+ pmaddubsw m5, [rsp+0x20]
+ pmaddubsw m6, [rsp+0x30]
+ pmaddubsw m7, [rsp+0x40]
+ phaddw m4, m5
+ phaddw m6, m7
+ phaddw m4, m6
+ pmulhrsw m4, m12
+ pshufb m5, [rsp+0x70], m15 ; 7a 6a
+ pshufb m7, [rsp+0x80], m15 ; 7b 6b
+ pshufb m6, [rsp+0x50], m14 ; 4a 5a
+ pshufb m15, [rsp+0x60], m14 ; 4b 5b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ punpcklwd m2, m6 ; 34a
+ punpcklwd m3, m15 ; 34b
+ punpckhwd m6, m5 ; 56a
+ punpckhwd m15, m7 ; 56b
+ punpcklwd m5, m4 ; 78a
+ psrldq m4, 8
+ punpcklwd m7, m4 ; 78b
+ mova [rsp+0x50], m6
+ mova [rsp+0x60], m15
+ mova [rsp+0x70], m5
+ mova [rsp+0x80], m7
+%else
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova m6, [base+unpckw]
+ mova m0, [esp+0x060]
+ mova m1, [esp+0x070]
+ mova m7, [esp+0x1a0]
+ movq m4, [srcq+r0]
+ movq m5, [srcq+rX]
+ movhps m4, [srcq+r4]
+ movhps m5, [srcq+r5]
+ pshufb m0, m6 ; 0a 1a
+ pshufb m1, m6 ; 0b 1b
+ pshufb m7, m6 ; 4a 5a
+ mov r0, [esp+16]
+ mov rX, [esp+24]
+ mov r4, [esp+20]
+ mov r5, [esp+28]
+ movq m3, [srcq+r0]
+ movq m2, [srcq+rX]
+ movhps m3, [srcq+r4]
+ movhps m2, [srcq+r5]
+ add srcq, ssq
+ pmaddubsw m4, [esp+0x20]
+ pmaddubsw m5, [esp+0x30]
+ pmaddubsw m3, [esp+0x40]
+ pmaddubsw m2, [esp+0x50]
+ phaddw m4, m5
+ phaddw m3, m2
+ mova m5, [esp+0x1b0]
+ mova m2, [esp+0x180]
+ phaddw m4, m3
+ mova m3, [esp+0x190]
+ pmulhrsw m4, m12 ; 8a 8b
+ pshufb m5, m6 ; 4b 5b
+ pshufd m6, m6, q1032
+ pshufb m2, m6 ; 3a 2a
+ pshufb m3, m6 ; 3b 2b
+ punpckhwd m0, m2 ; 12a
+ punpckhwd m1, m3 ; 12b
+ mova [esp+0x60], m0
+ mova [esp+0x70], m1
+ mova m0, [esp+0x1c0]
+ mova m1, [esp+0x1d0]
+ punpcklwd m2, m7 ; 34a
+ punpcklwd m3, m5 ; 34b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ pshufb m0, m6 ; 7a 6a
+ pshufb m1, m6 ; 7b 6b
+ punpckhwd m7, m0 ; 56a
+ punpckhwd m5, m1 ; 56b
+ punpcklwd m0, m4
+ punpckhqdq m4, m4
+ punpcklwd m1, m4
+ mova [esp+0x1a0], m7
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m0
+ mova [esp+0x1d0], m1
+ mova m0, [esp+0x60]
+ mova m1, [esp+0x70]
+%endif
+ jmp .dy1_vloop
+INIT_XMM ssse3
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ %if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+ %else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %define m9 m1
+ %define m14 m4
+ %define m15 m3
+ movzx r5, byte [esp+0x1f0]
+ dec srcd
+ movd m15, r5
+ %endif
+ punpckldq m9, m8
+ SWAP m8, m9
+ paddd m14, m8 ; mx+dx*[0-1]
+ %if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+ %endif
+ pshufd m15, m15, q0000
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ movd r4d, m15
+ psrldq m15, 4
+ %if ARCH_X86_64
+ movd r6d, m15
+ %else
+ movd r3d, m15
+ %endif
+ mova m5, [base+bdct_lb_dw]
+ mova m6, [base+subpel_s_shuf2]
+ movd m15, [base+subpel_filters+r4*8+2]
+ %if ARCH_X86_64
+ movd m7, [base+subpel_filters+r6*8+2]
+ %else
+ movd m7, [base+subpel_filters+r3*8+2]
+ %endif
+ pxor m9, m9
+ pcmpeqd m8, m9
+ psrld m14, 10
+ %if ARCH_X86_32
+ mov r3, r3m
+ pshufb m14, m5
+ paddb m14, m6
+ mova [esp+0x00], m14
+ %define m14 [esp+0x00]
+ SWAP m5, m0
+ SWAP m6, m3
+ %define m8 m5
+ %define m15 m6
+ %endif
+ movq m0, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ movhps m0, [srcq+ssq*2]
+ movhps m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ %if ARCH_X86_64
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movq m10, r4q
+ %else
+ mov myd, mym
+ mov r3, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r3, r3m
+ %define m10 m4
+ movd m10, r4
+ movd m3, r5
+ punpckldq m10, m3
+ %endif
+ movq m3, [srcq+ssq*0]
+ movhps m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m10, m10
+ psraw m10, 8
+ punpckldq m15, m7
+ punpcklqdq m15, m15
+ %if ARCH_X86_64
+ pand m11, m8
+ %else
+ pand m7, m11, m8
+ %define m11 m7
+ %endif
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ %if ARCH_X86_64
+ pshufd m8, m10, q0000
+ pshufd m9, m10, q1111
+ pshufd m11, m10, q3333
+ pshufd m10, m10, q2222
+ %else
+ mova [esp+0x10], m15
+ %define m15 [esp+0x10]
+ mov r5, r0m
+ %define dstq r5
+ mov dsd, dsm
+ pshufd m5, m4, q0000
+ pshufd m6, m4, q1111
+ pshufd m7, m4, q2222
+ pshufd m4, m4, q3333
+ %define m8 [esp+0x20]
+ %define m9 [esp+0x30]
+ %define m10 [esp+0x40]
+ %define m11 [esp+0x50]
+ mova m8, m5
+ mova m9, m6
+ mova m10, m7
+ mova m11, m4
+ %endif
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ pslldq m2, m3, 8
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m2, m0, q3110 ; 0 2 2 4
+ pshufd m1, m1, q3110 ; 1 3 3 5
+ punpcklwd m3, m2, m1 ; 01 23
+ punpckhwd m2, m1 ; 23 45
+.dy2_w2_loop:
+ movq m6, [srcq+ssq*0]
+ movq m7, [srcq+ssq*1]
+ movhps m6, [srcq+ssq*2]
+ movhps m7, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m3, m8
+ pmaddwd m5, m2, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ phaddw m6, m7
+ pmulhrsw m6, m12
+ psrldq m7, m6, 8
+ palignr m6, m0, 8
+ palignr m7, m1, 8
+ mova m0, m6
+ mova m1, m7
+ pshufd m6, m6, q3221
+ pshufd m7, m7, q3221
+ punpcklwd m3, m6, m7 ; 45 67
+ punpckhwd m2, m6, m7 ; 67 89
+ pmaddwd m6, m3, m10
+ pmaddwd m7, m2, m11
+ paddd m4, m5
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ packssdw m4, m4
+ packuswb m4, m4
+ movd r4d, m4
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+INIT_XMM ssse3
+.dy2_w4:
+%if ARCH_X86_64
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %xdefine m14 m4
+ %define m15 m3
+ %define dstq r0
+ %if isprep
+ %define ssq r3
+ %endif
+ movzx r4, byte [esp+0x1f0]
+ dec srcq
+ movd m15, r4
+%endif
+ pmaddwd m8, [base+rescale_mul]
+%if ARCH_X86_64
+ mova m11, [base+pd_0x4000]
+%endif
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd m15, m8
+ psrldq m7, m15, 8
+%if ARCH_X86_64
+ movd r4d, m15
+ movd r11d, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r6d, m15
+ movd r13d, m7
+ movd m15, [base+subpel_filters+ r4*8+2]
+ movd m2, [base+subpel_filters+r11*8+2]
+ movd m3, [base+subpel_filters+ r6*8+2]
+ movd m4, [base+subpel_filters+r13*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+%else
+ movd r1, m15
+ movd r3, m7
+ psrldq m15, 4
+ psrldq m7, 4
+ movd r4, m15
+ movd r5, m7
+ %define m15 m5
+ SWAP m4, m7
+ movd m15, [base+subpel_filters+r1*8+2]
+ movd m2, [base+subpel_filters+r3*8+2]
+ movd m3, [base+subpel_filters+r4*8+2]
+ movd m4, [base+subpel_filters+r5*8+2]
+ movq m6, [base+subpel_s_shuf2]
+ mov myd, mym
+ mov r3, [esp+0x1f4]
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r3, r3m
+ %if isprep
+ lea ss3q, [ssq*3]
+ %endif
+%endif
+ punpckldq m15, m3
+ punpckldq m2, m4
+ punpcklqdq m15, m2
+%if ARCH_X86_64
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu m0, [srcq+ssq*0]
+ movu m2, [srcq+ssq*2]
+ movu m1, [srcq+ssq*1]
+ movu m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ punpcklqdq m6, m6
+ pshufb m14, [base+bdct_lb_dw]
+ movu m4, [srcq+ssq*0]
+ movu m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pand m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m11
+ paddb m14, m6
+ movq m11, r4q
+ punpcklbw m11, m11
+ psraw m11, 8
+ pshufb m0, m14
+ pshufb m2, m14
+ pshufb m1, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb m5, m14
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m1, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m0, m2
+ phaddw m1, m3
+ phaddw m4, m5
+ pmulhrsw m0, m12 ; 0 2
+ pmulhrsw m1, m12 ; 1 3
+ pmulhrsw m4, m12 ; 4 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+%else
+ pxor m3, m3
+ pcmpeqd m8, m3
+ psrld m14, 10
+ pshufb m14, [base+bdct_lb_dw]
+ movu m1, [srcq+ssq*0]
+ movu m2, [srcq+ssq*2]
+ movu m3, [srcq+ssq*1]
+ add srcq, ss3q
+ punpcklqdq m6, m6
+ SWAP m4, m7
+ pand m7, m11, m8
+ pandn m8, m15
+ SWAP m15, m8
+ por m15, m7
+ paddb m14, m6
+ movu m0, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m6, [srcq+ssq*2]
+ add srcq, ss3q
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m0, m14
+ pshufb m7, m14
+ pshufb m6, m14
+ pmaddubsw m1, m15
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ mova [esp+0x00], m14
+ mova [esp+0x10], m15
+ pmaddubsw m0, m15
+ pmaddubsw m7, m15
+ pmaddubsw m6, m15
+ %define m14 [esp+0x00]
+ %define m15 [esp+0x10]
+ phaddw m1, m2
+ phaddw m3, m0
+ phaddw m7, m6
+ %ifidn %1, put
+ mov dsd, dsm
+ %define dstq r5
+ %else
+ %define tmpq r5
+ %endif
+ movd m6, r4
+ movd m0, r5
+ punpckldq m6, m0
+ punpcklbw m6, m6
+ psraw m6, 8
+ mov r5, r0m
+ pmulhrsw m1, m12 ; 0 2
+ pmulhrsw m3, m12 ; 1 3
+ pmulhrsw m7, m12 ; 4 5
+ SWAP m0, m1, m3
+ SWAP m4, m7
+ pshufd m2, m6, q0000
+ pshufd m3, m6, q1111
+ pshufd m7, m6, q2222
+ pshufd m6, m6, q3333
+ mova [esp+0x30], m2
+ mova [esp+0x40], m3
+ mova [esp+0x50], m7
+ mova [esp+0x60], m6
+ %define m8 [esp+0x30]
+ %define m9 [esp+0x40]
+ %define m10 [esp+0x50]
+ %define m11 [esp+0x60]
+%endif
+ psrldq m5, m4, 8 ; 5 _
+ punpckhwd m2, m0, m1 ; 23
+ punpcklwd m0, m1 ; 01
+ punpcklwd m4, m5 ; 45
+.dy2_w4_loop:
+ pmaddwd m0, m8 ; a0
+ pmaddwd m5, m2, m8 ; b0
+ pmaddwd m2, m9 ; a1
+ pmaddwd m7, m4, m9 ; b1
+ pmaddwd m3, m4, m10 ; a2
+ paddd m0, m13
+ paddd m5, m13
+ paddd m0, m2
+ paddd m5, m7
+ paddd m0, m3
+ movu m6, [srcq+ssq*0]
+ movu m7, [srcq+ssq*1]
+ movu m3, [srcq+ssq*2]
+ movu m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m6, m14
+ pshufb m7, m14
+ pshufb m3, m14
+ pshufb m1, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ pmaddubsw m3, m15
+ pmaddubsw m1, m15
+ phaddw m6, m7
+ phaddw m3, m1
+ pmulhrsw m6, m12 ; 6 7
+ pmulhrsw m3, m12 ; 8 9
+ psrldq m7, m6, 8
+ psrldq m1, m3, 8
+ punpcklwd m6, m7 ; 67
+ punpcklwd m3, m1 ; 89
+ mova m2, m6
+ pmaddwd m1, m6, m10 ; b2
+ pmaddwd m6, m11 ; a3
+ pmaddwd m7, m3, m11 ; b3
+ paddd m5, m1
+ paddd m0, m6
+ paddd m5, m7
+ psrad m0, rndshift
+ psrad m5, rndshift
+ packssdw m0, m5
+%ifidn %1, put
+ packuswb m0, m0
+ psrldq m1, m0, 4
+ movd [dstq+dsq*0], m0
+ movd [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], m0
+ add tmpq, 16
+%endif
+ mova m0, m4
+ mova m4, m3
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+INIT_XMM ssse3
+.dy2_w8:
+ mov dword [rsp+0x90], 1
+ movifprep tmp_stridem, 16
+ jmp .dy2_w_start
+.dy2_w16:
+ mov dword [rsp+0x90], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+0x90], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+0x90], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+0x90], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+ mov myd, mym
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+%if ARCH_X86_64
+ shr t0d, 16
+ sub srcq, 3
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movd m15, t0d
+%else
+ %define m10 [base+pd_0x3ff]
+ %define m11 [base+pd_0x4000]
+ %define m8 m0
+ %define m9 m1
+ %xdefine m14 m4
+ %xdefine m15 m3
+ %if isprep
+ %define tmpq r0
+ %define ssq ssm
+ %else
+ %define dstq r0
+ %endif
+ mov r5, [esp+0x1f0]
+ mov r3, [esp+0x1f4]
+ shr r5, 16
+ sub srcq, 3
+ movd m15, r5
+ xor r5, r5
+ shr myd, 6
+ lea r3, [r3+myd]
+ mov r4, 64 << 24
+ cmovnz r4, [base+subpel_filters+r3*8+0]
+ cmovnz r5, [base+subpel_filters+r3*8+4]
+ mov r0, r0m
+ mov r3, r3m
+%endif
+ pslld m7, m8, 2 ; dx*4
+ pmaddwd m8, [base+rescale_mul] ; dx*[0-3]
+ pshufd m15, m15, q0000
+ paddd m14, m8 ; mx+dx*[0-3]
+%if ARCH_X86_64
+ movq m3, r4q
+ punpcklbw m3, m3
+ psraw m3, 8
+%else
+ movd m5, r4
+ movd m6, r5
+ punpckldq m5, m6
+ punpcklbw m5, m5
+ psraw m5, 8
+ SWAP m3, m5
+%endif
+ mova [rsp+0x100], m7
+ mova [rsp+0x120], m15
+ mov [rsp+0x098], srcq
+ mov [rsp+0x130], r0q ; dstq / tmpq
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova [rsp+0x140], m0
+ mova [rsp+0x150], m1
+ mova [rsp+0x160], m2
+ mova [rsp+0x170], m3
+%if ARCH_X86_64 && UNIX64
+ mov hm, hd
+%elif ARCH_X86_32
+ SWAP m5, m3
+ mov r5, hm
+ mov [esp+0x134], r5
+%endif
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+0x090]
+ jz .ret
+%if ARCH_X86_64
+ add qword [rsp+0x130], 8*(isprep+1)
+ mov hd, hm
+%else
+ add dword [rsp+0x130], 8*(isprep+1)
+ mov r5, [esp+0x134]
+ mov r0, [esp+0x130]
+%endif
+ mova m7, [rsp+0x100]
+ mova m14, [rsp+0x110]
+%if ARCH_X86_64
+ mova m10, [base+pd_0x3ff]
+%else
+ %define m10 [base+pd_0x3ff]
+%endif
+ mova m15, [rsp+0x120]
+ mov srcq, [rsp+0x098]
+%if ARCH_X86_64
+ mov r0q, [rsp+0x130] ; dstq / tmpq
+%else
+ mov hm, r5
+ mov r0m, r0
+ mov r3, r3m
+%endif
+ paddd m14, m7
+.dy2_hloop:
+ pxor m9, m9
+%if ARCH_X86_64
+ mova m11, [base+pq_0x40000000]
+%else
+ %define m11 [base+pq_0x40000000]
+%endif
+ psrld m2, m14, 10
+ mova [rsp], m2
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m5, m15, m6
+ pcmpeqd m6, m9
+ psrldq m2, m5, 8
+%if ARCH_X86_64
+ movd r4d, m5
+ movd r6d, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r7d, m5
+ movd r9d, m2
+ movq m0, [base+subpel_filters+r4*8]
+ movq m1, [base+subpel_filters+r6*8]
+ movhps m0, [base+subpel_filters+r7*8]
+ movhps m1, [base+subpel_filters+r9*8]
+%else
+ movd r0, m5
+ movd rX, m2
+ psrldq m5, 4
+ psrldq m2, 4
+ movd r4, m5
+ movd r5, m2
+ movq m0, [base+subpel_filters+r0*8]
+ movq m1, [base+subpel_filters+rX*8]
+ movhps m0, [base+subpel_filters+r4*8]
+ movhps m1, [base+subpel_filters+r5*8]
+ pxor m2, m2
+ %define m9 m2
+%endif
+ paddd m14, m7 ; mx+dx*[4-7]
+ pand m5, m14, m10
+ psrld m5, 6
+ paddd m15, m5
+ pcmpeqd m5, m9
+ mova [rsp+0x110], m14
+ psrldq m4, m15, 8
+%if ARCH_X86_64
+ movd r10d, m15
+ movd r11d, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r13d, m15
+ movd rXd, m4
+ movq m2, [base+subpel_filters+r10*8]
+ movq m3, [base+subpel_filters+r11*8]
+ movhps m2, [base+subpel_filters+r13*8]
+ movhps m3, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ psrldq m4, m14, 8
+ movd r10d, m14
+ movd r11d, m4
+ psrldq m14, 4
+ psrldq m4, 4
+ movd r13d, m14
+ movd rXd, m4
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m8, m11, m4
+ pand m9, m11, m6
+ pand m15, m11, m7
+ pand m11, m11, m5
+ pandn m4, m0
+ pandn m6, m1
+ pandn m7, m2
+ pandn m5, m3
+ por m8, m4
+ por m9, m6
+ por m15, m7
+ por m11, m5
+ mova [rsp+0x10], m8
+ mova [rsp+0x20], m9
+ mova [rsp+0x30], m15
+ mova [rsp+0x40], m11
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
+ mova [rsp+0x50], m1
+ mova [rsp+0x60], m2
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
+ mova [rsp+0x70], m3
+ mova [rsp+0x80], m4
+ MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
+ MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
+ SWAP m7, m0
+ SWAP m8, m14
+ mova m1, [rsp+0x50]
+ mova m2, [rsp+0x60]
+ mova m3, [rsp+0x70]
+ mova m15, [rsp+0x80]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m8 ; 67a
+ punpckhwd m7, m8 ; 67b
+ SWAP m14, m8
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m15; 23a
+ punpckhwd m3, m15 ; 23b
+ mova [rsp+0x50], m4
+ mova [rsp+0x60], m5
+ mova [rsp+0x70], m6
+ mova [rsp+0x80], m7
+%else
+ movd r0, m15
+ movd rX, m4
+ psrldq m15, 4
+ psrldq m4, 4
+ movd r4, m15
+ movd r5, m4
+ mova m14, [esp+0x110]
+ movq m2, [base+subpel_filters+r0*8]
+ movq m3, [base+subpel_filters+rX*8]
+ movhps m2, [base+subpel_filters+r4*8]
+ movhps m3, [base+subpel_filters+r5*8]
+ psrld m14, 10
+ mova [esp+16], m14
+ mov r0, [esp+ 0]
+ mov rX, [esp+ 8]
+ mov r4, [esp+ 4]
+ mov r5, [esp+12]
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ pshufd m4, m6, q1100
+ pshufd m6, m6, q3322
+ pshufd m7, m5, q1100
+ pshufd m5, m5, q3322
+ pand m0, m11, m4
+ pand m1, m11, m6
+ pand m2, m11, m7
+ pand m3, m11, m5
+ pandn m4, [esp+0x20]
+ pandn m6, [esp+0x30]
+ pandn m7, [esp+0x40]
+ pandn m5, [esp+0x50]
+ por m0, m4
+ por m1, m6
+ por m2, m7
+ por m3, m5
+ mova [esp+0x20], m0
+ mova [esp+0x30], m1
+ mova [esp+0x40], m2
+ mova [esp+0x50], m3
+ MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1
+ MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3
+ MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5
+ MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7
+ mova m5, [esp+0x1a0]
+ mova m6, [esp+0x1b0]
+ mova m7, [esp+0x1c0]
+ mova m0, [esp+0x1d0]
+ punpcklwd m4, m5, m6 ; 45a
+ punpckhwd m5, m6 ; 45b
+ punpcklwd m6, m7, m0 ; 67a
+ punpckhwd m7, m0 ; 67b
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6
+ mova [esp+0x1d0], m7
+ mova m1, [esp+0x060]
+ mova m2, [esp+0x070]
+ mova m3, [esp+0x180]
+ mova m4, [esp+0x190]
+ punpcklwd m0, m1, m2 ; 01a
+ punpckhwd m1, m2 ; 01b
+ punpcklwd m2, m3, m4 ; 23a
+ punpckhwd m3, m4 ; 23b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ %define m8 [esp+0x140]
+ %define m9 [esp+0x150]
+ %define m10 [esp+0x160]
+ %define m11 [esp+0x170]
+%endif
+.dy2_vloop:
+%if ARCH_X86_32
+ mov r0, r0m
+%endif
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ pmaddwd m7, m3, m9
+ paddd m4, m13
+ paddd m5, m13
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x50], m10
+ pmaddwd m7, [rsp+0x60], m10
+%else
+ pmaddwd m6, [esp+0x1a0], m10
+ pmaddwd m7, [esp+0x1b0], m10
+%endif
+ paddd m4, m6
+ paddd m5, m7
+%if ARCH_X86_64
+ pmaddwd m6, [rsp+0x70], m11
+ pmaddwd m7, [rsp+0x80], m11
+%else
+ pmaddwd m6, [esp+0x1c0], m11
+ pmaddwd m7, [esp+0x1d0], m11
+%endif
+ paddd m4, m6
+ paddd m5, m7
+ psrad m4, rndshift
+ psrad m5, rndshift
+ packssdw m4, m5
+%ifidn %1, put
+ packuswb m4, m4
+ movq [dstq], m4
+ add dstq, dsm
+%else
+ mova [tmpq], m4
+ add tmpq, tmp_stridem
+%endif
+%if ARCH_X86_32
+ mov r0m, r0
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+%if ARCH_X86_64
+ mova m8, [rsp+0x10]
+ mova m9, [rsp+0x20]
+ mova m10, [rsp+0x30]
+ mova m11, [rsp+0x40]
+ mova m0, m2 ; 01a
+ mova m1, m3 ; 01b
+ MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
+ mova m3, [rsp+0x50] ; 23a
+ mova m4, [rsp+0x60] ; 23b
+ mova m5, [rsp+0x70] ; 45a
+ mova m7, [rsp+0x80] ; 45b
+ mova m8, [rsp+0x140]
+ mova m9, [rsp+0x150]
+ mova m10, [rsp+0x160]
+ mova m11, [rsp+0x170]
+ punpcklwd m14, m2, m6 ; 67a
+ punpckhwd m2, m6 ; 67b
+ mova [rsp+0x50], m5
+ mova [rsp+0x60], m7
+ mova [rsp+0x70], m14
+ mova [rsp+0x80], m2
+ mova m2, m3
+ mova m3, m4
+%else
+ MC_8TAP_SCALED_H 0x20, 0
+ punpcklwd m6, m0, m4
+ punpckhwd m7, m0, m4
+ mova m0, [esp+0x180] ; 01a
+ mova m1, [esp+0x190] ; 01b
+ mova m2, [rsp+0x1a0] ; 23a
+ mova m3, [esp+0x1b0] ; 23b
+ mova m4, [esp+0x1c0] ; 45a
+ mova m5, [esp+0x1d0] ; 45b
+ mova [esp+0x180], m2
+ mova [esp+0x190], m3
+ mova [esp+0x1a0], m4
+ mova [esp+0x1b0], m5
+ mova [esp+0x1c0], m6 ; 67a
+ mova [esp+0x1d0], m7 ; 67b
+%endif
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
+ %define r0m [rstk+stack_offset+ 4]
+ %define r1m [rstk+stack_offset+ 8]
+ %define r2m [rstk+stack_offset+12]
+ %define r3m [rstk+stack_offset+16]
+%endif
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled_8bpc
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, (5*15 << 16) | 5*15
+ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 8
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN put
+FN put_8tap_scaled, sharp, SHARP, SHARP
+FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN put_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN put_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN put_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN put_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%elif ARCH_X86_64
+DECLARE_REG_TMP 6, 7
+%else
+DECLARE_REG_TMP 1, 2
+%endif
+BILIN_SCALED_FN prep
+FN prep_8tap_scaled, sharp, SHARP, SHARP
+FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH
+FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP
+FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH
+FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR
+FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP
+FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR
+FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
+FN prep_8tap_scaled, regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%if ARCH_X86_32
+ %macro SAVE_ALPHA_BETA 0
+ mov alpham, alphad
+ mov betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0
+ mov deltam, deltad
+ mov gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0
+ mov mym, myd
+ mov alphad, alpham
+ mov betad, betam
+ mov mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0
+ mov mxm, mxd
+ mov deltad, deltam
+ mov gammad, gammam
+ mov myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ %define PIC_sym(sym) sym
+%endif
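+
+; Informal note on the helpers above: x86-32 does not have enough spare GPRs
+; to keep alpha/beta/delta/gamma live across the horizontal and vertical
+; filter steps, so SAVE_*/LOAD_* spill them to the stack slots declared in
+; WARP_AFFINE_8X8 and reload them around each call; on x86-64 the SAVE_*
+; macros expand to nothing and PIC_sym(sym) is simply sym.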
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 8*4
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 0
+ %if copy_args
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r3, r3m
+ mov r5, r5m
+ mov dstm, r0
+ mov dsm, r1
+ mov srcm, r2
+ mov ssm, r3
+ mov mxm, r5
+ mov r0, r6m
+ mov mym, r0
+ %endif
+%endmacro
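+
+; RELOC_ARGS only does work when copy_args is nonzero, i.e. when the stack
+; must be re-aligned and the incoming argument offsets would otherwise shift;
+; copying the arguments once into the fixed dstm/dsm/srcm/ssm/mxm/mym slots
+; lets the rest of the code address them regardless of that adjustment.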
+
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
+ %if cpuflag(sse4)
+ pblendw %1, %2, 0xAA
+ %else
+ pand %2, m10
+ por %1, %2
+ %endif
+%endmacro
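+
+; BLENDHWDW keeps the low word of each dword of %1 and takes the high word
+; from %2 (pblendw with mask 0xAA selects the odd words). Without SSE4 this
+; is emulated with pand/por against m10, which the callers preload with the
+; 0xffff0000-per-dword blendmask; that relies on the high words of %1 already
+; being zero, which holds because every caller builds %1 with a psrld by 16.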
+
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+ %if ARCH_X86_32
+ %define m8 m4
+ %define m9 m5
+ %define m14 m6
+ %define m15 m7
+ %define m11 m7
+ %endif
+ %if notcpuflag(ssse3) || ARCH_X86_32
+ pxor m11, m11
+ %endif
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m2, [filterq+myq *8] ; a
+ movq m8, [filterq+tmp1q*8] ; e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; b
+ movq m0, [filterq+tmp1q*8] ; f
+ punpcklwd m2, m3
+ punpcklwd m8, m0
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m0, [filterq+myq *8] ; c
+ movq m9, [filterq+tmp1q*8] ; g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; d
+ movq m1, [filterq+tmp1q*8] ; h
+ punpcklwd m0, m3
+ punpcklwd m9, m1
+ punpckldq m1, m2, m0
+ punpckhdq m2, m0
+ punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m0, %3
+ pmaddwd m3, %5
+ pmaddwd m1, %7
+ pmaddwd m14, %9
+ paddd m0, m3
+ paddd m1, m14
+ paddd m0, m1
+ mova %1, m0
+ %if ARCH_X86_64
+ SWAP m3, m14
+ %endif
+ punpckldq m0, m8, m9
+ punpckhdq m8, m9
+ punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+ punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+ punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+ punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+ pmaddwd m1, %4
+ pmaddwd m14, %6
+ pmaddwd m2, %8
+ pmaddwd m15, %10
+ paddd m1, m14
+ paddd m2, m15
+ paddd m1, m2
+ mova %2, m1
+ %if ARCH_X86_64
+ SWAP m14, m3
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+ %define counterd dword r4m
+ %else
+ %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+%macro WARP_AFFINE_8X8T 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+ %define tmpm [esp+stack_size-4*1]
+ %define tsm [esp+stack_size-4*2]
+ %endif
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(ssse3)
+ psrad m12, 13
+ psrad m13, 13
+ psrad m14, 13
+ psrad m15, 13
+ packssdw m12, m13
+ packssdw m14, m15
+ mova m13, [PIC_sym(pw_8192)]
+ pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
+ pmulhrsw m14, m13
+%else
+ %if ARCH_X86_32
+ %define m10 m0
+ %endif
+ mova m10, [PIC_sym(pd_16384)]
+ paddd m12, m10
+ paddd m13, m10
+ paddd m14, m10
+ paddd m15, m10
+ psrad m12, 15
+ psrad m13, 15
+ psrad m14, 15
+ psrad m15, 15
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ mova [tmpq+tsq*0], m12
+ mova [tmpq+tsq*2], m14
+ dec counterd
+ jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
+%if ARCH_X86_32
+ mov tmpm, tmpd
+ mov r0, [esp+0x100]
+ mov r1, [esp+0x104]
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+%endmacro
+
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+%else
+cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+ %define alphaq r0
+ %define alphad r0
+ %define alpham [esp+gprsize+0x100]
+ %define betaq r1
+ %define betad r1
+ %define betam [esp+gprsize+0x104]
+ %define deltaq r0
+ %define deltad r0
+ %define deltam [esp+gprsize+0x108]
+ %define gammaq r1
+ %define gammad r1
+ %define gammam [esp+gprsize+0x10C]
+ %define filterq r3
+ %define tmp1q r4
+ %define tmp1d r4
+ %define tmp1m [esp+gprsize+0x110]
+ %define myq r5
+ %define myd r5
+ %define mym r6m
+ %if copy_args
+ %define dstm [esp+stack_size-4*1]
+ %define dsm [esp+stack_size-4*2]
+ %define srcm [esp+stack_size-4*3]
+ %define ssm [esp+stack_size-4*4]
+ %define mxm [esp+stack_size-4*5]
+ %define mym [esp+stack_size-4*6]
+ %endif
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_32
+ mov dstm, dstd
+ mov alphad, [esp+0x100]
+ mov betad, [esp+0x104]
+%endif
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+%if notcpuflag(sse4)
+ %if cpuflag(ssse3)
+ %define roundval pw_8192
+ %else
+ %define roundval pd_262144
+ %endif
+ %if ARCH_X86_64
+ mova m10, [PIC_sym(roundval)]
+ %else
+ %define m10 [PIC_sym(roundval)]
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m12 m5
+ %define m13 m6
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+%endif
+%if cpuflag(sse4)
+ %if ARCH_X86_32
+ %define m11 m4
+ pxor m11, m11
+ %endif
+ psrad m12, 18
+ psrad m13, 18
+ packusdw m12, m13
+ pavgw m12, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m12, 17
+ psrad m13, 17
+ packssdw m12, m13
+ pmulhrsw m12, m10
+ %else
+ paddd m12, m10
+ paddd m13, m10
+ psrad m12, 19
+ psrad m13, 19
+ packssdw m12, m13
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m14 m6
+ %define m15 m7
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(sse4)
+ psrad m14, 18
+ psrad m15, 18
+ packusdw m14, m15
+ pavgw m14, m11 ; (x + (1 << 10)) >> 11
+%else
+ %if cpuflag(ssse3)
+ psrad m14, 17
+ psrad m15, 17
+ packssdw m14, m15
+ pmulhrsw m14, m10
+ %else
+ paddd m14, m10
+ paddd m15, m10
+ psrad m14, 19
+ psrad m15, 19
+ packssdw m14, m15
+ %endif
+%endif
+ packuswb m12, m14
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ dec counterd
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+%assign stack_offset stack_offset+gprsize
+%if ARCH_X86_32
+ %assign stack_size stack_size+4
+ %if copy_args
+ %assign stack_offset stack_offset-4
+ %endif
+ RELOC_ARGS
+ LEA PIC_reg, $$
+ %define PIC_mem [esp+gprsize+0x114]
+ mov abcdd, abcdm
+ %if copy_args == 0
+ mov ssd, ssm
+ mov mxd, mxm
+ %endif
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+ SAVE_DELTA_GAMMA
+%if ARCH_X86_32
+ mov abcdd, abcdm
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+%if ARCH_X86_32
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%endif
+ sub betad, tmp2d ; beta -= alpha*3
+ lea filterq, [PIC_sym(mc_warp_filter2)]
+%if ARCH_X86_64
+ mov myd, r6m
+ %if cpuflag(ssse3)
+ pxor m11, m11
+ %endif
+%endif
+ call .h
+ psrld m2, m0, 16
+ psrld m3, m1, 16
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova [esp+gprsize+0x00], m2
+ %endif
+ mova [esp+gprsize+0x10], m3
+%endif
+ call .h
+ psrld m4, m0, 16
+ psrld m5, m1, 16
+%if ARCH_X86_32
+ mova [esp+gprsize+0x20], m4
+ mova [esp+gprsize+0x30], m5
+%endif
+ call .h
+%if ARCH_X86_64
+ %define blendmask [rsp+gprsize+0x80]
+%else
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define blendmask [esp+gprsize+0x120]
+ %define m10 m7
+%endif
+ pcmpeqd m10, m10
+ pslld m10, 16
+ mova blendmask, m10
+ BLENDHWDW m2, m0 ; 0
+ BLENDHWDW m3, m1 ; 2
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 1
+ BLENDHWDW m5, m1 ; 3
+ mova [rsp+gprsize+0x20], m4
+ mova [rsp+gprsize+0x30], m5
+ call .h
+%if ARCH_X86_32
+ %if notcpuflag(ssse3)
+ mova m2, [esp+gprsize+0x00]
+ %endif
+ mova m3, [esp+gprsize+0x10]
+ %define m10 m5
+%endif
+ psrld m6, m2, 16
+ psrld m7, m3, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 2
+ BLENDHWDW m7, m1 ; 4
+ mova [rsp+gprsize+0x40], m6
+ mova [rsp+gprsize+0x50], m7
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ psrld m2, m4, 16
+ psrld m3, m5, 16
+ mova m10, blendmask
+ BLENDHWDW m2, m0 ; 3
+ BLENDHWDW m3, m1 ; 5
+ mova [rsp+gprsize+0x60], m2
+ mova [rsp+gprsize+0x70], m3
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x40]
+ mova m7, [esp+gprsize+0x50]
+ %define m10 m7
+%endif
+ psrld m4, m6, 16
+ psrld m5, m7, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 4
+ BLENDHWDW m5, m1 ; 6
+%if ARCH_X86_64
+ add myd, 512+(64<<10)
+ mova m6, m2
+ mova m7, m3
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ add dword mym, 512+(64<<10)
+%endif
+ mov counterd, 4
+ SAVE_ALPHA_BETA
+.main2:
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x60]
+ mova m7, [esp+gprsize+0x70]
+ %define m10 m5
+%endif
+ psrld m6, 16
+ psrld m7, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 5
+ BLENDHWDW m7, m1 ; 7
+%if ARCH_X86_64
+ WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5, \
+ [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7
+%else
+ mova [esp+gprsize+0xA0], m6
+ mova [esp+gprsize+0xB0], m7
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0]
+ LOAD_ALPHA_BETA_MX
+%endif
+ call .h
+ mova m2, [rsp+gprsize+0x40]
+ mova m3, [rsp+gprsize+0x50]
+%if ARCH_X86_32
+ mova m4, [rsp+gprsize+0x80]
+ mova m5, [rsp+gprsize+0x90]
+ %define m10 m7
+%endif
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ mova [rsp+gprsize+0x40], m4
+ mova [rsp+gprsize+0x50], m5
+ psrld m4, 16
+ psrld m5, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 6
+ BLENDHWDW m5, m1 ; 8
+%if ARCH_X86_64
+ WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7, \
+ [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90]
+ mov mym, myd
+ mov dstd, dstm
+ mov dsd, dsm
+ mov mxd, mxm
+%endif
+ mova m2, [rsp+gprsize+0x60]
+ mova m3, [rsp+gprsize+0x70]
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0xA0]
+ mova m7, [esp+gprsize+0xB0]
+%endif
+ mova [rsp+gprsize+0x20], m2
+ mova [rsp+gprsize+0x30], m3
+ mova [rsp+gprsize+0x60], m6
+ mova [rsp+gprsize+0x70], m7
+ ret
+ALIGN function_align
+.h:
+%if ARCH_X86_32
+ %define m8 m3
+ %define m9 m4
+ %define m10 m5
+ %define m14 m6
+ %define m15 m7
+%endif
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+%if ARCH_X86_32
+ %assign stack_offset stack_offset+4
+ %assign stack_size stack_size+4
+ %define PIC_mem [esp+gprsize*2+0x114]
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movu m10, [srcq]
+%if ARCH_X86_32
+ add srcd, ssm
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%else
+ add srcq, ssq
+%endif
+ shr mxd, 10
+ shr tmp1d, 10
+ movq m1, [filterq+mxq *8] ; 0 X
+ movq m8, [filterq+tmp1q*8] ; 4 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m1, [filterq+tmp2q*8] ; 0 1
+ movhps m8, [filterq+tmp1q*8] ; 4 5
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+%if cpuflag(ssse3)
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
+ pmaddubsw m0, m1
+ pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
+ pmaddubsw m1, m8
+ pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
+ pmaddubsw m15, m14
+ pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
+ pmaddubsw m10, m9
+ phaddw m0, m15
+ phaddw m1, m10
+%else
+ %if ARCH_X86_32
+ %define m11 m2
+ %endif
+ pcmpeqw m0, m0
+ psrlw m14, m0, 8
+ psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
+ pand m14, m10 ; 00 02 04 06 08 10 12 14
+ packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
+ psrldq m9, m0, 4
+ pshufd m0, m14, q0220
+ pand m0, m9
+ psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
+ pslldq m15, m14, 12
+ por m0, m15 ; shufA
+ psrlw m15, m0, 8
+ psraw m11, m1, 8
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m0, 8
+ psraw m1, 8
+ pmullw m15, m11
+ pmullw m0, m1
+ paddw m0, m15 ; pmaddubsw m0, m1
+ pshufd m15, m14, q0220
+ pand m15, m9
+ psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
+ pslldq m1, m14, 12
+ por m15, m1 ; shufC
+ pshufd m1, m14, q0220
+ pand m1, m9
+ psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
+ pslldq m11, m14, 12
+ por m1, m11 ; shufB
+ pshufd m10, m14, q0220
+ pand m10, m9
+ psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
+ pslldq m14, m14, 12
+ por m10, m14 ; shufD
+ psrlw m9, m1, 8
+ psraw m11, m8, 8
+ psllw m1, 8
+ psllw m8, 8
+ psrlw m1, 8
+ psraw m8, 8
+ pmullw m9, m11
+ pmullw m1, m8
+ paddw m1, m9 ; pmaddubsw m1, m8
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ psrlw m8, m15, 8
+ psraw m11, m14, 8
+ psllw m15, 8
+ psllw m14, 8
+ psrlw m15, 8
+ psraw m14, 8
+ pmullw m8, m11
+ pmullw m15, m14
+ paddw m15, m8 ; pmaddubsw m15, m14
+ psrlw m8, m10, 8
+ psraw m11, m9, 8
+ psllw m10, 8
+ psllw m9, 8
+ psrlw m10, 8
+ psraw m9, 8
+ pmullw m8, m11
+ pmullw m10, m9
+ paddw m10, m8 ; pmaddubsw m10, m9
+ pslld m8, m0, 16
+ pslld m9, m1, 16
+ pslld m14, m15, 16
+ pslld m11, m10, 16
+ paddw m0, m8
+ paddw m1, m9
+ paddw m15, m14
+ paddw m10, m11
+ psrad m0, 16
+ psrad m1, 16
+ psrad m15, 16
+ psrad m10, 16
+ packssdw m0, m15 ; phaddw m0, m15
+ packssdw m1, m10 ; phaddw m1, m10
+%endif
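+ ; Sketch of the fallback above (pre-SSSE3): each "pmaddubsw" is emulated by
+ ; splitting both operands into even and odd byte lanes (source bytes zero-
+ ; extended, coefficient bytes sign-extended), doing two pmullw and summing
+ ; the products; each "phaddw" is emulated by the pslld/paddw/psrad/packssdw
+ ; sequence at the end, and the shufA-shufD orderings are rebuilt with shifts
+ ; and pand/por since pshufb is unavailable.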
+ mova m14, [PIC_sym(pw_8192)]
+ mova m9, [PIC_sym(pd_32768)]
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ pmaddwd m1, m14
+ paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
+ paddd m1, m9
+ ret
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w4: ; four 4-pixel rows per iteration
+ movd [dstq ], m0 ; store dw[0]
+ pshuflw m1, m0, q1032 ; bring dw[1] into the low dword
+ movd [dstq+strideq*1], m1 ; store dw[1]
+ punpckhqdq m0, m0 ; move dw[3:2] into the low half
+ movd [dstq+strideq*2], m0 ; store dw[2]
+ psrlq m0, 32 ; shift dw[3] down
+ movd [dstq+stride3q ], m0 ; store dw[3]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq ], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq]
+.w16:
+ mova [dstq ], m0
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq]
+.w32:
+ mova [dstq ], m0
+ %1 2
+ mova [dstq + 16 ], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 8
+ %1 0
+ add dstq, strideq
+.w64:
+ %assign i 0
+ %rep 4
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 4
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1_INC_PTR 16
+ %1 0
+ add dstq, strideq
+.w128:
+ %assign i 0
+ %rep 8
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 8
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ ; writes the average of the tmp1/tmp2 int16 intermediates as uint8 pixels
+ mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 words from tmp1
+ paddw m0, [tmp2q+(%1+0)*mmsize] ; add the matching 8 words from tmp2
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1 ; pack the 16-bit results from m0 & m1 to 8 bits with unsigned saturation
+%endmacro
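+
+; Rounding sketch for AVG (assuming the 8bpc prep intermediates carry 4 extra
+; fractional bits): pmulhrsw with pw_1024 computes (x*1024 + (1 << 14)) >> 15,
+; i.e. (x + 16) >> 5, so the macro as a whole yields (tmp1 + tmp2 + 16) >> 5
+; per pixel.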
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, avg_ssse3_table
+ tzcnt wd, wm ; trailing zero count of the block width (log2 of w for powers of two)
+ movifnidn hd, hm ; load h from the stack if it is not already in a register
+ movsxd wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this block width into wq
+ mova m2, [pw_1024+r6-avg_ssse3_table] ; pw_1024: rounding constant for the pmulhrsw in AVG
+ add wq, r6
+ BIDIR_FN AVG
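+
+; Dispatch sketch: wq = tzcnt(w) indexes a dword in avg_ssse3_table whose
+; value is the offset of the matching .w4/.w8/.../.w128 entry point relative
+; to the table, so "add wq, r6" forms the absolute address that the "jmp wq"
+; inside BIDIR_FN branches to.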
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+ mova m2, [tmp1q+(%1+0)*mmsize]
+ mova m0, m2
+ psubw m2, [tmp2q+(%1+0)*mmsize]
+ mova m3, [tmp1q+(%1+1)*mmsize]
+ mova m1, m3
+ psubw m3, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
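+
+; Worked example for the W_AVG identity above (illustrative numbers, not from
+; the source): a = 600, b = 200, weight = 12.
+;   reference: (600*12 + 200*4 + 128) >> 8 = 8128 >> 8 = 31
+;   simd path: pmulhw(a-b, (weight-16)<<12) = (400 * -16384) >> 16 = -100
+;              a + (-100) = 500; pmulhrsw(500, pw_2048) = (500 + 8) >> 4 = 31
+; For weight <= 7 the tmp pointers are swapped and m4 is negated, which turns
+; the same instruction sequence into the fourth form of the identity.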
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ LEA r6, w_avg_ssse3_table
+ tzcnt wd, wm
+ movd m4, r6m
+ movifnidn hd, hm
+ pxor m0, m0
+ movsxd wq, dword [r6+wq*4]
+ mova m5, [pw_2048+r6-w_avg_ssse3_table]
+ pshufb m4, m0
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ psubw m0, m4
+ mov tmp1q, tmp2q
+ mova m4, m0 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+ mova m3, [maskq+(%1+0)*(mmsize/2)]
+ mova m0, [tmp2q+(%1+0)*mmsize] ; b
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
+ mova m6, m3 ; m
+ psubb m3, m4, m6 ; -m
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3 ; -m << 1
+ punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
+ pmulhw m1, m2 ; (-m * (b - a)) << 10
+ paddw m0, m1 ; + b
+ mova m1, [tmp2q+(%1+1)*mmsize] ; b
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
+ paddw m2, m2 ; (b - a) << 1
+ mova m6, m3 ; (-m << 1)
+ punpckhbw m3, m4, m6 ; (-m << 9)
+ pmulhw m2, m3 ; (-m * (b - a)) << 10
+ paddw m1, m2 ; + b
+ pmulhrsw m0, m5 ; round
+ pmulhrsw m1, m5 ; round
+ packuswb m0, m1 ; pack words to bytes with unsigned saturation
+%endmacro
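+
+; Worked example for the MASK identity above (illustrative numbers): a = 600,
+; b = 200, m = 40.
+;   reference: (600*40 + 200*24 + 512) >> 10 = 29312 >> 10 = 28
+;   simd path: (b-a)<<1 = -800, -m<<9 = -20480,
+;              pmulhw(-800, -20480) = 16384000 >> 16 = 250,
+;              b + 250 = 450; pmulhrsw(450, pw_2048) = (450 + 8) >> 4 = 28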
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*mmsize/2
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+%if ARCH_X86_64
+cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ movifnidn hd, hm
+%else
+cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
+%define hd dword r5m
+%endif
+%define base r6-mask_ssse3_table
+ LEA r6, mask_ssse3_table
+ tzcnt wd, wm
+ movsxd wq, dword [r6+wq*4]
+ pxor m4, m4
+ mova m5, [base+pw_2048]
+ add wq, r6
+ mov maskq, r6m
+ BIDIR_FN MASK
+%undef hd
+
+%macro W_MASK_420_END 1-*
+%rep %0
+ call .main
+ paddw m2, [maskq+16*%1]
+ mova [maskq+16*%1], m2
+ mova [dstq+strideq*1+16*(2*%1+0)], m0
+ call .main
+ psubw m3, m7, m2
+ psubw m1, m7, [maskq+16*%1]
+ psubw m3, [dstq+strideq*1+16*(2*%1+1)]
+ psrlw m1, 2
+ psrlw m3, 2
+ packuswb m1, m3
+ mova [maskq+16*%1], m1
+ mova [dstq+strideq*1+16*(2*%1+1)], m0
+ %rotate 1
+%endrep
+%endmacro
+
+%if UNIX64
+DECLARE_REG_TMP 7
+%else
+DECLARE_REG_TMP 5
+%endif
+
+cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_420_ssse3_table
+ LEA t0, w_mask_420_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_2048]
+ movddup m7, [base+wm_420_sign+r6*8] ; 258 - sign
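+ ; 258 = 4*64 + 2: the stored 4:2:0 mask works out to
+ ; (m0 + m1 + m2 + m3 + 2 - sign) >> 2, computed below as
+ ; (258 - sign minus the four (64 - m) values) >> 2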
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ movifnidn hd, hm
+%else
+ %define m8 [base+pw_6903]
+ %define hd dword hm
+%endif
+ mov maskq, maskmp
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ add maskq, 4
+ lea dstq, [dstq+strideq*2]
+.w4:
+ pshufd m3, m2, q2020
+ pshufd m2, m2, q3131
+ psubw m1, m7, m3
+ psubw m1, m2
+ psrlw m1, 2
+ packuswb m1, m1
+ movd [maskq], m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ add maskq, 4
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movhlps m3, m2
+ psubw m1, m7, m2
+ psubw m1, m3
+ psrlw m1, 2
+ packuswb m1, m1
+ movd [maskq], m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ add maskq, 8
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*0], m0
+ call .main
+ psubw m1, m7, [dstq+strideq*1]
+ psubw m1, m2
+ psrlw m1, 2
+ packuswb m1, m1
+ movq [maskq], m1
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w32:
+ mova [maskq], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ W_MASK_420_END 0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add maskq, 16*2
+ lea dstq, [dstq+strideq*2]
+.w64:
+ mova [maskq+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ call .main
+ mova [maskq+16*1], m2
+ mova [dstq+strideq*0+16*2], m0
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m0
+ W_MASK_420_END 0, 1
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add maskq, 16*4
+ lea dstq, [dstq+strideq*2]
+.w128:
+ mova [maskq+16*0], m2
+ mova [dstq+strideq*0+16*0], m0
+ call .main
+ mova [dstq+strideq*1+16*1], m2
+ mova [dstq+strideq*0+16*1], m0
+ call .main
+ mova [maskq+16*1], m2
+ mova [dstq+strideq*0+16*2], m0
+ call .main
+ mova [dstq+strideq*1+16*3], m2
+ mova [dstq+strideq*0+16*3], m0
+ call .main
+ mova [maskq+16*2], m2
+ mova [dstq+strideq*0+16*4], m0
+ call .main
+ mova [dstq+strideq*1+16*5], m2
+ mova [dstq+strideq*0+16*5], m0
+ call .main
+ mova [maskq+16*3], m2
+ mova [dstq+strideq*0+16*6], m0
+ call .main
+ mova [dstq+strideq*1+16*7], m2
+ mova [dstq+strideq*0+16*7], m0
+ W_MASK_420_END 0, 1, 2, 3
+ sub hd, 2
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m0, [tmp1q +16*0]
+ mova m3, [tmp1q+tmp2q+16*0]
+ mova m1, [tmp1q +16*1]
+ mova m4, [tmp1q+tmp2q+16*1]
+ add tmp1q, 16*2
+ psubw m3, m0
+ psubw m4, m1
+ pabsw m5, m3
+ psubusw m2, m8, m5
+ psrlw m2, 8 ; 64 - m
+ psllw m5, m2, 10
+ pmulhw m3, m5
+ pabsw m5, m4
+ paddw m0, m3
+ psubusw m3, m8, m5
+ psrlw m3, 8
+ phaddw m2, m3
+ psllw m3, 10
+ pmulhw m4, m3
+ paddw m1, m4
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ packuswb m0, m1
+ ret
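+; .main computes 64 - m = max(6903 - abs(a - b), 0) >> 8 (a, b being the two
+; intermediate predictions), which matches m = min(38 + ((abs(a - b) + 8) >> 8), 64)
+; from the C reference for 8bpc (mask_sh = 8, mask_rnd = 8); phaddw pre-sums
+; horizontal pairs of (64 - m) for the subsampled chroma mask.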
+
+%macro W_MASK_422_BACKUP 1 ; mask_offset
+%if ARCH_X86_64
+ mova m10, m2
+%else
+ mova [maskq+16*%1], m2
+%endif
+%endmacro
+
+%macro W_MASK_422_END 1 ; mask_offset
+%if ARCH_X86_64
+ packuswb m10, m2
+ psubb m1, m7, m10
+ pavgb m1, m9
+%else
+ mova m3, [maskq+16*%1]
+ packuswb m3, m2
+ pxor m2, m2
+ psubb m1, m7, m3
+ pavgb m1, m2
+%endif
+ mova [maskq+16*%1], m1
+%endmacro
+
+cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_422_ssse3_table
+ LEA t0, w_mask_422_ssse3_table
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_2048]
+ movddup m7, [base+wm_422_sign+r6*8] ; 128 - sign
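+ ; 128 - sign: .main's phaddw already leaves (64-m0) + (64-m1) per lane, so
+ ; packuswb + psubb from this constant gives m0 + m1 - sign, and pavgb with 0
+ ; yields the stored 4:2:2 mask (m0 + m1 + 1 - sign) >> 1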
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pw_6903]
+ pxor m9, m9
+ movifnidn hd, hm
+%else
+ add t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
+ %define hd dword hm
+%endif
+ mov maskq, maskmp
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ jmp wq
+.w4_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 8
+ lea dstq, [dstq+strideq*2]
+.w4:
+ packuswb m2, m2
+ psubb m1, m7, m2
+%if ARCH_X86_64
+ pavgb m1, m9
+%else
+ pxor m2, m2
+ pavgb m1, m2
+%endif
+ movq [maskq], m1
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w8:
+ W_MASK_422_BACKUP 0
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ lea dstq, [dstq+strideq*2]
+ W_MASK_422_END 0
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ lea dstq, [dstq+strideq*2]
+.w16:
+ W_MASK_422_BACKUP 0
+ mova [dstq+strideq*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16
+ add dstq, strideq
+.w32:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16*2
+ add dstq, strideq
+.w64:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 1
+ mova [dstq+16*2], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 1
+ mova [dstq+16*3], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ add maskq, 16*4
+ add dstq, strideq
+.w128:
+ W_MASK_422_BACKUP 0
+ mova [dstq+16*0], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 0
+ mova [dstq+16*1], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 1
+ mova [dstq+16*2], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 1
+ mova [dstq+16*3], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 2
+ mova [dstq+16*4], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 2
+ mova [dstq+16*5], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_BACKUP 3
+ mova [dstq+16*6], m0
+ call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
+ W_MASK_422_END 3
+ mova [dstq+16*7], m0
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
+%define base t0-w_mask_444_ssse3_table
+ LEA t0, w_mask_444_ssse3_table
+ tzcnt wd, wm
+ mov maskq, maskmp
+ sub tmp2q, tmp1q
+ movsxd wq, [t0+wq*4]
+ mova m6, [base+pw_6903]
+ mova m7, [base+pw_2048]
+ add wq, t0
+%if ARCH_X86_64
+ mova m8, [base+pb_64]
+ movifnidn hd, hm
+%else
+ %define m8 [base+pb_64]
+ %define hd dword hm
+%endif
+ call .main
+ jmp wq
+.w4_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w4:
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ punpckhqdq m0, m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq+strideq*0], m0
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ call .main
+ lea dstq, [dstq+strideq*2]
+.w16:
+ mova [dstq+strideq*0], m0
+ call .main
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ call .main
+ add dstq, strideq
+.w32:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ call .main
+ add dstq, strideq
+.w64:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ call .main
+ mova [dstq+16*2], m0
+ call .main
+ mova [dstq+16*3], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ call .main
+ add dstq, strideq
+.w128:
+ mova [dstq+16*0], m0
+ call .main
+ mova [dstq+16*1], m0
+ call .main
+ mova [dstq+16*2], m0
+ call .main
+ mova [dstq+16*3], m0
+ call .main
+ mova [dstq+16*4], m0
+ call .main
+ mova [dstq+16*5], m0
+ call .main
+ mova [dstq+16*6], m0
+ call .main
+ mova [dstq+16*7], m0
+ dec hd
+ jg .w128_loop
+ RET
+ALIGN function_align
+.main:
+ mova m0, [tmp1q +16*0]
+ mova m3, [tmp1q+tmp2q+16*0]
+ mova m1, [tmp1q +16*1]
+ mova m4, [tmp1q+tmp2q+16*1]
+ add tmp1q, 16*2
+ psubw m3, m0
+ psubw m4, m1
+ pabsw m5, m3
+ psubusw m2, m6, m5
+ psrlw m2, 8 ; 64 - m
+ psllw m5, m2, 10
+ pmulhw m3, m5
+ pabsw m5, m4
+ paddw m0, m3
+ psubusw m3, m6, m5
+ psrlw m3, 8
+ packuswb m2, m3
+ psllw m3, 10
+ pmulhw m4, m3
+ psubb m3, m8, m2
+ paddw m1, m4
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ mova [maskq], m3
+ add maskq, 16
+ packuswb m0, m1
+ ret
+
+%macro BLEND_64M 4; a, b, mask1, mask2
+ punpcklbw m0, %1, %2; {b;a}[7..0]
+ punpckhbw %1, %2 ; {b;a}[15..8]
+ pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16
+ pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ pmulhrsw %1, m5 ; {((b*m[1] + (64-m[1])*a) + 1) / 32}[15..8] u16
+ packuswb m0, %1 ; {blendpx}[15..0] u8
+%endmacro
+
+%macro BLEND 2; a, b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpckhbw m3, m0 ; {m;(64-m)}[15..8]
+ BLEND_64M %1, %2, m2, m3
+%endmacro
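+; Both macros implement dst = (a*(64 - m) + b*m + 32) >> 6: pmaddubsw forms
+; a*(64-m) + b*m per pixel (fits in int16 since a, b < 256 and the two weights
+; sum to 64), and pmulhrsw with pw_512 is (x*512 + 16384) >> 15 = (x + 32) >> 6.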
+
+cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_ssse3_table
+ LEA r6, blend_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+ movsxd wq, dword [r6+wq*4]
+ mova m4, [base+pb_64]
+ mova m5, [base+pw_512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movq m0, [maskq]; m
+ movd m1, [dstq+dsq*0] ; a
+ movd m6, [dstq+dsq*1]
+ punpckldq m1, m6
+ movq m6, [tmpq] ; b
+ psubb m3, m4, m0 ; m3 = (64 - m)
+ punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0]
+ punpcklbw m1, m6 ; {b;a}[7..0]
+ pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16
+ packuswb m1, m0 ; {blendpx}[15..0] u8
+ movd [dstq+dsq*0], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add maskq, 8
+ add tmpq, 8
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w4
+ RET
+.w8:
+ mova m0, [maskq]; m
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add maskq, 16
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2] ; dst_stride * 2
+ sub hd, 2
+ jg .w8
+ RET
+.w16:
+ mova m0, [maskq]; m
+ mova m1, [dstq] ; a
+ mova m6, [tmpq] ; b
+ BLEND m1, m6
+ mova [dstq], m0
+ add maskq, 16
+ add tmpq, 16
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w16
+ RET
+.w32:
+ %assign i 0
+ %rep 2
+ mova m0, [maskq+16*i]; m
+ mova m1, [dstq+16*i] ; a
+ mova m6, [tmpq+16*i] ; b
+ BLEND m1, m6
+ mova [dstq+i*16], m0
+ %assign i i+1
+ %endrep
+ add maskq, 32
+ add tmpq, 32
+ add dstq, dsq ; dst_stride
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_ssse3_table
+ LEA r5, blend_v_ssse3_table
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ mova m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_ssse3_table
+ jmp wq
+.w2:
+ movd m3, [maskq+4]
+ punpckldq m3, m3
+ ; the 2-entry mask is applied to 4 pixels over 2 lines
+.w2_loop:
+ movd m1, [dstq+dsq*0] ; a {..;a;a}
+ pinsrw m1, [dstq+dsq*1], 1
+ movd m2, [tmpq] ; b
+ punpcklbw m0, m1, m2; {b;a}[7..0]
+ pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m0, m1 ; {blendpx}[8..0] u8
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ add tmpq, 2*2
+ lea dstq, [dstq + dsq * 2]
+ sub hd, 2
+ jg .w2_loop
+ RET
+.w4:
+ movddup m3, [maskq+8]
+ ; the 4-entry mask is applied to 8 pixels over 2 lines
+.w4_loop:
+ movd m1, [dstq+dsq*0] ; a
+ movd m2, [dstq+dsq*1] ;
+ punpckldq m1, m2
+ movq m2, [tmpq] ; b
+ punpcklbw m1, m2 ; {b;a}[7..0]
+ pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16
+ pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
+ packuswb m1, m1 ; {blendpx}[8..0] u8
+ movd [dstq], m1
+ psrlq m1, 32
+ movd [dstq+dsq*1], m1
+ add tmpq, 2*4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+.w8:
+ mova m3, [maskq+16]
+ ; the 8-entry mask is applied to 16 pixels over 2 lines
+.w8_loop:
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]; b
+ BLEND_64M m1, m2, m3, m3
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ add tmpq, 16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16:
+ ; 16 mask blend is provided for 32 pixels
+ mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
+ mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m4
+ mova [dstq], m0
+ add tmpq, 16
+ add dstq, dsq
+ dec hd
+ jg .w16_loop
+ RET
+.w32:
+%if WIN64
+ mova [rsp+8], xmm6
+%endif
+ mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
+ mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
+ mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
+ ; 16 mask blend is provided for 64 pixels
+.w32_loop:
+ mova m1, [dstq+16*0] ; a
+ mova m2, [tmpq+16*0] ; b
+ BLEND_64M m1, m2, m3, m4
+ movq m1, [dstq+16*1] ; a
+ punpcklbw m1, [tmpq+16*1] ; b
+ pmaddubsw m1, m6
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ mova [dstq+16*0], m0
+ movq [dstq+16*1], m1
+ add tmpq, 32
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+%if WIN64
+ mova xmm6, [rsp+8]
+%endif
+ RET
+
+cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base t0-blend_h_ssse3_table
+%if ARCH_X86_32
+ ; We need to keep the PIC pointer for w4, reload wd from stack instead
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 5
+ mov r6d, wd
+%endif
+ LEA t0, blend_h_ssse3_table
+ tzcnt wd, wm
+ mov hd, hm
+ movsxd wq, dword [t0+wq*4]
+ mova m5, [base+pw_512]
+ add wq, t0
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd m0, [dstq+dsq*0]
+ pinsrw m0, [dstq+dsq*1], 1
+ movd m2, [maskq+hq*2]
+ movd m1, [tmpq]
+ punpcklwd m2, m2
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd r3d, m0
+ mov [dstq+dsq*0], r3w
+ shr r3d, 16
+ mov [dstq+dsq*1], r3w
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 2*2
+ add hq, 2
+ jl .w2
+ RET
+.w4:
+%if ARCH_X86_32
+ mova m3, [base+blend_shuf]
+%else
+ mova m3, [blend_shuf]
+%endif
+.w4_loop:
+ movd m0, [dstq+dsq*0]
+ movd m2, [dstq+dsq*1]
+ punpckldq m0, m2 ; a
+ movq m1, [tmpq] ; b
+ movq m2, [maskq+hq*2] ; m
+ pshufb m2, m3
+ punpcklbw m0, m1
+ pmaddubsw m0, m2
+ pmulhrsw m0, m5
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 4*2
+ add hq, 2
+ jl .w4_loop
+ RET
+.w8:
+ movd m4, [maskq+hq*2]
+ punpcklwd m4, m4
+ pshufd m3, m4, q0000
+ pshufd m4, m4, q1111
+ movq m1, [dstq+dsq*0] ; a
+ movhps m1, [dstq+dsq*1]
+ mova m2, [tmpq]
+ BLEND_64M m1, m2, m3, m4
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ add tmpq, 8*2
+ add hq, 2
+ jl .w8
+ RET
+; w16/w32/w64/w128
+.w16:
+%if ARCH_X86_32
+ mov r6d, wm
+%endif
+ sub dsq, r6
+.w16_loop0:
+ movd m3, [maskq+hq*2]
+ pshuflw m3, m3, q0000
+ punpcklqdq m3, m3
+ mov wd, r6d
+.w16_loop:
+ mova m1, [dstq] ; a
+ mova m2, [tmpq] ; b
+ BLEND_64M m1, m2, m3, m3
+ mova [dstq], m0
+ add dstq, 16
+ add tmpq, 16
+ sub wd, 16
+ jg .w16_loop
+ add dstq, dsq
+ inc hq
+ jl .w16_loop0
+ RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh: total filled size
+; iw, ih: size of the copied block -> fill bottom/right
+; x, y:   offset within bw/bh -> fill top/left
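+;
+; a hypothetical call, for illustration: x = -3, bw = 16, iw = 8 gives
+; left_ext = iclip(-x, 0, bw - 1) = 3, right_ext = iclip(x + bw - iw, 0, bw - 1) = 5
+; and center_w = 16 - 3 - 5 = 8, with the outermost source pixels replicated
+; into the left/right extensions (top/bottom work the same way vertically)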
+cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
+ y, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+ pxor m1, m1
+
+%if ARCH_X86_64
+ %define reg_zero r12q
+ %define reg_tmp r10
+ %define reg_src srcq
+ %define reg_bottomext bottomextq
+ %define reg_rightext rightextq
+ %define reg_blkm r9m
+%else
+ %define reg_zero r6
+ %define reg_tmp r0
+ %define reg_src r1
+ %define reg_bottomext r0
+ %define reg_rightext r1
+ %define reg_blkm r2m
+%endif
+ ;
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor reg_zero, reg_zero
+ lea reg_tmp, [ihq-1]
+ cmp yq, ihq
+ cmovs reg_tmp, yq
+ test yq, yq
+ cmovs reg_tmp, reg_zero
+%if ARCH_X86_64
+ imul reg_tmp, sstrideq
+ add srcq, reg_tmp
+%else
+ imul reg_tmp, sstridem
+ mov reg_src, srcm
+ add reg_src, reg_tmp
+%endif
+ ;
+ ; ref += iclip(x, 0, iw - 1)
+ lea reg_tmp, [iwq-1]
+ cmp xq, iwq
+ cmovs reg_tmp, xq
+ test xq, xq
+ cmovs reg_tmp, reg_zero
+ add reg_src, reg_tmp
+%if ARCH_X86_32
+ mov srcm, reg_src
+%endif
+ ;
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+ mov r1, r1m ; restore bh
+%endif
+ lea reg_bottomext, [yq+bhq]
+ sub reg_bottomext, ihq
+ lea r3, [bhq-1]
+ cmovs reg_bottomext, reg_zero
+ ;
+
+ DEFINE_ARGS bw, bh, iw, ih, x, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, reg_zero
+ cmp reg_bottomext, bhq
+ cmovns reg_bottomext, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+ %if ARCH_X86_32
+ mov r4m, reg_bottomext
+ ;
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ mov r0, r0m ; restore bw
+ %endif
+ lea reg_rightext, [xq+bwq]
+ sub reg_rightext, iwq
+ lea r2, [bwq-1]
+ cmovs reg_rightext, reg_zero
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, reg_zero
+ cmp reg_rightext, bwq
+ cmovns reg_rightext, r2
+ %if ARCH_X86_32
+ mov r3m, r1
+ %endif
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+ topext, dst, dstride, src, sstride, \
+ bottomext, rightext, blk
+
+ ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+%else
+ mov r1, centerhm ; restore r1
+ sub centerhq, topextq
+ sub centerhq, r4m
+ mov r1m, centerhq
+%endif
+ ;
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+%if ARCH_X86_64
+ imul r2, dstrideq
+%else
+ mov r6, r6m ; restore dstq
+ imul r2, dstridem
+%endif
+ add dstq, r2
+ mov reg_blkm, dstq ; save pointer for ext
+ ;
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+%if ARCH_X86_64
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+%else
+ sub centerwq, r3m
+ sub centerwq, leftextq
+%endif
+
+; vloop Macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+ %if ARCH_X86_64
+ %define reg_tmp r12
+ %else
+ %define reg_tmp r0
+ %endif
+.v_loop_%3:
+ %if ARCH_X86_32
+ mov r0, r0m
+ mov r1, r1m
+ %endif
+%if %1
+ ; left extension
+ %if ARCH_X86_64
+ movd m0, [srcq]
+ %else
+ mov r3, srcm
+ movd m0, [r3]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, mmsize
+ cmp r3, leftextq
+ jl .left_loop_%3
+ ; body
+ lea reg_tmp, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ %if ARCH_X86_64
+ movu m0, [srcq+r3]
+ %else
+ mov r1, srcm
+ movu m0, [r1+r3]
+ %endif
+%if %1
+ movu [reg_tmp+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, mmsize
+ cmp r3, centerwq
+ jl .body_loop_%3
+%if %2
+ ; right extension
+%if %1
+ add reg_tmp, centerwq
+%else
+ lea reg_tmp, [dstq+centerwq]
+%endif
+ %if ARCH_X86_64
+ movd m0, [srcq+centerwq-1]
+ %else
+ mov r3, srcm
+ movd m0, [r3+centerwq-1]
+ %endif
+ pshufb m0, m1
+ xor r3, r3
+.right_loop_%3:
+ movu [reg_tmp+r3], m0
+ add r3, mmsize
+ %if ARCH_X86_64
+ cmp r3, rightextq
+ %else
+ cmp r3, r3m
+ %endif
+ jl .right_loop_%3
+%endif
+ %if ARCH_X86_64
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+ %else
+ add dstq, dstridem
+ mov r0, sstridem
+ add srcm, r0
+ sub dword centerhm, 1
+ jg .v_loop_%3
+ mov r0, r0m ; restore r0
+ %endif
+%endmacro ; vloop MACRO
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ jnz .need_right_ext
+ %else
+ cmp leftextq, r3m ; leftextq == 0
+ jne .need_right_ext
+ %endif
+ v_loop 0, 0, 0
+ jmp .body_done
+
+ ;left right extensions
+.need_left_ext:
+ %if ARCH_X86_64
+ test rightextq, rightextq
+ %else
+ mov r3, r3m
+ test r3, r3
+ %endif
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+; r0 ; bw
+; r1 ;; x loop
+; r4 ;; y loop
+; r5 ; topextq
+; r6 ; dstq
+; r7 ; dstrideq
+; r8 ; srcq
+%if ARCH_X86_64
+ %define reg_dstride dstrideq
+%else
+ %define reg_dstride r2
+%endif
+ ;
+ ; bottom edge extension
+ %if ARCH_X86_64
+ test bottomextq, bottomextq
+ jz .top
+ %else
+ xor r1, r1
+ cmp r1, r4m
+ je .top
+ %endif
+ ;
+ %if ARCH_X86_64
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+ %else
+ mov r3, dstq
+ mov reg_dstride, dstridem
+ sub r3, reg_dstride
+ mov srcm, r3
+ %endif
+ ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+ %else
+ mov r3, srcm
+ mova m0, [r3+r1]
+ lea r3, [dstq+r1]
+ mov r4, r4m
+ %endif
+ ;
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .bottom_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+%if ARCH_X86_64
+ mov srcq, reg_blkm
+%else
+ mov r3, reg_blkm
+ mov reg_dstride, dstridem
+%endif
+ mov dstq, dstm
+ xor r1, r1
+ ;
+.top_x_loop:
+%if ARCH_X86_64
+ mova m0, [srcq+r1]
+%else
+ mov r3, reg_blkm
+ mova m0, [r3+r1]
+%endif
+ lea r3, [dstq+r1]
+ mov r4, topextq
+ ;
+.top_y_loop:
+ mova [r3], m0
+ add r3, reg_dstride
+ dec r4
+ jg .top_y_loop
+ add r1, mmsize
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
+
+cextern resize_filter
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%elif STACK_ALIGNMENT >= 16
+cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%else
+cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
+ LEA r7, $$
+%define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+%define hd dword r5m
+%if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+%define base r6-$$
+%else
+ LEA r4, $$
+%define base r4-$$
+%endif
+%endif
+
+%if ARCH_X86_64
+ mova m10, [base+pw_m256]
+ mova m9, [base+pd_63]
+ mova m8, [base+pb_8x0_8x8]
+%else
+%define m10 [base+pw_m256]
+%define m9 [base+pd_63]
+%define m8 [base+pb_8x0_8x8]
+%endif
+ pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 13, 0
+ SCRATCH 6, 12, 1
+ SCRATCH 5, 11, 2
+
+ ; m10 = pmulhrsw constant for x=(x+64)>>7
+ ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8
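+ ; per output pixel (a rough restatement of the loop below): mx advances by
+ ; dx; mx >> 14, clipped to [0, src_w - 8], selects the 8-pixel source window;
+ ; (mx >> 8) & 63 selects one of the 64 8-tap resize filters; the
+ ; pmaddubsw/phaddw sum is rounded with (x + 64) >> 7 and packed to 8 bits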
+
+.loop_y:
+ xor xd, xd
+ mova m0, m12 ; per-line working version of mx
+
+.loop_x:
+ pxor m1, m1
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m11, m1
+ pand m1, m3
+ pandn m3, m11
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m9 ; filter offset (masked)
+
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, m1
+ pshuflw m1, m1, q3232
+ movd r9d, m1
+ punpckhqdq m1, m1
+ movd r10d, m1
+ psrlq m1, 32
+ movd r11d, m1
+ movq m4, [srcq+r8]
+ movq m5, [srcq+r10]
+ movhps m4, [srcq+r9]
+ movhps m5, [srcq+r11]
+%else
+ movd r3d, m1
+ pshufd m1, m1, q3312
+ movd r1d, m1
+ pshuflw m1, m1, q3232
+ movq m4, [srcq+r3]
+ movq m5, [srcq+r1]
+ movd r3d, m1
+ punpckhqdq m1, m1
+ movd r1d, m1
+ movhps m4, [srcq+r3]
+ movhps m5, [srcq+r1]
+%endif
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ pxor m6, m6
+ pcmpeqb m6, m3
+%if ARCH_X86_64
+ pmovmskb r8d, m6
+ cmp r8d, 0xffff
+%else
+ pmovmskb r3d, m6
+ cmp r3d, 0xffff
+%endif
+ je .filter
+
+%if ARCH_X86_64
+ movd r8d, m3
+ pshuflw m3, m3, q3232
+ movd r9d, m3
+ punpckhqdq m3, m3
+ movd r10d, m3
+ psrlq m3, 32
+ movd r11d, m3
+ movsxd r8, r8d
+ movsxd r9, r9d
+ movsxd r10, r10d
+ movsxd r11, r11d
+ movq m6, [base+resize_shuf+4+r8]
+ movq m7, [base+resize_shuf+4+r10]
+ movhps m6, [base+resize_shuf+4+r9]
+ movhps m7, [base+resize_shuf+4+r11]
+%else
+ movd r3d, m3
+ pshufd m3, m3, q3312
+ movd r1d, m3
+ pshuflw m3, m3, q3232
+ movq m6, [base+resize_shuf+4+r3]
+ movq m7, [base+resize_shuf+4+r1]
+ movd r3d, m3
+ punpckhqdq m3, m3
+ movd r1d, m3
+ movhps m6, [base+resize_shuf+4+r3]
+ movhps m7, [base+resize_shuf+4+r1]
+%endif
+
+ paddb m6, m8
+ paddb m7, m8
+ pshufb m4, m6
+ pshufb m5, m7
+
+.filter:
+%if ARCH_X86_64
+ movd r8d, m2
+ pshuflw m2, m2, q3232
+ movd r9d, m2
+ punpckhqdq m2, m2
+ movd r10d, m2
+ psrlq m2, 32
+ movd r11d, m2
+ movq m6, [base+resize_filter+r8*8]
+ movq m7, [base+resize_filter+r10*8]
+ movhps m6, [base+resize_filter+r9*8]
+ movhps m7, [base+resize_filter+r11*8]
+%else
+ movd r3d, m2
+ pshufd m2, m2, q3312
+ movd r1d, m2
+ pshuflw m2, m2, q3232
+ movq m6, [base+resize_filter+r3*8]
+ movq m7, [base+resize_filter+r1*8]
+ movd r3d, m2
+ punpckhqdq m2, m2
+ movd r1d, m2
+ movhps m6, [base+resize_filter+r3*8]
+ movhps m7, [base+resize_filter+r1*8]
+%endif
+
+ pmaddubsw m4, m6
+ pmaddubsw m5, m7
+ phaddw m4, m5
+ phaddsw m4, m4
+ pmulhrsw m4, m10 ; x=(x+64)>>7
+ packuswb m4, m4
+ movd [dstq+xq], m4
+
+ paddd m0, m13
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+
+ add dstq, dst_stridemp
+ add srcq, src_stridemp
+ dec hd
+ jg .loop_y
+ RET
+
+INIT_XMM ssse3
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm
new file mode 100644
index 0000000000..9f05c921a6
--- /dev/null
+++ b/third_party/dav1d/src/x86/msac.asm
@@ -0,0 +1,667 @@
+; Copyright © 2019, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64 ; avoids cacheline splits
+
+min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+pw_0xff00: times 8 dw 0xff00
+pw_32: times 8 dw 32
+
+%if ARCH_X86_64
+%define resp resq
+%define movp movq
+%define c_shuf q3333
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1
+%endmacro
+%else
+%define resp resd
+%define movp movd
+%define c_shuf q1111
+%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
+ mov t0, r0m
+ mov t1, r1m
+%if %1 == 0
+ mov t2, r2m
+%endif
+%if STACK_ALIGNMENT >= 16
+ sub esp, 40-%1*4
+%else
+ mov eax, esp
+ and esp, ~15
+ sub esp, 40-%1*4
+ mov [esp], eax
+%endif
+%endmacro
+%endif
+
+struc msac
+ .buf: resp 1
+ .end: resp 1
+ .dif: resp 1
+ .rng: resd 1
+ .cnt: resd 1
+ .update_cdf: resd 1
+endstruc
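+; (the field order above is assumed to mirror MsacContext in src/msac.h:
+;  buf_pos, buf_end, dif, rng, cnt, allow_update_cdf)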
+
+%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y)
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
+%define buf rsp+stack_offset+8 ; shadow space
+%elif UNIX64
+DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
+%define buf rsp-40 ; red zone
+%else
+DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
+%define buf esp+8
+%endif
+
+INIT_XMM sse2
+cglobal msac_decode_symbol_adapt4, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ movq m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2 ; -(n_symbols + 1)
+ pshuflw m2, m2, q0000
+ movd [buf+12], m2
+ pand m2, [rax]
+ mova m0, m1
+ psrlw m1, 6
+ psllw m1, 7
+ pmulhuw m1, m2
+ movq m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
+ paddw m1, m2
+ mova [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2 ; c >= v
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz .renorm ; !allow_update_cdf
+
+; update_cdf:
+ movzx t3d, word [t1+t4*2] ; count
+ pcmpeqw m2, m2
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 3
+ sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
+ cmp t2d, 32
+ adc t2d, 0 ; count + (count < 32)
+ movd m3, t3d
+ pavgw m2, m1 ; i >= val ? -1 : 32768
+ psubw m2, m0 ; for (i = 0; i < val; i++)
+ psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
+ psraw m2, m3 ; for (; i < n_symbols; i++)
+ paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
+ movq [t1], m0
+ mov [t1+t4*2], t2w
+
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax+16] ; v
+ movzx t2d, word [buf+rax+14] ; u
+ shr eax, 1
+.renorm2:
+%if ARCH_X86_64 == 0
+%if STACK_ALIGNMENT >= 16
+ add esp, 40
+%else
+ mov esp, [esp]
+%endif
+%endif
+ not t4
+ sub t2d, t1d ; rng
+ shl t1, gprsize*8-16
+ add t4, t1 ; ~dif
+.renorm3:
+ mov t1d, [t0+msac.cnt]
+ movifnidn t7, t0
+.renorm4:
+ bsr ecx, t2d
+ xor ecx, 15 ; d
+.renorm5:
+ shl t2d, cl
+ shl t4, cl
+ mov [t7+msac.rng], t2d
+ not t4
+ sub t1d, ecx
+ jae .end ; no refill required
+
+; refill:
+ mov t2, [t7+msac.buf]
+ mov rcx, [t7+msac.end]
+%if ARCH_X86_64 == 0
+ push t5
+%endif
+ lea t5, [t2+gprsize]
+ cmp t5, rcx
+ ja .refill_eob
+ mov t2, [t2]
+ lea ecx, [t1+23]
+ add t1d, 16
+ shr ecx, 3 ; shift_bytes
+ bswap t2
+ sub t5, rcx
+ shl ecx, 3 ; shift_bits
+ shr t2, cl
+ sub ecx, t1d ; shift_bits - 16 - cnt
+ mov t1d, gprsize*8-16
+ shl t2, cl
+ mov [t7+msac.buf], t5
+ sub t1d, ecx ; cnt + gprsize*8 - shift_bits
+ xor t4, t2
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
+.end:
+ mov [t7+msac.cnt], t1d
+ mov [t7+msac.dif], t4
+ RET
+.refill_eob: ; avoid overreading the input buffer
+ mov t5, rcx
+ mov ecx, gprsize*8-24
+ sub ecx, t1d ; c
+.refill_eob_loop:
+ cmp t2, t5
+ jae .refill_eob_end ; eob reached
+ movzx t1d, byte [t2]
+ inc t2
+ shl t1, cl
+ xor t4, t1
+ sub ecx, 8
+ jge .refill_eob_loop
+.refill_eob_end:
+ mov t1d, gprsize*8-24
+%if ARCH_X86_64 == 0
+ pop t5
+%endif
+ sub t1d, ecx
+ mov [t7+msac.buf], t2
+ mov [t7+msac.dif], t4
+ mov [t7+msac.cnt], t1d
+ RET
+
+cglobal msac_decode_symbol_adapt8, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m2, [t0+msac.rng]
+ mova m1, [t1]
+ movp m3, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+ pshuflw m2, m2, q0000
+ movd [buf+12], m2
+ punpcklqdq m2, m2
+ mova m0, m1
+ psrlw m1, 6
+ pand m2, [rax]
+ psllw m1, 7
+ pmulhuw m1, m2
+ movu m2, [rax+t2*2]
+ pshuflw m3, m3, c_shuf
+ paddw m1, m2
+ punpcklqdq m3, m3
+ mova [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m2, m2
+ mov t2d, t3d
+ shr t3d, 4
+ cmp t4d, 3 ; may be called with n_symbols <= 2
+ sbb t3d, -5
+ cmp t2d, 32
+ adc t2d, 0
+ movd m3, t3d
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, m3
+ paddw m0, m2
+ mova [t1], m0
+ mov [t1+t4*2], t2w
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm
+
+cglobal msac_decode_symbol_adapt16, 0, 6, 6
+ DECODE_SYMBOL_ADAPT_INIT
+ LEA rax, pw_0xff00
+ movd m4, [t0+msac.rng]
+ mova m2, [t1]
+ mova m3, [t1+16]
+ movp m5, [t0+msac.dif]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+%if WIN64
+ sub rsp, 48 ; need 36 bytes, shadow space is only 32
+%endif
+ pshuflw m4, m4, q0000
+ movd [buf-4], m4
+ punpcklqdq m4, m4
+ mova m0, m2
+ psrlw m2, 6
+ mova m1, m3
+ psrlw m3, 6
+ pand m4, [rax]
+ psllw m2, 7
+ psllw m3, 7
+ pmulhuw m2, m4
+ pmulhuw m3, m4
+ movu m4, [rax+t2*2]
+ pshuflw m5, m5, c_shuf
+ paddw m2, m4
+ psubw m4, [rax-pw_0xff00+pw_32]
+ punpcklqdq m5, m5
+ paddw m3, m4
+ mova [buf], m2
+ psubusw m2, m5
+ mova [buf+16], m3
+ psubusw m3, m5
+ pxor m4, m4
+ pcmpeqw m2, m4
+ pcmpeqw m3, m4
+ packsswb m5, m2, m3
+ pmovmskb eax, m5
+ test t3d, t3d
+ jz .renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m4, m4
+ mova m5, m4
+ lea t2d, [t3+80] ; only supports n_symbols > 2
+ shr t2d, 4
+ cmp t3d, 32
+ adc t3d, 0
+ pavgw m4, m2
+ pavgw m5, m3
+ psubw m4, m0
+ psubw m0, m2
+ movd m2, t2d
+ psubw m5, m1
+ psubw m1, m3
+ psraw m4, m2
+ psraw m5, m2
+ paddw m0, m4
+ paddw m1, m5
+ mova [t1], m0
+ mova [t1+16], m1
+ mov [t1+t4*2], t3w
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax*2]
+ movzx t2d, word [buf+rax*2-2]
+%if WIN64
+ add rsp, 48
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2
+
+cglobal msac_decode_bool_adapt, 0, 6, 0
+ movifnidn t1, r1mp
+ movifnidn t0, r0mp
+ movzx eax, word [t1]
+ movzx t3d, byte [t0+msac.rng+1]
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+%if ARCH_X86_64
+ mov t5d, eax
+%endif
+ and eax, ~63
+ imul eax, t3d
+%if UNIX64
+ mov t6, t4
+%endif
+ shr eax, 7
+ add eax, 4 ; v
+ mov t3d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t3d ; r - v
+ sub t4, rax ; dif - vw
+ setb al
+ cmovb t2d, t3d
+ mov t3d, [t0+msac.update_cdf]
+%if UNIX64
+ cmovb t4, t6
+%else
+ cmovb t4, [t0+msac.dif]
+%endif
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ not t4
+ test t3d, t3d
+ jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%if UNIX64 == 0
+ push t6
+%endif
+ movzx t6d, word [t1+2]
+%if ARCH_X86_64 == 0
+ push t5
+ movzx t5d, word [t1]
+%endif
+ movifnidn t7, t0
+ lea ecx, [t6+64]
+ cmp t6d, 32
+ adc t6d, 0
+ mov [t1+2], t6w
+ imul t6d, eax, -32769
+ shr ecx, 4 ; rate
+ add t6d, t5d ; if (bit)
+ sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
+ sar t6d, cl ; else
+ sub t5d, t6d ; cdf[0] -= cdf[0] >> rate;
+ mov [t1], t5w
+%if WIN64
+ mov t1d, [t7+msac.cnt]
+ pop t6
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
+%else
+%if ARCH_X86_64 == 0
+ pop t5
+ pop t6
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+%endif
+
+cglobal msac_decode_bool_equi, 0, 6, 0
+ movifnidn t0, r0mp
+ mov t1d, [t0+msac.rng]
+ mov t4, [t0+msac.dif]
+ mov t2d, t1d
+ mov t1b, 8
+ mov t3, t4
+ mov eax, t1d
+ shr t1d, 1 ; v
+ shl rax, gprsize*8-17 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d
+ mov t1d, [t0+msac.cnt]
+ cmovb t4, t3
+ movifnidn t7, t0
+ mov ecx, 0xbfff
+ setb al ; the upper 32 bits contain garbage but that's OK
+ sub ecx, t2d
+ not t4
+ ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
+ ; i.e. (0 <= d <= 2) and v < (3 << 14)
+ shr ecx, 14 ; d
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
+
+cglobal msac_decode_bool, 0, 6, 0
+ movifnidn t0, r0mp
+ movifnidn t1d, r1m
+ movzx eax, byte [t0+msac.rng+1] ; r >> 8
+ mov t4, [t0+msac.dif]
+ mov t2d, [t0+msac.rng]
+ and t1d, ~63
+ imul eax, t1d
+ mov t3, t4
+ shr eax, 7
+ add eax, 4 ; v
+ mov t1d, eax
+ shl rax, gprsize*8-16 ; vw
+ sub t2d, t1d ; r - v
+ sub t4, rax ; dif - vw
+ cmovb t2d, t1d
+ cmovb t4, t3
+ setb al
+ not t4
+%if ARCH_X86_64 == 0
+ movzx eax, al
+%endif
+ jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
+
+%macro HI_TOK 1 ; update_cdf
+%if ARCH_X86_64 == 0
+ mov eax, -24
+%endif
+%%loop:
+%if %1
+ movzx t2d, word [t1+3*2]
+%endif
+ mova m1, m0
+ pshuflw m2, m2, q0000
+ psrlw m1, 6
+ movd [buf+12], m2
+ pand m2, m4
+ psllw m1, 7
+ pmulhuw m1, m2
+%if ARCH_X86_64 == 0
+ add eax, 5
+ mov [buf+8], eax
+%endif
+ pshuflw m3, m3, c_shuf
+ paddw m1, m5
+ movq [buf+16], m1
+ psubusw m1, m3
+ pxor m2, m2
+ pcmpeqw m1, m2
+ pmovmskb eax, m1
+%if %1
+ lea ecx, [t2+80]
+ pcmpeqw m2, m2
+ shr ecx, 4
+ cmp t2d, 32
+ adc t2d, 0
+ movd m3, ecx
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, m3
+ paddw m0, m2
+ movq [t1], m0
+ mov [t1+3*2], t2w
+%endif
+ tzcnt eax, eax
+ movzx ecx, word [buf+rax+16]
+ movzx t2d, word [buf+rax+14]
+ not t4
+%if ARCH_X86_64
+ add t6d, 5
+%endif
+ sub eax, 5 ; setup for merging the tok_br and tok branches
+ sub t2d, ecx
+ shl rcx, gprsize*8-16
+ add t4, rcx
+ bsr ecx, t2d
+ xor ecx, 15
+ shl t2d, cl
+ shl t4, cl
+ movd m2, t2d
+ mov [t7+msac.rng], t2d
+ not t4
+ sub t5d, ecx
+ jae %%end
+ mov t2, [t7+msac.buf]
+ mov rcx, [t7+msac.end]
+%if UNIX64 == 0
+ push t8
+%endif
+ lea t8, [t2+gprsize]
+ cmp t8, rcx
+ ja %%refill_eob
+ mov t2, [t2]
+ lea ecx, [t5+23]
+ add t5d, 16
+ shr ecx, 3
+ bswap t2
+ sub t8, rcx
+ shl ecx, 3
+ shr t2, cl
+ sub ecx, t5d
+ mov t5d, gprsize*8-16
+ shl t2, cl
+ mov [t7+msac.buf], t8
+%if UNIX64 == 0
+ pop t8
+%endif
+ sub t5d, ecx
+ xor t4, t2
+%%end:
+ movp m3, t4
+%if ARCH_X86_64
+ add t6d, eax ; CF = tok_br < 3 || tok == 15
+ jnc %%loop
+ lea eax, [t6+30]
+%else
+ add eax, [buf+8]
+ jnc %%loop
+ add eax, 30
+%if STACK_ALIGNMENT >= 16
+ add esp, 36
+%else
+ mov esp, [esp]
+%endif
+%endif
+ mov [t7+msac.dif], t4
+ shr eax, 1
+ mov [t7+msac.cnt], t5d
+ RET
+%%refill_eob:
+ mov t8, rcx
+ mov ecx, gprsize*8-24
+ sub ecx, t5d
+%%refill_eob_loop:
+ cmp t2, t8
+ jae %%refill_eob_end
+ movzx t5d, byte [t2]
+ inc t2
+ shl t5, cl
+ xor t4, t5
+ sub ecx, 8
+ jge %%refill_eob_loop
+%%refill_eob_end:
+%if UNIX64 == 0
+ pop t8
+%endif
+ mov t5d, gprsize*8-24
+ mov [t7+msac.buf], t2
+ sub t5d, ecx
+ jmp %%end
+%endmacro
+
+cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
+ DECODE_SYMBOL_ADAPT_INIT 1
+%if ARCH_X86_64 == 0 && PIC
+ LEA t2, min_prob+12*2
+ %define base t2-(min_prob+12*2)
+%else
+ %define base 0
+%endif
+ movq m0, [t1]
+ movd m2, [t0+msac.rng]
+ mov eax, [t0+msac.update_cdf]
+ movq m4, [base+pw_0xff00]
+ movp m3, [t0+msac.dif]
+ movq m5, [base+min_prob+12*2]
+ mov t4, [t0+msac.dif]
+ mov t5d, [t0+msac.cnt]
+%if ARCH_X86_64
+ mov t6d, -24
+%endif
+ movifnidn t7, t0
+ test eax, eax
+ jz .no_update_cdf
+ HI_TOK 1
+.no_update_cdf:
+ HI_TOK 0
+
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal msac_decode_symbol_adapt16, 3, 6, 6
+ lea rax, [pw_0xff00]
+ vpbroadcastw m2, [t0+msac.rng]
+ mova m0, [t1]
+ vpbroadcastw m3, [t0+msac.dif+6]
+ vbroadcasti128 m4, [rax]
+ mov t3d, [t0+msac.update_cdf]
+ mov t4d, t2d
+ not t2
+ mov r5, rsp
+%if WIN64
+ and rsp, ~31
+ sub rsp, 40
+%else
+ and r5, ~31
+ %define buf r5-32
+%endif
+ psrlw m1, m0, 6
+ movd [buf-4], xm2
+ pand m2, m4
+ psllw m1, 7
+ pmulhuw m1, m2
+ paddw m1, [rax+t2*2]
+ mova [buf], m1
+ pmaxuw m1, m3
+ pcmpeqw m1, m3
+ pmovmskb eax, m1
+ test t3d, t3d
+ jz .renorm
+ movzx t3d, word [t1+t4*2]
+ pcmpeqw m2, m2
+ lea t2d, [t3+80]
+ shr t2d, 4
+ cmp t3d, 32
+ adc t3d, 0
+ movd xm3, t2d
+ pavgw m2, m1
+ psubw m2, m0
+ psubw m0, m1
+ psraw m2, xm3
+ paddw m0, m2
+ mova [t1], m0
+ mov [t1+t4*2], t3w
+.renorm:
+ tzcnt eax, eax
+ mov t4, [t0+msac.dif]
+ movzx t1d, word [buf+rax-0]
+ movzx t2d, word [buf+rax-2]
+ shr eax, 1
+%if WIN64
+ mov rsp, r5
+%endif
+ vzeroupper
+ jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
+%endif
diff --git a/third_party/dav1d/src/x86/msac.h b/third_party/dav1d/src/x86/msac.h
new file mode 100644
index 0000000000..0bb632fb31
--- /dev/null
+++ b/third_party/dav1d/src/x86/msac.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_X86_MSAC_H
+#define DAV1D_SRC_X86_MSAC_H
+
+#include "src/cpu.h"
+
+unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
+unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
+unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
+
+#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
+#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
+#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
+#endif
+
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2
+
+#if ARCH_X86_64
+#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
+
+static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+ }
+
+ if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+ }
+}
+
+#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
+#endif
+
+#endif /* DAV1D_SRC_X86_MSAC_H */
diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm
new file mode 100644
index 0000000000..06f555db11
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.asm
@@ -0,0 +1,688 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 64
+
+%macro JMP_TABLE 2-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %1_table:
+ %xdefine %%base %1_table
+ %rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
+ %rep %1
+ db %2*3
+ db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
+ mangle(private_prefix %+ _save_tmvs_%3).write1
+ %endrep
+%endmacro
+
+%if ARCH_X86_64
+splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+ db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7
+ db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3
+%endif
+save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0
+ db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1
+save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2
+ db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3
+save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1
+cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
+save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
+save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
+pb_128: times 16 db 128
+
+save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
+ SAVE_TMVS_TABLE 4, 8, ssse3
+ SAVE_TMVS_TABLE 4, 4, ssse3
+ SAVE_TMVS_TABLE 5, 2, ssse3
+ SAVE_TMVS_TABLE 7, 1, ssse3
+
+%if ARCH_X86_64
+save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
+ SAVE_TMVS_TABLE 4, 8, avx2
+ SAVE_TMVS_TABLE 4, 4, avx2
+ SAVE_TMVS_TABLE 5, 2, avx2
+ SAVE_TMVS_TABLE 7, 1, avx2
+
+save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
+ SAVE_TMVS_TABLE 4, 8, avx512icl
+ SAVE_TMVS_TABLE 4, 4, avx512icl
+ SAVE_TMVS_TABLE 5, 2, avx512icl
+ SAVE_TMVS_TABLE 7, 1, avx512icl
+
+JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
+JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
+%endif
+
+JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32
+
+SECTION .text
+
+%macro movif32 2
+%if ARCH_X86_32
+ mov %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+%if ARCH_X86_64
+cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base_reg r12
+%else
+cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+ movq m5, [ref_signq]
+ lea strided, [strided*5]
+ mov stridem, strided
+ mov r3, xstartm
+ mov r1, ystartm
+ DEFINE_ARGS b, ystart, rr, cand, xend, x
+%define stridemp r1m
+%define m8 [base+pb_128]
+%define m9 [base+save_pack0+ 0]
+%define m10 [base+save_pack0+16]
+%define base_reg r6
+%endif
+%define base base_reg-.write1
+ LEA base_reg, .write1
+%if ARCH_X86_64
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ movq m5, [ref_signq]
+%endif
+ movu m4, [base+save_ref_shuf]
+ movddup m6, [base+save_cond0]
+ movddup m7, [base+save_cond1]
+%if ARCH_X86_64
+ mova m8, [base+pb_128]
+ mova m9, [base+save_pack0+ 0]
+ mova m10, [base+save_pack0+16]
+%endif
+ psllq m5, 8
+%if ARCH_X86_64
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+%else
+ lea r0, [xendd*5] ; xend5
+ lea r3, [r3*5] ; xstart5
+ sub r3, r0 ; -w5
+ mov r6m, r3
+%define xstartq r6m
+ add xendd, r0 ; xend6
+ add r0m, r0 ; rp+xend5
+ mov xendm, xendd
+ sub r5, r1 ; h
+ add r1, r1
+ mov r7m, r1
+ mov r5m, r5
+%define hd r5mp
+ jmp .loop_y_noload
+%endif
+.loop_y:
+ movif32 ystartd, r7m
+ movif32 xendd, xendm
+.loop_y_noload:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*gprsize]
+ add ystartd, 2
+ movif32 r7m, ystartd
+ lea bq, [bq+xendq*4]
+.loop_x:
+%if ARCH_X86_32
+%define rpq r3
+%define r10 r1
+%define r10d r1
+%define r11 r4
+%define r11d r4
+%endif
+ imul candq, xq, 0x9999 ; x / 5 * 3
+ sar candq, 16
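+ ; 0x9999/65536 ≈ 3/5 (0.59999..), accurate for the offsets used here, so the
+ ; imul+sar pair implements the x / 5 * 3 above without a division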
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu m0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
+ add r10, base_reg
+ add candq, r11
+ jge .calc
+ movu m1, [bq+candq*8+12]
+ movzx r11d, byte [bq+candq*8+22]
+ movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
+ add r11, base_reg
+.calc:
+ movif32 rpq, r0m
+ ; ref check
+ punpckhqdq m2, m0, m1
+ pshufb m2, m4 ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
+ pshufb m3, m5, m2 ; ref > 0 && res_sign[ref - 1]
+ ; mv check
+ punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
+ pabsw m2, m2
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ ; res
+ pcmpgtd m3, m2
+ pshufd m2, m3, q2301
+ pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ...
+ pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ...
+ por m3, m2 ; b0.shuf b1.shuf | ...
+ pxor m3, m8 ; if cond0|cond1 == 0 => zero out
+ pshufb m0, m3
+ pshufb m1, m3
+ call r10
+ jge .next_line
+ pshufd m0, m1, q3232
+ call r11
+ jl .loop_x
+.next_line:
+ add rpq, stridemp
+ movif32 r0m, rpq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ movd [rpq+xq+0], m0
+ psrlq m0, 8
+ movd [rpq+xq+1], m0
+ add xq, 5*1
+ ret
+.write2:
+ movq [rpq+xq+0], m0
+ psrlq m0, 8
+ movd [rpq+xq+6], m0
+ add xq, 5*2
+ ret
+.write4:
+ pshufb m0, m9
+ movu [rpq+xq+ 0], m0
+ psrlq m0, 8
+ movd [rpq+xq+16], m0
+ add xq, 5*4
+ ret
+.write8:
+ pshufb m2, m0, m9
+ movu [rpq+xq+ 0], m2
+ pshufb m0, m10
+ movu [rpq+xq+16], m0
+ psrldq m2, 2
+ movq [rpq+xq+32], m2
+ add xq, 5*8
+ ret
+.write16:
+ pshufb m2, m0, m9
+ movu [rpq+xq+ 0], m2
+ pshufb m0, m10
+ movu [rpq+xq+16], m0
+ shufps m2, m0, q1032
+ movu [rpq+xq+48], m2
+ shufps m2, m0, q2121
+ movu [rpq+xq+32], m2
+ shufps m0, m2, q1032
+ movu [rpq+xq+64], m0
+ add xq, 5*16
+ ret
+
+INIT_XMM sse2
+; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ mova m2, [aq]
+ LEA aq, splat_mv_sse2_table
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ movifnidn bh4d, bh4m
+ pshufd m0, m2, q0210
+ pshufd m1, m2, q1021
+ pshufd m2, m2, q2102
+ add bw4q, aq
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-16*16], m0
+ mova [aq-16*15], m1
+ mova [aq-16*14], m2
+ mova [aq-16*13], m0
+ mova [aq-16*12], m1
+ mova [aq-16*11], m2
+ mova [aq-16*10], m0
+ mova [aq-16* 9], m1
+ mova [aq-16* 8], m2
+ mova [aq-16* 7], m0
+ mova [aq-16* 6], m1
+ mova [aq-16* 5], m2
+.w16:
+ mova [aq-16* 4], m0
+ mova [aq-16* 3], m1
+ mova [aq-16* 2], m2
+ mova [aq-16* 1], m0
+ mova [aq+16* 0], m1
+ mova [aq+16* 1], m2
+.w8:
+ mova [aq+16* 2], m0
+ mova [aq+16* 3], m1
+ mova [aq+16* 4], m2
+.w4:
+ mova [aq+16* 5], m0
+ mova [aq+16* 6], m1
+ mova [aq+16* 7], m2
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], m0
+ movq [aq+120], m1
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], m0
+ movd [aq+124], m2
+ dec bh4d
+ jg .loop
+ RET
+
+%if ARCH_X86_64
+INIT_YMM avx2
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base r12-.write1
+ lea r12, [.write1]
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ vpbroadcastq m4, [ref_signq]
+ vpbroadcastq m3, [base+save_ref_shuf+8]
+ vpbroadcastq m5, [base+save_cond0]
+ vpbroadcastq m6, [base+save_cond1]
+ vpbroadcastd m7, [base+pb_128]
+ mova m8, [base+save_pack0]
+ mova m9, [base+save_pack1]
+ psllq m4, 8
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+.loop_y:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*8]
+ add ystartd, 2
+ lea bq, [bq+xendq*4]
+.loop_x:
+ imul candq, xq, 0x9999
+ sar candq, 16 ; x / 5 * 3
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu xm0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
+ add r10, r12
+ add candq, r11
+ jge .calc
+ vinserti128 m0, [bq+candq*8+12], 1
+ movzx r11d, byte [bq+candq*8+22]
+ movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
+ add r11, r12
+.calc:
+ pshufb m1, m0, m3
+ pabsw m2, m0
+ pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1]
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ pcmpgtd m1, m2
+ pshufd m2, m1, q2301
+ pand m1, m5 ; b0.cond0 b1.cond0
+ pand m2, m6 ; b0.cond1 b1.cond1
+ por m1, m2 ; b0.shuf b1.shuf
+ pxor m1, m7 ; if cond0|cond1 == 0 => zero out
+ pshufb m0, m1
+ call r10
+ jge .next_line
+ vextracti128 xm0, m0, 1
+ call r11
+ jl .loop_x
+.next_line:
+ add rpq, strideq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ movd [rpq+xq+ 0], xm0
+ pextrb [rpq+xq+ 4], xm0, 4
+ add xq, 5*1
+ ret
+.write2:
+ movq [rpq+xq+0], xm0
+ psrlq xm1, xm0, 8
+ movd [rpq+xq+6], xm1
+ add xq, 5*2
+ ret
+.write4:
+ pshufb xm1, xm0, xm8
+ movu [rpq+xq+ 0], xm1
+ psrlq xm1, 8
+ movd [rpq+xq+16], xm1
+ add xq, 5*4
+ ret
+.write8:
+ vinserti128 m1, m0, xm0, 1
+ pshufb m1, m8
+ movu [rpq+xq+ 0], m1
+ psrldq xm1, 2
+ movq [rpq+xq+32], xm1
+ add xq, 5*8
+ ret
+.write16:
+ vinserti128 m1, m0, xm0, 1
+ pshufb m2, m1, m8
+ movu [rpq+xq+ 0], m2
+ pshufb m1, m9
+ movu [rpq+xq+32], m1
+ shufps xm2, xm1, q1021
+ movu [rpq+xq+64], xm2
+ add xq, 5*16
+ ret
+
+cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
+ add bx4d, bw4d
+ tzcnt bw4d, bw4d
+ vbroadcasti128 m0, [aq]
+ lea aq, [splat_mv_avx2_table]
+ lea bx4q, [bx4q*3-32]
+ movsxd bw4q, [aq+bw4q*4]
+ pshufb m0, [splat_mv_shuf]
+ movifnidn bh4d, bh4m
+ pshufd m1, m0, q2102
+ pshufd m2, m0, q1021
+ add bw4q, aq
+.loop:
+ mov aq, [rrq]
+ add rrq, gprsize
+ lea aq, [aq+bx4q*4]
+ jmp bw4q
+.w32:
+ mova [aq-32*8], m0
+ mova [aq-32*7], m1
+ mova [aq-32*6], m2
+ mova [aq-32*5], m0
+ mova [aq-32*4], m1
+ mova [aq-32*3], m2
+.w16:
+ mova [aq-32*2], m0
+ mova [aq-32*1], m1
+ mova [aq+32*0], m2
+.w8:
+ mova [aq+32*1], m0
+ mova [aq+32*2], m1
+ mova [aq+32*3], m2
+ dec bh4d
+ jg .loop
+ RET
+.w4:
+ movu [aq+ 80], m0
+ mova [aq+112], xm1
+ dec bh4d
+ jg .loop
+ RET
+.w2:
+ movu [aq+104], xm0
+ movq [aq+120], xm2
+ dec bh4d
+ jg .loop
+ RET
+.w1:
+ movq [aq+116], xm0
+ movd [aq+124], xm1
+ dec bh4d
+ jg .loop
+ RET
+
+INIT_ZMM avx512icl
+; refmvs_temporal_block *rp, ptrdiff_t stride,
+; refmvs_block **rr, uint8_t *ref_sign,
+; int col_end8, int row_end8, int col_start8, int row_start8
+cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
+ xend, yend, xstart, ystart
+%define base r14-.write1
+ lea r14, [.write1]
+ movifnidn xendd, xendm
+ movifnidn yendd, yendm
+ mov xstartd, xstartm
+ mov ystartd, ystartm
+ psllq m4, [ref_signq]{bcstq}, 8
+ vpbroadcastq m3, [base+save_ref_shuf+8]
+ vbroadcasti32x4 m5, [base+cond_shuf512]
+ vbroadcasti32x4 m6, [base+save_cond0]
+ vpbroadcastd m7, [base+pb_128]
+ mova m8, [base+save_pack0]
+ movu xm9, [base+save_pack0+4]
+ lea r9d, [xendq*5]
+ lea xstartd, [xstartq*5]
+ sub yendd, ystartd
+ add ystartd, ystartd
+ lea strideq, [strideq*5]
+ sub xstartq, r9
+ add xendd, r9d
+ add rpq, r9
+ mov r10d, 0x1f
+ kmovb k2, r10d
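+ ; k2 has the low 5 bits set: the masked stores in .write1/2/4/8 write
+ ; exactly 5*N bytes (N temporal blocks) each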
+ DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
+.loop_y:
+ and ystartd, 30
+ mov xq, xstartq
+ mov bq, [rrq+ystartq*8]
+ add ystartd, 2
+ lea bq, [bq+xendq*4]
+.loop_x:
+ imul candq, xq, 0x9999
+ sar candq, 16 ; x / 5 * 3
+ movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
+ movu xm0, [bq+candq*8+12] ; cand_b
+ movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
+ movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
+ add r10, r14
+ add candq, r11
+ jge .calc
+ movzx r11d, byte [bq+candq*8+22]
+ vinserti32x4 ym0, [bq+candq*8+12], 1
+ movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
+ movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
+ add r11, r14
+ add candq, r12
+ jge .calc
+ movzx r12d, byte [bq+candq*8+22]
+ vinserti32x4 m0, [bq+candq*8+12], 2
+ movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
+ movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
+ add r12, r14
+ add candq, r13
+ jge .calc
+ vinserti32x4 m0, [bq+candq*8+12], 3
+ movzx r13d, byte [bq+candq*8+22]
+ movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
+ add r13, r14
+.calc:
+ pshufb m1, m0, m3
+ pabsw m2, m0
+ pshufb m1, m4, m1 ; ref > 0 && ref_sign[ref - 1]
+ psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
+ psubd m2, m1
+ pshufb m2, m5 ; c0 c1 c1 c0
+ pand m2, m6
+ punpckhqdq m1, m2, m2
+ vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
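+ ; ternlog immediate 0x56 is the truth table for (a | b) ^ c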
+ pshufb m2, m0, m1
+ mova xm0, xm2
+ call r10
+ jge .next_line
+ vextracti32x4 xm0, m2, 1
+ call r11
+ jge .next_line
+ vextracti32x4 xm0, m2, 2
+ call r12
+ jge .next_line
+ vextracti32x4 xm0, m2, 3
+ call r13
+ jl .loop_x
+.next_line:
+ add rpq, strideq
+ dec hd
+ jg .loop_y
+ RET
+.write1:
+ vmovdqu8 [rpq+xq]{k2}, xm0
+ add xq, 5*1
+ ret
+.write2:
+ pshufb xm0, xm8
+ vmovdqu16 [rpq+xq]{k2}, xm0
+ add xq, 5*2
+ ret
+.write4:
+ vpermb ym0, ym8, ym0
+ vmovdqu32 [rpq+xq]{k2}, ym0
+ add xq, 5*4
+ ret
+.write8:
+ vpermb m0, m8, m0
+ vmovdqu64 [rpq+xq]{k2}, m0
+ add xq, 5*8
+ ret
+.write16:
+ vpermb m1, m8, m0
+ movu [rpq+xq+ 0], m1
+ pshufb xm0, xm9
+ movu [rpq+xq+64], xm0
+ add xq, 5*16
+ ret
+
+INIT_ZMM avx512icl
+cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
+ vbroadcasti32x4 m0, [aq]
+ lea r1, [splat_mv_avx512icl_table]
+ tzcnt bw4d, bw4d
+ lea bx4d, [bx4q*3]
+ pshufb m0, [splat_mv_shuf]
+ movsxd bw4q, [r1+bw4q*4]
+ mov r6d, bh4m
+ add bw4q, r1
+ lea rrq, [rrq+r6*8]
+ mov r1d, 0x3f
+ neg r6
+ kmovb k1, r1d
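+ ; k1 has the low 6 bits set, so the masked stores below cover 12, 24 or
+ ; 48 bytes, i.e. one, two or four 12-byte refmvs_block entries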
+ jmp bw4q
+.w1:
+ mov r1, [rrq+r6*8]
+ vmovdqu16 [r1+bx4q*4]{k1}, xm0
+ inc r6
+ jl .w1
+ RET
+.w2:
+ mov r1, [rrq+r6*8]
+ vmovdqu32 [r1+bx4q*4]{k1}, ym0
+ inc r6
+ jl .w2
+ RET
+.w4:
+ mov r1, [rrq+r6*8]
+ vmovdqu64 [r1+bx4q*4]{k1}, m0
+ inc r6
+ jl .w4
+ RET
+.w8:
+ pshufd ym1, ym0, q1021
+.w8_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ movu [r1+bx4q*4+ 0], m0
+ mova [r1+bx4q*4+64], ym1
+ movu [r3+bx4q*4+ 0], m0
+ mova [r3+bx4q*4+64], ym1
+ add r6, 2
+ jl .w8_loop
+ RET
+.w16:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w16_loop:
+ mov r1, [rrq+r6*8+0]
+ mov r3, [rrq+r6*8+8]
+ mova [r1+bx4q*4+64*0], m0
+ mova [r1+bx4q*4+64*1], m1
+ mova [r1+bx4q*4+64*2], m2
+ mova [r3+bx4q*4+64*0], m0
+ mova [r3+bx4q*4+64*1], m1
+ mova [r3+bx4q*4+64*2], m2
+ add r6, 2
+ jl .w16_loop
+ RET
+.w32:
+ pshufd m1, m0, q1021
+ pshufd m2, m0, q2102
+.w32_loop:
+ mov r1, [rrq+r6*8]
+ lea r1, [r1+bx4q*4]
+ mova [r1+64*0], m0
+ mova [r1+64*1], m1
+ mova [r1+64*2], m2
+ mova [r1+64*3], m0
+ mova [r1+64*4], m1
+ mova [r1+64*5], m2
+ inc r6
+ jl .w32_loop
+ RET
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/refmvs.h b/third_party/dav1d/src/x86/refmvs.h
new file mode 100644
index 0000000000..9dafa78b13
--- /dev/null
+++ b/third_party/dav1d/src/x86/refmvs.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2021, VideoLAN and dav1d authors
+ * Copyright © 2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/refmvs.h"
+
+decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
+decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
+decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
+
+decl_splat_mv_fn(dav1d_splat_mv_sse2);
+decl_splat_mv_fn(dav1d_splat_mv_avx2);
+decl_splat_mv_fn(dav1d_splat_mv_avx512icl);
+
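+/* Each successive flag check only adds requirements, so later assignments
+ * override earlier ones and the fastest supported variant ends up in c. */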
+static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->splat_mv = dav1d_splat_mv_sse2;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_ssse3;
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_avx2;
+ c->splat_mv = dav1d_splat_mv_avx2;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->save_tmvs = dav1d_save_tmvs_avx512icl;
+ c->splat_mv = dav1d_splat_mv_avx512icl;
+#endif
+}